From 6c2ead30f0526226d87c9287b956584f867dd2a5 Mon Sep 17 00:00:00 2001 From: Timothy Gu Date: Fri, 27 Jun 2014 12:05:18 -0700 Subject: Remove all trailing whitespace except lapack-netlib Signed-off-by: Timothy Gu --- .travis.yml | 2 +- CONTRIBUTORS.md | 8 +- Changelog.txt | 70 +- GotoBLAS_01Readme.txt | 2 +- GotoBLAS_02QuickInstall.txt | 6 +- GotoBLAS_03FAQ.txt | 2 +- GotoBLAS_05LargePage.txt | 2 +- GotoBLAS_06WeirdPerformance.txt | 4 +- LICENSE | 22 +- Makefile | 10 +- Makefile.alpha | 2 +- Makefile.ia64 | 2 +- Makefile.install | 8 +- Makefile.power | 8 +- Makefile.rule | 18 +- Makefile.sparc | 2 +- Makefile.system | 36 +- Makefile.tail | 14 +- Makefile.x86 | 4 +- Makefile.x86_64 | 2 +- README.md | 14 +- benchmark/cholesky.c | 32 +- benchmark/linpack.c | 32 +- c_check | 18 +- cblas.h | 6 +- cblas_noconst.h | 6 +- common.h | 6 +- common_arm.h | 26 +- common_arm64.h | 26 +- common_ia64.h | 4 +- common_interface.h | 124 +- common_level1.h | 18 +- common_level2.h | 16 +- common_level3.h | 8 +- common_linux.h | 6 +- common_mips64.h | 46 +- common_param.h | 76 +- common_power.h | 2 +- common_reference.h | 24 +- common_sparc.h | 26 +- common_thread.h | 6 +- common_x86.h | 12 +- common_x86_64.h | 12 +- cpuid.S | 4 +- cpuid_alpha.c | 8 +- cpuid_arm.c | 8 +- cpuid_ia64.c | 6 +- cpuid_mips.c | 22 +- cpuid_power.c | 2 +- cpuid_x86.c | 68 +- ctest/Makefile | 16 +- ctest/c_c2chke.c | 292 ++-- ctest/c_c3chke.c | 12 +- ctest/c_cblas1.c | 6 +- ctest/c_cblas2.c | 40 +- ctest/c_cblas3.c | 60 +- ctest/c_cblat2.f | 14 +- ctest/c_cblat3.f | 78 +- ctest/c_d2chke.c | 290 ++-- ctest/c_d3chke.c | 14 +- ctest/c_dblas1.c | 2 +- ctest/c_dblas2.c | 58 +- ctest/c_dblas3.c | 36 +- ctest/c_dblat1.f | 4 +- ctest/c_dblat2.f | 14 +- ctest/c_dblat3.f | 64 +- ctest/c_s2chke.c | 290 ++-- ctest/c_s3chke.c | 14 +- ctest/c_sblas1.c | 4 +- ctest/c_sblas2.c | 58 +- ctest/c_sblas3.c | 36 +- ctest/c_sblat2.f | 14 +- ctest/c_sblat3.f | 66 +- ctest/c_xerbla.c | 20 +- ctest/c_z2chke.c | 292 ++-- ctest/c_z3chke.c | 12 +- ctest/c_zblas1.c | 6 +- ctest/c_zblas2.c | 40 +- ctest/c_zblas3.c | 60 +- ctest/c_zblat2.f | 26 +- ctest/c_zblat3.f | 80 +- driver/level2/Makefile | 2130 ++++++++++++------------ driver/level2/gbmv_k.c | 4 +- driver/level2/gbmv_thread.c | 32 +- driver/level2/gemv_thread.c | 22 +- driver/level2/ger_thread.c | 28 +- driver/level2/sbmv_k.c | 6 +- driver/level2/sbmv_thread.c | 80 +- driver/level2/spmv_k.c | 2 +- driver/level2/spmv_thread.c | 78 +- driver/level2/spr2_k.c | 2 +- driver/level2/spr2_thread.c | 64 +- driver/level2/spr_k.c | 2 +- driver/level2/spr_thread.c | 56 +- driver/level2/symv_thread.c | 76 +- driver/level2/syr2_k.c | 2 +- driver/level2/syr2_thread.c | 62 +- driver/level2/syr_k.c | 2 +- driver/level2/syr_thread.c | 56 +- driver/level2/tbmv_L.c | 8 +- driver/level2/tbmv_U.c | 6 +- driver/level2/tbmv_thread.c | 96 +- driver/level2/tbsv_L.c | 6 +- driver/level2/tbsv_U.c | 8 +- driver/level2/tpmv_L.c | 6 +- driver/level2/tpmv_U.c | 6 +- driver/level2/tpmv_thread.c | 88 +- driver/level2/tpsv_L.c | 8 +- driver/level2/tpsv_U.c | 6 +- driver/level2/trmv_L.c | 10 +- driver/level2/trmv_U.c | 8 +- driver/level2/trmv_thread.c | 98 +- driver/level2/trsv_L.c | 14 +- driver/level2/trsv_U.c | 10 +- driver/level2/zgbmv_k.c | 2 +- driver/level2/zhbmv_k.c | 18 +- driver/level2/zher2_k.c | 6 +- driver/level2/zhpmv_k.c | 32 +- driver/level2/zhpr2_k.c | 6 +- driver/level2/zsbmv_k.c | 12 +- driver/level2/zspmv_k.c | 16 +- driver/level2/zspr2_k.c | 4 +- driver/level2/zspr_k.c | 2 +- driver/level2/zsyr2_k.c | 4 +- driver/level2/zsyr_k.c | 2 +- driver/level2/ztbmv_L.c | 6 +- driver/level2/ztbmv_U.c | 2 +- driver/level2/ztbsv_L.c | 10 +- driver/level2/ztbsv_U.c | 12 +- driver/level2/ztpmv_L.c | 8 +- driver/level2/ztpmv_U.c | 4 +- driver/level2/ztpsv_L.c | 12 +- driver/level2/ztpsv_U.c | 14 +- driver/level2/ztrmv_L.c | 2 +- driver/level2/ztrmv_U.c | 6 +- driver/level2/ztrsv_L.c | 8 +- driver/level2/ztrsv_U.c | 6 +- driver/level3/Makefile | 216 +-- driver/level3/gemm3m_level3.c | 130 +- driver/level3/gemm_thread_m.c | 8 +- driver/level3/gemm_thread_mn.c | 12 +- driver/level3/gemm_thread_n.c | 10 +- driver/level3/gemm_thread_variable.c | 8 +- driver/level3/level3.c | 48 +- driver/level3/level3_gemm3m_thread.c | 280 ++-- driver/level3/level3_syr2k.c | 114 +- driver/level3/level3_syrk.c | 202 +-- driver/level3/level3_syrk_threaded.c | 142 +- driver/level3/level3_thread.c | 132 +- driver/level3/syr2k_k.c | 2 +- driver/level3/syr2k_kernel.c | 42 +- driver/level3/syrk_k.c | 2 +- driver/level3/syrk_kernel.c | 24 +- driver/level3/syrk_thread.c | 52 +- driver/level3/trmm_L.c | 94 +- driver/level3/trmm_R.c | 102 +- driver/level3/trsm_L.c | 40 +- driver/level3/trsm_R.c | 104 +- driver/level3/zher2k_k.c | 2 +- driver/level3/zher2k_kernel.c | 38 +- driver/level3/zherk_k.c | 2 +- driver/level3/zherk_kernel.c | 24 +- driver/level3/zsyrk_beta.c | 2 +- driver/mapper/mapper.c | 40 +- driver/others/Makefile | 12 +- driver/others/blas_l1_thread.c | 16 +- driver/others/blas_server.c | 216 +-- driver/others/blas_server_omp.c | 36 +- driver/others/blas_server_win32.c | 122 +- driver/others/divtable.c | 34 +- driver/others/dynamic.c | 32 +- driver/others/init.c | 96 +- driver/others/lamc3.c | 2 +- driver/others/lamch.c | 2 +- driver/others/memory.c | 214 +-- driver/others/memory_qalloc.c | 6 +- driver/others/openblas_get_config.c | 22 +- driver/others/openblas_get_parallel.c | 34 +- driver/others/openblas_set_num_threads.c | 22 +- driver/others/parameter.c | 42 +- driver/others/profile.c | 12 +- driver/others/xerbla.c | 4 +- exports/Makefile | 4 +- exports/dllinit.c | 2 +- exports/gensymbol | 72 +- f_check | 44 +- ftest.f | 2 +- ftest3.f | 2 +- getarch.c | 30 +- getarch_2nd.c | 6 +- interface/Makefile | 24 +- interface/asum.c | 4 +- interface/axpby.c | 4 +- interface/axpy.c | 14 +- interface/copy.c | 6 +- interface/dot.c | 4 +- interface/dsdot.c | 8 +- interface/gbmv.c | 10 +- interface/gemm.c | 74 +- interface/gemv.c | 20 +- interface/ger.c | 6 +- interface/imatcopy.c | 2 +- interface/imax.c | 4 +- interface/lapack/gesv.c | 8 +- interface/lapack/getf2.c | 4 +- interface/lapack/getrf.c | 4 +- interface/lapack/getrs.c | 4 +- interface/lapack/larf.c.obsolete | 4 +- interface/lapack/laswp.c | 10 +- interface/lapack/lauu2.c | 4 +- interface/lapack/lauum.c | 6 +- interface/lapack/potf2.c | 4 +- interface/lapack/potrf.c | 4 +- interface/lapack/potri.c | 8 +- interface/lapack/trti2.c | 8 +- interface/lapack/trtri.c | 10 +- interface/lapack/zgetf2.c | 2 +- interface/lapack/zgetrf.c | 2 +- interface/lapack/zgetrs.c | 6 +- interface/lapack/zlaswp.c | 4 +- interface/lapack/zlauu2.c | 4 +- interface/lapack/zlauum.c | 6 +- interface/lapack/zpotf2.c | 4 +- interface/lapack/zpotrf.c | 6 +- interface/lapack/zpotri.c | 8 +- interface/lapack/ztrti2.c | 8 +- interface/lapack/ztrtri.c | 12 +- interface/max.c | 4 +- interface/nrm2.c | 4 +- interface/omatcopy.c | 2 +- interface/rot.c | 4 +- interface/rotm.c | 6 +- interface/rotmg.c | 12 +- interface/sbmv.c | 8 +- interface/scal.c | 14 +- interface/sdsdot.c | 8 +- interface/spmv.c | 6 +- interface/spr.c | 8 +- interface/spr2.c | 8 +- interface/swap.c | 14 +- interface/symm.c | 74 +- interface/symv.c | 10 +- interface/syr.c | 8 +- interface/syr2.c | 8 +- interface/syr2k.c | 44 +- interface/syrk.c | 50 +- interface/tbmv.c | 12 +- interface/tbsv.c | 12 +- interface/tpmv.c | 12 +- interface/tpsv.c | 10 +- interface/trmv.c | 16 +- interface/trsm.c | 50 +- interface/trsv.c | 10 +- interface/zaxpby.c | 6 +- interface/zaxpy.c | 14 +- interface/zgbmv.c | 6 +- interface/zgemv.c | 22 +- interface/zger.c | 6 +- interface/zhbmv.c | 8 +- interface/zhemv.c | 6 +- interface/zher.c | 10 +- interface/zher2.c | 8 +- interface/zhpmv.c | 4 +- interface/zhpr.c | 10 +- interface/zhpr2.c | 10 +- interface/zimatcopy.c | 2 +- interface/zomatcopy.c | 2 +- interface/zrot.c | 4 +- interface/zsbmv.c | 8 +- interface/zscal.c | 8 +- interface/zspmv.c | 4 +- interface/zspr.c | 10 +- interface/zspr2.c | 12 +- interface/zswap.c | 8 +- interface/zsymv.c | 12 +- interface/zsyr.c | 8 +- interface/zsyr2.c | 6 +- interface/ztbmv.c | 12 +- interface/ztbsv.c | 12 +- interface/ztpmv.c | 10 +- interface/ztpsv.c | 10 +- interface/ztrmv.c | 12 +- interface/ztrsv.c | 12 +- kernel/Makefile | 6 +- kernel/Makefile.L1 | 200 +-- kernel/Makefile.L2 | 142 +- kernel/Makefile.L3 | 8 +- kernel/alpha/KERNEL | 4 +- kernel/alpha/cnrm2.S | 4 +- kernel/alpha/dnrm2.S | 4 +- kernel/alpha/gemm_kernel_4x4.S | 6 +- kernel/alpha/gemv_n.S | 8 +- kernel/alpha/iamax.S | 4 +- kernel/alpha/imax.S | 2 +- kernel/alpha/izamax.S | 2 +- kernel/alpha/snrm2.S | 4 +- kernel/alpha/trsm_kernel_4x4_LN.S | 120 +- kernel/alpha/trsm_kernel_4x4_LT.S | 120 +- kernel/alpha/trsm_kernel_4x4_RT.S | 120 +- kernel/alpha/zamax.S | 2 +- kernel/alpha/zaxpy.S | 30 +- kernel/alpha/zgemm_kernel_2x2.S | 2 +- kernel/alpha/znrm2.S | 4 +- kernel/alpha/ztrsm_kernel_2x2_LN.S | 24 +- kernel/alpha/ztrsm_kernel_2x2_LT.S | 24 +- kernel/alpha/ztrsm_kernel_2x2_RT.S | 24 +- kernel/arm/KERNEL.ARMV5 | 4 +- kernel/arm/KERNEL.ARMV6 | 6 +- kernel/arm/KERNEL.ARMV7 | 24 +- kernel/arm/amax.c | 4 +- kernel/arm/amin.c | 4 +- kernel/arm/asum.c | 2 +- kernel/arm/axpby.c | 2 +- kernel/arm/axpy.c | 2 +- kernel/arm/ccopy_vfp.S | 2 +- kernel/arm/cdot_vfp.S | 10 +- kernel/arm/cgemm_kernel_2x2_vfp.S | 28 +- kernel/arm/cgemm_kernel_2x2_vfpv3.S | 28 +- kernel/arm/cgemm_ncopy_2_vfp.S | 14 +- kernel/arm/cgemv_n_vfp.S | 4 +- kernel/arm/copy.c | 2 +- kernel/arm/ctrmm_kernel_2x2_vfp.S | 28 +- kernel/arm/ctrmm_kernel_2x2_vfpv3.S | 28 +- kernel/arm/dcopy_vfp.S | 2 +- kernel/arm/ddot_vfp.S | 2 +- kernel/arm/dgemm_kernel_4x2_vfp.S | 50 +- kernel/arm/dgemm_kernel_4x4_vfpv3.S | 74 +- kernel/arm/dgemm_ncopy_2_vfp.S | 14 +- kernel/arm/dgemm_ncopy_4_vfp.S | 20 +- kernel/arm/dgemm_tcopy_4_vfp.S | 16 +- kernel/arm/dot.c | 2 +- kernel/arm/dtrmm_kernel_4x2_vfp.S | 42 +- kernel/arm/dtrmm_kernel_4x4_vfpv3.S | 74 +- kernel/arm/gemv_n.c | 2 +- kernel/arm/gemv_n_vfp.S | 4 +- kernel/arm/gemv_n_vfpv3.S | 4 +- kernel/arm/gemv_t.c | 2 +- kernel/arm/iamax.c | 4 +- kernel/arm/iamax_vfp.S | 2 +- kernel/arm/iamin.c | 4 +- kernel/arm/imax.c | 4 +- kernel/arm/imin.c | 8 +- kernel/arm/izamax.c | 4 +- kernel/arm/izamin.c | 4 +- kernel/arm/max.c | 4 +- kernel/arm/min.c | 4 +- kernel/arm/nrm2.c | 6 +- kernel/arm/nrm2_vfp.S | 62 +- kernel/arm/nrm2_vfpv3.S | 56 +- kernel/arm/omatcopy_cn.c | 16 +- kernel/arm/omatcopy_ct.c | 16 +- kernel/arm/omatcopy_rn.c | 16 +- kernel/arm/omatcopy_rt.c | 8 +- kernel/arm/rot.c | 2 +- kernel/arm/rot_vfp.S | 20 +- kernel/arm/scal.c | 2 +- kernel/arm/scal_vfp.S | 48 +- kernel/arm/scopy_vfp.S | 2 +- kernel/arm/sdot_vfp.S | 2 +- kernel/arm/sgemm_kernel_4x2_vfp.S | 50 +- kernel/arm/sgemm_kernel_4x4_vfpv3.S | 74 +- kernel/arm/sgemm_ncopy_2_vfp.S | 14 +- kernel/arm/sgemm_ncopy_4_vfp.S | 20 +- kernel/arm/sgemm_tcopy_4_vfp.S | 18 +- kernel/arm/strmm_kernel_4x2_vfp.S | 42 +- kernel/arm/strmm_kernel_4x4_vfpv3.S | 68 +- kernel/arm/swap.c | 2 +- kernel/arm/zamax.c | 4 +- kernel/arm/zamin.c | 4 +- kernel/arm/zasum.c | 2 +- kernel/arm/zaxpby.c | 2 +- kernel/arm/zaxpy.c | 2 +- kernel/arm/zcopy.c | 2 +- kernel/arm/zcopy_vfp.S | 2 +- kernel/arm/zdot.c | 4 +- kernel/arm/zdot_vfp.S | 10 +- kernel/arm/zgemm_kernel_2x2_vfp.S | 28 +- kernel/arm/zgemm_kernel_2x2_vfpv3.S | 28 +- kernel/arm/zgemm_ncopy_2_vfp.S | 14 +- kernel/arm/zgemv_n.c | 4 +- kernel/arm/zgemv_n_vfp.S | 4 +- kernel/arm/zgemv_t.c | 2 +- kernel/arm/znrm2.c | 10 +- kernel/arm/zomatcopy_cn.c | 8 +- kernel/arm/zomatcopy_cnc.c | 10 +- kernel/arm/zomatcopy_ct.c | 10 +- kernel/arm/zomatcopy_ctc.c | 10 +- kernel/arm/zomatcopy_rn.c | 10 +- kernel/arm/zomatcopy_rnc.c | 10 +- kernel/arm/zomatcopy_rt.c | 10 +- kernel/arm/zomatcopy_rtc.c | 10 +- kernel/arm/zrot.c | 2 +- kernel/arm/zscal.c | 2 +- kernel/arm/zswap.c | 2 +- kernel/arm/ztrmm_kernel_2x2_vfp.S | 28 +- kernel/arm/ztrmm_kernel_2x2_vfpv3.S | 28 +- kernel/arm64/KERNEL.ARMV8 | 4 +- kernel/generic/gemm_beta.c | 4 +- kernel/generic/gemm_ncopy_1.c | 4 +- kernel/generic/gemm_ncopy_16.c | 84 +- kernel/generic/gemm_ncopy_2.c | 8 +- kernel/generic/gemm_ncopy_4.c | 56 +- kernel/generic/gemm_ncopy_6.c | 56 +- kernel/generic/gemm_ncopy_8.c | 68 +- kernel/generic/gemm_tcopy_1.c | 2 +- kernel/generic/gemm_tcopy_16.c | 50 +- kernel/generic/gemm_tcopy_2.c | 4 +- kernel/generic/gemm_tcopy_4.c | 68 +- kernel/generic/gemm_tcopy_6.c | 68 +- kernel/generic/gemm_tcopy_8.c | 138 +- kernel/generic/gemmkernel_2x2.c | 24 +- kernel/generic/ger.c | 4 +- kernel/generic/laswp_ncopy_1.c | 40 +- kernel/generic/laswp_ncopy_2.c | 72 +- kernel/generic/laswp_ncopy_4.c | 86 +- kernel/generic/laswp_ncopy_8.c | 44 +- kernel/generic/neg_tcopy_1.c | 2 +- kernel/generic/neg_tcopy_16.c | 50 +- kernel/generic/neg_tcopy_2.c | 4 +- kernel/generic/neg_tcopy_4.c | 68 +- kernel/generic/neg_tcopy_8.c | 138 +- kernel/generic/symm_lcopy_1.c | 4 +- kernel/generic/symm_lcopy_16.c | 20 +- kernel/generic/symm_lcopy_2.c | 8 +- kernel/generic/symm_lcopy_4.c | 12 +- kernel/generic/symm_lcopy_6.c | 12 +- kernel/generic/symm_lcopy_8.c | 16 +- kernel/generic/symm_ucopy_1.c | 4 +- kernel/generic/symm_ucopy_16.c | 22 +- kernel/generic/symm_ucopy_2.c | 8 +- kernel/generic/symm_ucopy_4.c | 14 +- kernel/generic/symm_ucopy_6.c | 14 +- kernel/generic/symm_ucopy_8.c | 18 +- kernel/generic/symv_k.c | 18 +- kernel/generic/trmm_lncopy_1.c | 2 +- kernel/generic/trmm_lncopy_16.c | 64 +- kernel/generic/trmm_lncopy_2.c | 16 +- kernel/generic/trmm_lncopy_4.c | 68 +- kernel/generic/trmm_lncopy_6.c | 68 +- kernel/generic/trmm_lncopy_8.c | 178 +- kernel/generic/trmm_ltcopy_1.c | 4 +- kernel/generic/trmm_ltcopy_16.c | 94 +- kernel/generic/trmm_ltcopy_2.c | 16 +- kernel/generic/trmm_ltcopy_4.c | 72 +- kernel/generic/trmm_ltcopy_6.c | 72 +- kernel/generic/trmm_ltcopy_8.c | 140 +- kernel/generic/trmm_uncopy_1.c | 8 +- kernel/generic/trmm_uncopy_16.c | 76 +- kernel/generic/trmm_uncopy_2.c | 22 +- kernel/generic/trmm_uncopy_4.c | 74 +- kernel/generic/trmm_uncopy_6.c | 12 +- kernel/generic/trmm_uncopy_8.c | 182 +- kernel/generic/trmm_utcopy_1.c | 6 +- kernel/generic/trmm_utcopy_16.c | 94 +- kernel/generic/trmm_utcopy_2.c | 26 +- kernel/generic/trmm_utcopy_4.c | 64 +- kernel/generic/trmm_utcopy_6.c | 64 +- kernel/generic/trmm_utcopy_8.c | 160 +- kernel/generic/trmmkernel_16x2.c | 94 +- kernel/generic/trmmkernel_2x2.c | 52 +- kernel/generic/trmmkernel_8x2.c | 76 +- kernel/generic/trsm_kernel_LN.c | 74 +- kernel/generic/trsm_kernel_LT.c | 58 +- kernel/generic/trsm_kernel_RN.c | 62 +- kernel/generic/trsm_kernel_RT.c | 60 +- kernel/generic/trsm_lncopy_1.c | 2 +- kernel/generic/trsm_lncopy_16.c | 22 +- kernel/generic/trsm_lncopy_2.c | 4 +- kernel/generic/trsm_lncopy_4.c | 8 +- kernel/generic/trsm_lncopy_6.c | 8 +- kernel/generic/trsm_lncopy_8.c | 52 +- kernel/generic/trsm_ltcopy_16.c | 22 +- kernel/generic/trsm_ltcopy_2.c | 6 +- kernel/generic/trsm_ltcopy_4.c | 18 +- kernel/generic/trsm_ltcopy_6.c | 18 +- kernel/generic/trsm_ltcopy_8.c | 14 +- kernel/generic/trsm_uncopy_1.c | 2 +- kernel/generic/trsm_uncopy_16.c | 22 +- kernel/generic/trsm_uncopy_2.c | 4 +- kernel/generic/trsm_uncopy_4.c | 10 +- kernel/generic/trsm_uncopy_6.c | 10 +- kernel/generic/trsm_uncopy_8.c | 34 +- kernel/generic/trsm_utcopy_1.c | 2 +- kernel/generic/trsm_utcopy_16.c | 20 +- kernel/generic/trsm_utcopy_2.c | 4 +- kernel/generic/trsm_utcopy_4.c | 8 +- kernel/generic/trsm_utcopy_6.c | 8 +- kernel/generic/trsm_utcopy_8.c | 14 +- kernel/generic/zgemm3m_ncopy_1.c | 12 +- kernel/generic/zgemm3m_ncopy_2.c | 20 +- kernel/generic/zgemm3m_ncopy_4.c | 24 +- kernel/generic/zgemm3m_ncopy_8.c | 32 +- kernel/generic/zgemm3m_tcopy_1.c | 14 +- kernel/generic/zgemm3m_tcopy_2.c | 18 +- kernel/generic/zgemm3m_tcopy_4.c | 54 +- kernel/generic/zgemm3m_tcopy_8.c | 212 +-- kernel/generic/zgemm_beta.c | 16 +- kernel/generic/zgemm_ncopy_1.c | 16 +- kernel/generic/zgemm_ncopy_2.c | 32 +- kernel/generic/zgemm_ncopy_4.c | 84 +- kernel/generic/zgemm_ncopy_4_sandy.c | 48 +- kernel/generic/zgemm_ncopy_8.c | 26 +- kernel/generic/zgemm_ncopy_8_sandy.c | 56 +- kernel/generic/zgemm_tcopy_1.c | 28 +- kernel/generic/zgemm_tcopy_2.c | 52 +- kernel/generic/zgemm_tcopy_4.c | 74 +- kernel/generic/zgemm_tcopy_4_sandy.c | 48 +- kernel/generic/zgemm_tcopy_8.c | 44 +- kernel/generic/zgemm_tcopy_8_sandy.c | 54 +- kernel/generic/zgemmkernel_2x2.c | 22 +- kernel/generic/zger.c | 8 +- kernel/generic/zhemm3m_lcopy_1.c | 18 +- kernel/generic/zhemm3m_lcopy_2.c | 4 +- kernel/generic/zhemm3m_lcopy_4.c | 6 +- kernel/generic/zhemm3m_lcopy_8.c | 8 +- kernel/generic/zhemm3m_ucopy_1.c | 18 +- kernel/generic/zhemm3m_ucopy_2.c | 10 +- kernel/generic/zhemm3m_ucopy_4.c | 14 +- kernel/generic/zhemm3m_ucopy_8.c | 8 +- kernel/generic/zhemm_ltcopy_1.c | 4 +- kernel/generic/zhemm_ltcopy_2.c | 8 +- kernel/generic/zhemm_ltcopy_4.c | 12 +- kernel/generic/zhemm_ltcopy_8.c | 16 +- kernel/generic/zhemm_utcopy_1.c | 4 +- kernel/generic/zhemm_utcopy_2.c | 8 +- kernel/generic/zhemm_utcopy_4.c | 14 +- kernel/generic/zhemm_utcopy_8.c | 22 +- kernel/generic/zhemv_k.c | 12 +- kernel/generic/zlaswp_ncopy_1.c | 40 +- kernel/generic/zlaswp_ncopy_2.c | 76 +- kernel/generic/zlaswp_ncopy_4.c | 106 +- kernel/generic/zneg_tcopy_1.c | 28 +- kernel/generic/zneg_tcopy_2.c | 52 +- kernel/generic/zneg_tcopy_4.c | 74 +- kernel/generic/zneg_tcopy_8.c | 44 +- kernel/generic/zsymm3m_lcopy_1.c | 20 +- kernel/generic/zsymm3m_lcopy_2.c | 8 +- kernel/generic/zsymm3m_lcopy_4.c | 10 +- kernel/generic/zsymm3m_lcopy_8.c | 14 +- kernel/generic/zsymm3m_ucopy_1.c | 20 +- kernel/generic/zsymm3m_ucopy_2.c | 10 +- kernel/generic/zsymm3m_ucopy_4.c | 14 +- kernel/generic/zsymm3m_ucopy_8.c | 18 +- kernel/generic/zsymm_lcopy_1.c | 4 +- kernel/generic/zsymm_lcopy_2.c | 8 +- kernel/generic/zsymm_lcopy_4.c | 12 +- kernel/generic/zsymm_lcopy_8.c | 14 +- kernel/generic/zsymm_ucopy_1.c | 2 +- kernel/generic/zsymm_ucopy_2.c | 8 +- kernel/generic/zsymm_ucopy_4.c | 12 +- kernel/generic/zsymm_ucopy_8.c | 16 +- kernel/generic/zsymv_k.c | 10 +- kernel/generic/ztrmm_lncopy_1.c | 2 +- kernel/generic/ztrmm_lncopy_2.c | 12 +- kernel/generic/ztrmm_lncopy_4.c | 104 +- kernel/generic/ztrmm_lncopy_8.c | 50 +- kernel/generic/ztrmm_ltcopy_1.c | 6 +- kernel/generic/ztrmm_ltcopy_2.c | 28 +- kernel/generic/ztrmm_ltcopy_4.c | 116 +- kernel/generic/ztrmm_ltcopy_8.c | 84 +- kernel/generic/ztrmm_uncopy_1.c | 4 +- kernel/generic/ztrmm_uncopy_2.c | 22 +- kernel/generic/ztrmm_uncopy_4.c | 98 +- kernel/generic/ztrmm_uncopy_8.c | 66 +- kernel/generic/ztrmm_utcopy_1.c | 4 +- kernel/generic/ztrmm_utcopy_2.c | 36 +- kernel/generic/ztrmm_utcopy_4.c | 102 +- kernel/generic/ztrmm_utcopy_8.c | 102 +- kernel/generic/ztrmmkernel_2x2.c | 36 +- kernel/generic/ztrsm_lncopy_1.c | 2 +- kernel/generic/ztrsm_lncopy_2.c | 4 +- kernel/generic/ztrsm_lncopy_4.c | 12 +- kernel/generic/ztrsm_lncopy_8.c | 18 +- kernel/generic/ztrsm_ltcopy_1.c | 2 +- kernel/generic/ztrsm_ltcopy_2.c | 4 +- kernel/generic/ztrsm_ltcopy_4.c | 12 +- kernel/generic/ztrsm_ltcopy_8.c | 18 +- kernel/generic/ztrsm_uncopy_1.c | 2 +- kernel/generic/ztrsm_uncopy_2.c | 4 +- kernel/generic/ztrsm_uncopy_4.c | 12 +- kernel/generic/ztrsm_uncopy_8.c | 18 +- kernel/generic/ztrsm_utcopy_1.c | 2 +- kernel/generic/ztrsm_utcopy_2.c | 4 +- kernel/generic/ztrsm_utcopy_4.c | 12 +- kernel/generic/ztrsm_utcopy_8.c | 16 +- kernel/ia64/amax.S | 8 +- kernel/ia64/asum.S | 8 +- kernel/ia64/caxpy.S | 4 +- kernel/ia64/copy.S | 4 +- kernel/ia64/daxpy.S | 2 +- kernel/ia64/ddot.S | 10 +- kernel/ia64/gemm_beta.S | 10 +- kernel/ia64/gemm_kernel.S | 28 +- kernel/ia64/gemm_ncopy.S | 4 +- kernel/ia64/gemv_n.S | 16 +- kernel/ia64/gemv_t.S | 12 +- kernel/ia64/iamax.S | 2 +- kernel/ia64/izamax.S | 8 +- kernel/ia64/lsame.S | 2 +- kernel/ia64/nrm2.S | 6 +- kernel/ia64/qaxpy.S | 4 +- kernel/ia64/qgemm_kernel.S | 28 +- kernel/ia64/qgemv_n.S | 22 +- kernel/ia64/qgemv_t.S | 114 +- kernel/ia64/qscal.S | 2 +- kernel/ia64/saxpy.S | 8 +- kernel/ia64/scal.S | 2 +- kernel/ia64/sdot.S | 10 +- kernel/ia64/sgemv_n.S | 16 +- kernel/ia64/symv_U.S | 16 +- kernel/ia64/trsm_kernel_LN.S | 50 +- kernel/ia64/trsm_kernel_LT.S | 24 +- kernel/ia64/trsm_kernel_RT.S | 60 +- kernel/ia64/xdot.S | 8 +- kernel/ia64/zcopy.S | 2 +- kernel/ia64/zdot.S | 8 +- kernel/ia64/zgemm3m_kernel.S | 26 +- kernel/ia64/zgemm_beta.S | 12 +- kernel/ia64/zgemm_kernel.S | 22 +- kernel/ia64/zgemm_ncopy.S | 4 +- kernel/ia64/zgemv_n.S | 16 +- kernel/ia64/zgemv_t.S | 14 +- kernel/ia64/zscal.S | 2 +- kernel/ia64/zswap.S | 6 +- kernel/ia64/ztrsm_kernel_LN.S | 28 +- kernel/ia64/ztrsm_kernel_LT.S | 28 +- kernel/ia64/ztrsm_kernel_RT.S | 28 +- kernel/mips64/KERNEL.LOONGSON3A | 2 +- kernel/mips64/KERNEL.LOONGSON3B | 4 +- kernel/mips64/amax.S | 4 +- kernel/mips64/amin.S | 4 +- kernel/mips64/asum.S | 4 +- kernel/mips64/axpy.S | 2 +- kernel/mips64/axpy_loongson3a.S | 46 +- kernel/mips64/cgemm_kernel_loongson3a_2x2.S | 114 +- kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S | 102 +- kernel/mips64/cgemm_kernel_loongson3b_2x2.S | 114 +- kernel/mips64/cnrm2.S | 10 +- kernel/mips64/copy.S | 4 +- kernel/mips64/daxpy_loongson3a_simd.S | 104 +- kernel/mips64/dgemm_kernel_loongson3a_4x4.S | 484 +++--- kernel/mips64/dgemm_kernel_loongson3b_4x4.S | 414 ++--- kernel/mips64/dnrm2.S | 4 +- kernel/mips64/dot.S | 6 +- kernel/mips64/gemm_beta.S | 2 +- kernel/mips64/gemm_kernel.S | 6 +- kernel/mips64/gemv_n.S | 4 +- kernel/mips64/gemv_n_loongson3a.c | 4 +- kernel/mips64/gemv_t.S | 16 +- kernel/mips64/gemv_t_loongson3a.c | 2 +- kernel/mips64/iamax.S | 6 +- kernel/mips64/iamin.S | 6 +- kernel/mips64/imax.S | 6 +- kernel/mips64/imin.S | 6 +- kernel/mips64/izamax.S | 8 +- kernel/mips64/izamin.S | 8 +- kernel/mips64/max.S | 4 +- kernel/mips64/min.S | 4 +- kernel/mips64/rot.S | 4 +- kernel/mips64/scal.S | 6 +- kernel/mips64/sgemm_kernel_8x4_ps.S | 400 ++--- kernel/mips64/sgemm_kernel_loongson3a_4x4.S | 414 ++--- kernel/mips64/sgemm_kernel_loongson3b_4x4.S | 414 ++--- kernel/mips64/snrm2.S | 10 +- kernel/mips64/swap.S | 2 +- kernel/mips64/symv_L.S | 2 +- kernel/mips64/symv_U.S | 2 +- kernel/mips64/trsm_kernel_LN.S | 4 +- kernel/mips64/trsm_kernel_LN_loongson3a.S | 270 +-- kernel/mips64/trsm_kernel_LT.S | 4 +- kernel/mips64/trsm_kernel_LT_loongson3a.S | 244 +-- kernel/mips64/trsm_kernel_RN_loongson3a.S | 162 +- kernel/mips64/trsm_kernel_RT.S | 4 +- kernel/mips64/trsm_kernel_RT_loongson3a.S | 204 +-- kernel/mips64/zamax.S | 4 +- kernel/mips64/zamin.S | 4 +- kernel/mips64/zasum.S | 4 +- kernel/mips64/zaxpy.S | 4 +- kernel/mips64/zcopy.S | 4 +- kernel/mips64/zdot.S | 6 +- kernel/mips64/zgemm3m_kernel.S | 6 +- kernel/mips64/zgemm_kernel.S | 4 +- kernel/mips64/zgemm_kernel_loongson3a_2x2.S | 160 +- kernel/mips64/zgemm_kernel_loongson3b_2x2.S | 114 +- kernel/mips64/zgemv_n.S | 2 +- kernel/mips64/zgemv_n_loongson3a.c | 2 +- kernel/mips64/zgemv_t.S | 12 +- kernel/mips64/zgemv_t_loongson3a.c | 2 +- kernel/mips64/znrm2.S | 4 +- kernel/mips64/zrot.S | 4 +- kernel/mips64/zscal.S | 4 +- kernel/mips64/zswap.S | 2 +- kernel/mips64/zsymv_L.S | 2 +- kernel/mips64/zsymv_U.S | 4 +- kernel/mips64/ztrsm_kernel_LT.S | 4 +- kernel/mips64/ztrsm_kernel_RT.S | 2 +- kernel/power/KERNEL.CELL | 4 +- kernel/power/KERNEL.POWER5 | 12 +- kernel/power/KERNEL.POWER6 | 4 +- kernel/power/KERNEL.PPC440 | 12 +- kernel/power/KERNEL.PPC970 | 4 +- kernel/power/KERNEL.PPCG4 | 4 +- kernel/power/amax.S | 4 +- kernel/power/amax_cell.S | 6 +- kernel/power/amax_hummer.S | 6 +- kernel/power/amax_ppc440.S | 4 +- kernel/power/amin.S | 4 +- kernel/power/amin_cell.S | 6 +- kernel/power/amin_hummer.S | 6 +- kernel/power/amin_ppc440.S | 4 +- kernel/power/asum.S | 4 +- kernel/power/asum_cell.S | 4 +- kernel/power/asum_hummer.S | 6 +- kernel/power/asum_ppc440.S | 4 +- kernel/power/axpy.S | 16 +- kernel/power/axpy_hummer.S | 6 +- kernel/power/axpy_ppc440.S | 12 +- kernel/power/cnrm2.S | 4 +- kernel/power/cnrm2_hummer.S | 8 +- kernel/power/cnrm2_ppc440.S | 8 +- kernel/power/copy.S | 6 +- kernel/power/copy_hummer.S | 18 +- kernel/power/dnrm2_hummer.S | 10 +- kernel/power/dnrm2_ppc440.S | 6 +- kernel/power/dot.S | 4 +- kernel/power/dot_cell.S | 2 +- kernel/power/dot_hummer.S | 8 +- kernel/power/dot_ppc440.S | 2 +- kernel/power/exfunc.S | 2 +- kernel/power/gemm_beta.S | 6 +- kernel/power/gemm_kernel.S | 14 +- kernel/power/gemm_kernel_altivec.S | 10 +- kernel/power/gemm_kernel_altivec_cell.S | 10 +- kernel/power/gemm_kernel_altivec_g4.S | 10 +- kernel/power/gemm_kernel_cell.S | 12 +- kernel/power/gemm_kernel_g4.S | 12 +- kernel/power/gemm_kernel_hummer.S | 24 +- kernel/power/gemm_kernel_power3.S | 10 +- kernel/power/gemm_kernel_power6.S | 8 +- kernel/power/gemm_kernel_ppc440.S | 10 +- kernel/power/gemm_ncopy_4.S | 10 +- kernel/power/gemm_ncopy_hummer_4.S | 16 +- kernel/power/gemm_ncopy_hummer_8.S | 26 +- kernel/power/gemm_tcopy_4.S | 12 +- kernel/power/gemm_tcopy_hummer_4.S | 18 +- kernel/power/gemm_tcopy_hummer_8.S | 22 +- kernel/power/gemv_hummer_n.S | 6 +- kernel/power/gemv_n.S | 2 +- kernel/power/gemv_t.S | 12 +- kernel/power/gemv_t_ppc440.S | 6 +- kernel/power/ger.S | 4 +- kernel/power/iamax.S | 8 +- kernel/power/iamax_hummer.S | 6 +- kernel/power/iamax_ppc440.S | 8 +- kernel/power/iamin.S | 8 +- kernel/power/iamin_hummer.S | 6 +- kernel/power/iamin_ppc440.S | 8 +- kernel/power/imax.S | 4 +- kernel/power/imax_hummer.S | 6 +- kernel/power/imax_ppc440.S | 4 +- kernel/power/imin.S | 4 +- kernel/power/imin_hummer.S | 6 +- kernel/power/imin_ppc440.S | 6 +- kernel/power/izamax.S | 8 +- kernel/power/izamax_hummer.S | 6 +- kernel/power/izamax_ppc440.S | 6 +- kernel/power/izamin.S | 4 +- kernel/power/izamin_hummer.S | 6 +- kernel/power/izamin_ppc440.S | 6 +- kernel/power/max.S | 4 +- kernel/power/max_hummer.S | 6 +- kernel/power/max_ppc440.S | 4 +- kernel/power/min.S | 4 +- kernel/power/min_hummer.S | 6 +- kernel/power/min_ppc440.S | 4 +- kernel/power/nrm2.S | 4 +- kernel/power/rot.S | 4 +- kernel/power/rot_ppc440.S | 4 +- kernel/power/scal.S | 4 +- kernel/power/scal_hummer.S | 6 +- kernel/power/scal_ppc440.S | 4 +- kernel/power/snrm2.S | 4 +- kernel/power/snrm2_hummer.S | 4 +- kernel/power/snrm2_ppc440.S | 6 +- kernel/power/swap.S | 12 +- kernel/power/swap_hummer.S | 12 +- kernel/power/symv_L.S | 12 +- kernel/power/symv_U.S | 18 +- kernel/power/trsm_kernel_LN.S | 18 +- kernel/power/trsm_kernel_LT.S | 20 +- kernel/power/trsm_kernel_RT.S | 20 +- kernel/power/trsm_kernel_cell_LN.S | 20 +- kernel/power/trsm_kernel_cell_LT.S | 22 +- kernel/power/trsm_kernel_cell_RT.S | 20 +- kernel/power/trsm_kernel_hummer_LN.S | 16 +- kernel/power/trsm_kernel_hummer_LT.S | 16 +- kernel/power/trsm_kernel_hummer_RT.S | 16 +- kernel/power/trsm_kernel_power6_LN.S | 16 +- kernel/power/trsm_kernel_power6_LT.S | 18 +- kernel/power/trsm_kernel_power6_RT.S | 18 +- kernel/power/trsm_kernel_ppc440_LN.S | 16 +- kernel/power/trsm_kernel_ppc440_LT.S | 18 +- kernel/power/trsm_kernel_ppc440_RT.S | 18 +- kernel/power/zamax.S | 4 +- kernel/power/zamax_cell.S | 4 +- kernel/power/zamax_hummer.S | 6 +- kernel/power/zamax_ppc440.S | 6 +- kernel/power/zamin.S | 4 +- kernel/power/zamin_cell.S | 4 +- kernel/power/zamin_hummer.S | 6 +- kernel/power/zamin_ppc440.S | 4 +- kernel/power/zasum.S | 4 +- kernel/power/zasum_cell.S | 4 +- kernel/power/zasum_hummer.S | 6 +- kernel/power/zasum_ppc440.S | 4 +- kernel/power/zaxpy.S | 8 +- kernel/power/zaxpy_hummer.S | 6 +- kernel/power/zaxpy_ppc440.S | 6 +- kernel/power/zcopy.S | 6 +- kernel/power/zcopy_hummer.S | 14 +- kernel/power/zdot.S | 8 +- kernel/power/zdot_cell.S | 8 +- kernel/power/zdot_hummer.S | 8 +- kernel/power/zdot_ppc440.S | 6 +- kernel/power/zgemm_beta.S | 10 +- kernel/power/zgemm_kernel.S | 10 +- kernel/power/zgemm_kernel_altivec.S | 4 +- kernel/power/zgemm_kernel_altivec_cell.S | 4 +- kernel/power/zgemm_kernel_altivec_g4.S | 4 +- kernel/power/zgemm_kernel_cell.S | 6 +- kernel/power/zgemm_kernel_g4.S | 4 +- kernel/power/zgemm_kernel_hummer.S | 16 +- kernel/power/zgemm_kernel_power3.S | 8 +- kernel/power/zgemm_kernel_power6.S | 16 +- kernel/power/zgemm_kernel_ppc440.S | 4 +- kernel/power/zgemm_ncopy_hummer_2.S | 12 +- kernel/power/zgemm_ncopy_hummer_4.S | 16 +- kernel/power/zgemm_tcopy_hummer_2.S | 12 +- kernel/power/zgemm_tcopy_hummer_4.S | 16 +- kernel/power/zgemv_n.S | 4 +- kernel/power/zgemv_n_ppc440.S | 4 +- kernel/power/zgemv_t.S | 12 +- kernel/power/zgemv_t_ppc440.S | 8 +- kernel/power/zger.S | 4 +- kernel/power/znrm2.S | 4 +- kernel/power/znrm2_hummer.S | 10 +- kernel/power/znrm2_ppc440.S | 6 +- kernel/power/zrot.S | 6 +- kernel/power/zrot_ppc440.S | 4 +- kernel/power/zscal.S | 6 +- kernel/power/zscal_hummer.S | 6 +- kernel/power/zscal_ppc440.S | 6 +- kernel/power/zswap.S | 14 +- kernel/power/zswap_hummer.S | 12 +- kernel/power/zsymv_L.S | 6 +- kernel/power/zsymv_U.S | 6 +- kernel/power/ztrsm_kernel_LN.S | 10 +- kernel/power/ztrsm_kernel_LT.S | 10 +- kernel/power/ztrsm_kernel_RT.S | 12 +- kernel/power/ztrsm_kernel_cell_LN.S | 10 +- kernel/power/ztrsm_kernel_cell_LT.S | 10 +- kernel/power/ztrsm_kernel_cell_RT.S | 12 +- kernel/power/ztrsm_kernel_hummer_LN.S | 12 +- kernel/power/ztrsm_kernel_hummer_LT.S | 8 +- kernel/power/ztrsm_kernel_hummer_RT.S | 10 +- kernel/power/ztrsm_kernel_power6_LN.S | 24 +- kernel/power/ztrsm_kernel_power6_LT.S | 24 +- kernel/power/ztrsm_kernel_power6_RT.S | 24 +- kernel/power/ztrsm_kernel_ppc440_LN.S | 8 +- kernel/power/ztrsm_kernel_ppc440_LT.S | 8 +- kernel/power/ztrsm_kernel_ppc440_RT.S | 10 +- kernel/setparam-ref.c | 158 +- kernel/sparc/KERNEL.sparc | 4 +- kernel/sparc/axpy.S | 2 +- kernel/sparc/cabs.S | 2 +- kernel/sparc/dnrm2.S | 2 +- kernel/sparc/dot.S | 6 +- kernel/sparc/gemm_kernel_2x8.S | 4 +- kernel/sparc/gemv_n.S | 12 +- kernel/sparc/gemv_t.S | 8 +- kernel/sparc/ger.S | 6 +- kernel/sparc/imax.S | 2 +- kernel/sparc/lsame.S | 2 +- kernel/sparc/max.S | 2 +- kernel/sparc/rot.S | 4 +- kernel/sparc/scal.S | 2 +- kernel/sparc/swap.S | 2 +- kernel/sparc/trsm_kernel_LN_2x8.S | 2 +- kernel/sparc/trsm_kernel_LT_2x8.S | 2 +- kernel/sparc/trsm_kernel_RT.S | 2 +- kernel/sparc/trsm_kernel_RT_2x8.S | 2 +- kernel/sparc/zamax.S | 2 +- kernel/sparc/zasum.S | 2 +- kernel/sparc/zgemm_kernel.S | 4 +- kernel/sparc/zgemm_kernel_1x4.S | 6 +- kernel/sparc/zgemv_n.S | 6 +- kernel/sparc/zgemv_t.S | 6 +- kernel/sparc/znrm2.S | 2 +- kernel/sparc/zrot.S | 4 +- kernel/sparc/zscal.S | 2 +- kernel/sparc/zswap.S | 2 +- kernel/sparc/ztrsm_kernel_LN.S | 6 +- kernel/sparc/ztrsm_kernel_LT.S | 6 +- kernel/sparc/ztrsm_kernel_LT_1x4.S | 8 +- kernel/sparc/ztrsm_kernel_RT.S | 6 +- kernel/sparc/ztrsm_kernel_RT_1x4.S | 6 +- kernel/x86/KERNEL.ATOM | 8 +- kernel/x86/KERNEL.BARCELONA | 14 +- kernel/x86/KERNEL.BOBCAT | 14 +- kernel/x86/KERNEL.BULLDOZER | 14 +- kernel/x86/KERNEL.DUNNINGTON | 8 +- kernel/x86/KERNEL.OPTERON | 14 +- kernel/x86/KERNEL.PENRYN | 8 +- kernel/x86/KERNEL.PILEDRIVER | 14 +- kernel/x86/KERNEL.PRESCOTT | 14 +- kernel/x86/KERNEL.YONAH | 14 +- kernel/x86/amax.S | 46 +- kernel/x86/amax_sse.S | 20 +- kernel/x86/amax_sse2.S | 20 +- kernel/x86/asum.S | 10 +- kernel/x86/asum_sse.S | 8 +- kernel/x86/asum_sse2.S | 10 +- kernel/x86/axpy.S | 4 +- kernel/x86/axpy_sse.S | 6 +- kernel/x86/axpy_sse2.S | 6 +- kernel/x86/axpy_sse2_opteron.S | 4 +- kernel/x86/copy.S | 20 +- kernel/x86/copy_sse.S | 2 +- kernel/x86/copy_sse2.S | 2 +- kernel/x86/cpuid.S | 2 +- kernel/x86/dot.S | 6 +- kernel/x86/dot_amd.S | 6 +- kernel/x86/dot_sse.S | 10 +- kernel/x86/dot_sse2.S | 6 +- kernel/x86/dot_sse2_opteron.S | 6 +- kernel/x86/dot_sse_opteron.S | 8 +- kernel/x86/gemm_beta.S | 4 +- kernel/x86/gemm_kernel_1x4.S | 20 +- kernel/x86/gemm_kernel_2x2.S | 34 +- kernel/x86/gemm_kernel_2x2_atom.S | 24 +- kernel/x86/gemm_kernel_2x4_3dnow.S | 34 +- kernel/x86/gemm_kernel_2x4_barcelona.S | 34 +- kernel/x86/gemm_kernel_2x4_core2.S | 28 +- kernel/x86/gemm_kernel_2x4_penryn.S | 28 +- kernel/x86/gemm_kernel_2x4_sse2.S | 50 +- kernel/x86/gemm_kernel_2x4_sse3.S | 36 +- kernel/x86/gemm_kernel_4x2_core2.S | 30 +- kernel/x86/gemm_kernel_4x2_sse2.S | 68 +- kernel/x86/gemm_kernel_4x4_barcelona.S | 64 +- kernel/x86/gemm_kernel_4x4_penryn.S | 28 +- kernel/x86/gemm_kernel_4x4_sse.S | 66 +- kernel/x86/gemm_kernel_4x4_sse3.S | 62 +- kernel/x86/gemm_kernel_8x1_sse2.S | 20 +- kernel/x86/gemm_kernel_8x2_core2.S | 34 +- kernel/x86/gemm_kernel_8x2_sse.S | 104 +- kernel/x86/gemm_ncopy_2.S | 2 +- kernel/x86/gemm_ncopy_2_sse.S | 4 +- kernel/x86/gemm_ncopy_4_sse.S | 4 +- kernel/x86/gemm_tcopy_2.S | 4 +- kernel/x86/gemm_tcopy_2_sse.S | 4 +- kernel/x86/gemm_tcopy_4_sse.S | 4 +- kernel/x86/gemv_n.S | 10 +- kernel/x86/gemv_n_atom.S | 6 +- kernel/x86/gemv_n_sse.S | 6 +- kernel/x86/gemv_n_sse2.S | 6 +- kernel/x86/gemv_t.S | 4 +- kernel/x86/gemv_t_atom.S | 12 +- kernel/x86/gemv_t_sse.S | 20 +- kernel/x86/gemv_t_sse2.S | 14 +- kernel/x86/iamax.S | 46 +- kernel/x86/iamax_sse.S | 42 +- kernel/x86/iamax_sse2.S | 44 +- kernel/x86/izamax.S | 52 +- kernel/x86/izamax_sse.S | 26 +- kernel/x86/izamax_sse2.S | 16 +- kernel/x86/nrm2.S | 10 +- kernel/x86/nrm2_sse.S | 12 +- kernel/x86/qaxpy.S | 4 +- kernel/x86/qdot.S | 2 +- kernel/x86/qgemm_kernel_2x2.S | 38 +- kernel/x86/qgemv_n.S | 10 +- kernel/x86/qgemv_t.S | 4 +- kernel/x86/qtrsm_kernel_LN_2x2.S | 28 +- kernel/x86/qtrsm_kernel_LT_2x2.S | 28 +- kernel/x86/qtrsm_kernel_RT_2x2.S | 28 +- kernel/x86/rot.S | 8 +- kernel/x86/rot_sse.S | 6 +- kernel/x86/rot_sse2.S | 4 +- kernel/x86/scal_sse.S | 6 +- kernel/x86/scal_sse2.S | 6 +- kernel/x86/swap.S | 2 +- kernel/x86/swap_sse.S | 10 +- kernel/x86/swap_sse2.S | 4 +- kernel/x86/trsm_kernel_LN_2x2.S | 24 +- kernel/x86/trsm_kernel_LN_2x2_atom.S | 16 +- kernel/x86/trsm_kernel_LN_2x4_penryn.S | 24 +- kernel/x86/trsm_kernel_LN_2x4_sse2.S | 44 +- kernel/x86/trsm_kernel_LN_2x4_sse3.S | 22 +- kernel/x86/trsm_kernel_LN_4x2_core2.S | 54 +- kernel/x86/trsm_kernel_LN_4x2_sse2.S | 62 +- kernel/x86/trsm_kernel_LN_4x4_penryn.S | 28 +- kernel/x86/trsm_kernel_LN_4x4_sse.S | 46 +- kernel/x86/trsm_kernel_LN_8x2_sse.S | 78 +- kernel/x86/trsm_kernel_LT_1x4.S | 22 +- kernel/x86/trsm_kernel_LT_2x2.S | 24 +- kernel/x86/trsm_kernel_LT_2x2_atom.S | 16 +- kernel/x86/trsm_kernel_LT_2x4_penryn.S | 22 +- kernel/x86/trsm_kernel_LT_2x4_sse2.S | 44 +- kernel/x86/trsm_kernel_LT_2x4_sse3.S | 22 +- kernel/x86/trsm_kernel_LT_4x2_core2.S | 54 +- kernel/x86/trsm_kernel_LT_4x2_sse2.S | 60 +- kernel/x86/trsm_kernel_LT_4x4_penryn.S | 28 +- kernel/x86/trsm_kernel_LT_4x4_sse.S | 46 +- kernel/x86/trsm_kernel_LT_8x2_sse.S | 78 +- kernel/x86/trsm_kernel_RT_1x4.S | 22 +- kernel/x86/trsm_kernel_RT_2x2.S | 24 +- kernel/x86/trsm_kernel_RT_2x2_atom.S | 16 +- kernel/x86/trsm_kernel_RT_2x4_penryn.S | 24 +- kernel/x86/trsm_kernel_RT_2x4_sse2.S | 44 +- kernel/x86/trsm_kernel_RT_2x4_sse3.S | 22 +- kernel/x86/trsm_kernel_RT_4x2_core2.S | 54 +- kernel/x86/trsm_kernel_RT_4x2_sse2.S | 60 +- kernel/x86/trsm_kernel_RT_4x4_penryn.S | 28 +- kernel/x86/trsm_kernel_RT_4x4_sse.S | 46 +- kernel/x86/trsm_kernel_RT_8x2_sse.S | 78 +- kernel/x86/xaxpy.S | 4 +- kernel/x86/xdot.S | 2 +- kernel/x86/xgemm3m_kernel_2x2.S | 38 +- kernel/x86/xgemm_kernel_1x1.S | 26 +- kernel/x86/xgemv_n.S | 2 +- kernel/x86/xgemv_t.S | 2 +- kernel/x86/xtrsm_kernel_LT_1x1.S | 26 +- kernel/x86/zamax.S | 52 +- kernel/x86/zamax_sse.S | 18 +- kernel/x86/zamax_sse2.S | 12 +- kernel/x86/zasum.S | 10 +- kernel/x86/zasum_sse.S | 16 +- kernel/x86/zasum_sse2.S | 10 +- kernel/x86/zaxpy.S | 2 +- kernel/x86/zaxpy_sse.S | 6 +- kernel/x86/zaxpy_sse2.S | 8 +- kernel/x86/zcopy.S | 30 +- kernel/x86/zcopy_sse.S | 4 +- kernel/x86/zcopy_sse2.S | 2 +- kernel/x86/zdot.S | 6 +- kernel/x86/zdot_amd.S | 6 +- kernel/x86/zdot_sse.S | 2 +- kernel/x86/zdot_sse2.S | 234 +-- kernel/x86/zgemm3m_kernel_1x4_athlon.S | 30 +- kernel/x86/zgemm3m_kernel_2x2_atom.S | 24 +- kernel/x86/zgemm3m_kernel_2x2_coppermine.S | 32 +- kernel/x86/zgemm3m_kernel_2x4_barcelona.S | 34 +- kernel/x86/zgemm3m_kernel_2x4_opteron.S | 50 +- kernel/x86/zgemm3m_kernel_2x4_penryn.S | 28 +- kernel/x86/zgemm3m_kernel_2x4_prescott.S | 36 +- kernel/x86/zgemm3m_kernel_4x2_core2.S | 28 +- kernel/x86/zgemm3m_kernel_4x2_northwood.S | 66 +- kernel/x86/zgemm3m_kernel_4x4_barcelona.S | 64 +- kernel/x86/zgemm3m_kernel_4x4_opteron.S | 64 +- kernel/x86/zgemm3m_kernel_4x4_penryn.S | 30 +- kernel/x86/zgemm3m_kernel_4x4_prescott.S | 62 +- kernel/x86/zgemm3m_kernel_8x2_core2.S | 34 +- kernel/x86/zgemm3m_kernel_8x2_sse.S | 102 +- kernel/x86/zgemm_beta.S | 2 +- kernel/x86/zgemm_kernel_1x1.S | 8 +- kernel/x86/zgemm_kernel_1x1_atom.S | 14 +- kernel/x86/zgemm_kernel_1x2.S | 20 +- kernel/x86/zgemm_kernel_1x2_3dnow.S | 16 +- kernel/x86/zgemm_kernel_1x2_barcelona.S | 20 +- kernel/x86/zgemm_kernel_1x2_penryn.S | 22 +- kernel/x86/zgemm_kernel_1x2_sse2.S | 32 +- kernel/x86/zgemm_kernel_1x2_sse3.S | 28 +- kernel/x86/zgemm_kernel_2x1_core2.S | 20 +- kernel/x86/zgemm_kernel_2x1_sse2.S | 28 +- kernel/x86/zgemm_kernel_2x2_barcelona.S | 46 +- kernel/x86/zgemm_kernel_2x2_penryn.S | 46 +- kernel/x86/zgemm_kernel_2x2_sse.S | 48 +- kernel/x86/zgemm_kernel_2x2_sse3.S | 46 +- kernel/x86/zgemm_kernel_4x1_core2.S | 28 +- kernel/x86/zgemm_kernel_4x1_sse.S | 48 +- kernel/x86/zgemm_ncopy_2.S | 2 +- kernel/x86/zgemm_tcopy_2.S | 4 +- kernel/x86/zgemv_n.S | 4 +- kernel/x86/zgemv_n_atom.S | 6 +- kernel/x86/zgemv_n_sse.S | 8 +- kernel/x86/zgemv_n_sse2.S | 8 +- kernel/x86/zgemv_t.S | 2 +- kernel/x86/zgemv_t_atom.S | 10 +- kernel/x86/zgemv_t_sse.S | 16 +- kernel/x86/zgemv_t_sse2.S | 10 +- kernel/x86/znrm2.S | 10 +- kernel/x86/znrm2_sse.S | 12 +- kernel/x86/zrot.S | 8 +- kernel/x86/zrot_sse.S | 4 +- kernel/x86/zrot_sse2.S | 2 +- kernel/x86/zscal.S | 2 +- kernel/x86/zscal_sse.S | 2 +- kernel/x86/zscal_sse2.S | 16 +- kernel/x86/zswap.S | 2 +- kernel/x86/zswap_sse.S | 8 +- kernel/x86/zswap_sse2.S | 2 +- kernel/x86/ztrsm_kernel_LN_2x1_core2.S | 14 +- kernel/x86/ztrsm_kernel_LN_2x1_sse2.S | 18 +- kernel/x86/ztrsm_kernel_LN_2x2_penryn.S | 42 +- kernel/x86/ztrsm_kernel_LN_2x2_sse.S | 30 +- kernel/x86/ztrsm_kernel_LN_4x1_sse.S | 24 +- kernel/x86/ztrsm_kernel_LT_1x1.S | 8 +- kernel/x86/ztrsm_kernel_LT_1x1_atom.S | 12 +- kernel/x86/ztrsm_kernel_LT_1x2_penryn.S | 20 +- kernel/x86/ztrsm_kernel_LT_1x2_sse2.S | 28 +- kernel/x86/ztrsm_kernel_LT_1x2_sse3.S | 20 +- kernel/x86/ztrsm_kernel_LT_2x1_core2.S | 16 +- kernel/x86/ztrsm_kernel_LT_2x1_sse2.S | 18 +- kernel/x86/ztrsm_kernel_LT_2x2_penryn.S | 42 +- kernel/x86/ztrsm_kernel_LT_2x2_sse.S | 28 +- kernel/x86/ztrsm_kernel_LT_4x1_sse.S | 24 +- kernel/x86/ztrsm_kernel_RT_1x2_penryn.S | 20 +- kernel/x86/ztrsm_kernel_RT_1x2_sse2.S | 24 +- kernel/x86/ztrsm_kernel_RT_1x2_sse3.S | 20 +- kernel/x86/ztrsm_kernel_RT_2x2_penryn.S | 42 +- kernel/x86/ztrsm_kernel_RT_2x2_sse.S | 28 +- kernel/x86_64/KERNEL | 2 +- kernel/x86_64/KERNEL.ATOM | 4 +- kernel/x86_64/KERNEL.BARCELONA | 4 +- kernel/x86_64/KERNEL.BOBCAT | 4 +- kernel/x86_64/KERNEL.BULLDOZER | 4 +- kernel/x86_64/KERNEL.CORE2 | 4 +- kernel/x86_64/KERNEL.DUNNINGTON | 4 +- kernel/x86_64/KERNEL.HASWELL | 8 +- kernel/x86_64/KERNEL.NANO | 4 +- kernel/x86_64/KERNEL.NEHALEM | 4 +- kernel/x86_64/KERNEL.OPTERON | 4 +- kernel/x86_64/KERNEL.OPTERON_SSE3 | 4 +- kernel/x86_64/KERNEL.PENRYN | 4 +- kernel/x86_64/KERNEL.PILEDRIVER | 4 +- kernel/x86_64/KERNEL.PRESCOTT | 4 +- kernel/x86_64/KERNEL.SANDYBRIDGE | 8 +- kernel/x86_64/amax.S | 44 +- kernel/x86_64/amax_atom.S | 16 +- kernel/x86_64/amax_sse.S | 18 +- kernel/x86_64/amax_sse2.S | 18 +- kernel/x86_64/asum.S | 6 +- kernel/x86_64/asum_atom.S | 22 +- kernel/x86_64/asum_sse.S | 14 +- kernel/x86_64/asum_sse2.S | 16 +- kernel/x86_64/axpy.S | 6 +- kernel/x86_64/axpy_atom.S | 2 +- kernel/x86_64/axpy_sse.S | 8 +- kernel/x86_64/axpy_sse2.S | 6 +- kernel/x86_64/builtin_stinit.S | 2 +- kernel/x86_64/cabs.S | 6 +- kernel/x86_64/cgemm_kernel_4x8_sandy.S | 108 +- kernel/x86_64/cgemv_n.S | 74 +- kernel/x86_64/cgemv_t.S | 28 +- kernel/x86_64/copy.S | 10 +- kernel/x86_64/daxpy_bulldozer.S | 6 +- kernel/x86_64/ddot_bulldozer.S | 4 +- kernel/x86_64/dgemm_kernel_4x8_sandy.S | 154 +- kernel/x86_64/dgemm_kernel_6x4_piledriver.S | 30 +- kernel/x86_64/dgemm_ncopy_2.S | 2 +- kernel/x86_64/dgemm_ncopy_4.S | 2 +- kernel/x86_64/dgemm_ncopy_8.S | 2 +- kernel/x86_64/dgemm_ncopy_8_bulldozer.S | 2 +- kernel/x86_64/dgemm_tcopy_2.S | 4 +- kernel/x86_64/dgemm_tcopy_4.S | 4 +- kernel/x86_64/dgemm_tcopy_8.S | 4 +- kernel/x86_64/dgemm_tcopy_8_bulldozer.S | 4 +- kernel/x86_64/dgemv_n.S | 26 +- kernel/x86_64/dgemv_n_atom.S | 8 +- kernel/x86_64/dgemv_n_bulldozer.S | 22 +- kernel/x86_64/dgemv_t.S | 8 +- kernel/x86_64/dgemv_t_atom.S | 10 +- kernel/x86_64/dgemv_t_bulldozer.S | 10 +- kernel/x86_64/dot_atom.S | 4 +- kernel/x86_64/dot_sse.S | 6 +- kernel/x86_64/dot_sse2.S | 4 +- kernel/x86_64/dtrsm_kernel_LT_8x2_bulldozer.S | 96 +- kernel/x86_64/dtrsm_kernel_RN_8x2_bulldozer.S | 46 +- kernel/x86_64/gemm_beta.S | 2 +- kernel/x86_64/gemm_kernel_2x8_nehalem.S | 72 +- kernel/x86_64/gemm_kernel_4x2_atom.S | 48 +- kernel/x86_64/gemm_kernel_4x4_barcelona.S | 80 +- kernel/x86_64/gemm_kernel_4x4_core2.S | 102 +- kernel/x86_64/gemm_kernel_4x4_penryn.S | 88 +- kernel/x86_64/gemm_kernel_4x4_sse2.S | 90 +- kernel/x86_64/gemm_kernel_4x4_sse3.S | 80 +- kernel/x86_64/gemm_kernel_4x8_nano.S | 106 +- kernel/x86_64/gemm_kernel_4x8_nehalem.S | 114 +- kernel/x86_64/gemm_kernel_8x4_barcelona.S | 116 +- kernel/x86_64/gemm_kernel_8x4_core2.S | 118 +- kernel/x86_64/gemm_kernel_8x4_penryn.S | 100 +- kernel/x86_64/gemm_kernel_8x4_sse.S | 120 +- kernel/x86_64/gemm_kernel_8x4_sse3.S | 114 +- kernel/x86_64/gemm_ncopy_2.S | 2 +- kernel/x86_64/gemm_ncopy_2_bulldozer.S | 2 +- kernel/x86_64/gemm_ncopy_4.S | 2 +- kernel/x86_64/gemm_ncopy_4_opteron.S | 4 +- kernel/x86_64/gemm_tcopy_2.S | 4 +- kernel/x86_64/gemm_tcopy_2_bulldozer.S | 10 +- kernel/x86_64/gemm_tcopy_4.S | 2 +- kernel/x86_64/gemm_tcopy_4_opteron.S | 2 +- kernel/x86_64/iamax.S | 44 +- kernel/x86_64/iamax_sse.S | 42 +- kernel/x86_64/iamax_sse2.S | 40 +- kernel/x86_64/izamax.S | 50 +- kernel/x86_64/izamax_sse.S | 24 +- kernel/x86_64/izamax_sse2.S | 14 +- kernel/x86_64/nrm2.S | 8 +- kernel/x86_64/nrm2_sse.S | 10 +- kernel/x86_64/qdot.S | 2 +- kernel/x86_64/qgemm_kernel_2x2.S | 40 +- kernel/x86_64/qgemv_n.S | 10 +- kernel/x86_64/qgemv_t.S | 4 +- kernel/x86_64/qtrsm_kernel_LN_2x2.S | 34 +- kernel/x86_64/qtrsm_kernel_LT_2x2.S | 34 +- kernel/x86_64/qtrsm_kernel_RT_2x2.S | 34 +- kernel/x86_64/rot.S | 6 +- kernel/x86_64/rot_sse.S | 6 +- kernel/x86_64/rot_sse2.S | 6 +- kernel/x86_64/scal_atom.S | 6 +- kernel/x86_64/scal_sse.S | 12 +- kernel/x86_64/scal_sse2.S | 10 +- kernel/x86_64/sgemm_kernel_8x4_bulldozer.S | 126 +- kernel/x86_64/sgemm_kernel_8x8_sandy.S | 150 +- kernel/x86_64/sgemv_n.S | 44 +- kernel/x86_64/sgemv_t.S | 236 +-- kernel/x86_64/swap.S | 2 +- kernel/x86_64/swap_sse.S | 6 +- kernel/x86_64/swap_sse2.S | 2 +- kernel/x86_64/symv_L_sse.S | 8 +- kernel/x86_64/symv_L_sse2.S | 8 +- kernel/x86_64/symv_U_sse.S | 8 +- kernel/x86_64/symv_U_sse2.S | 10 +- kernel/x86_64/trsm_kernel_LN_2x8_nehalem.S | 68 +- kernel/x86_64/trsm_kernel_LN_4x2_atom.S | 42 +- kernel/x86_64/trsm_kernel_LN_4x4_barcelona.S | 58 +- kernel/x86_64/trsm_kernel_LN_4x4_core2.S | 82 +- kernel/x86_64/trsm_kernel_LN_4x4_penryn.S | 76 +- kernel/x86_64/trsm_kernel_LN_4x4_sse2.S | 74 +- kernel/x86_64/trsm_kernel_LN_4x4_sse3.S | 64 +- kernel/x86_64/trsm_kernel_LN_4x8_nehalem.S | 102 +- kernel/x86_64/trsm_kernel_LN_8x4_sse.S | 86 +- kernel/x86_64/trsm_kernel_LT_2x8_nehalem.S | 70 +- kernel/x86_64/trsm_kernel_LT_4x2_atom.S | 44 +- kernel/x86_64/trsm_kernel_LT_4x4_barcelona.S | 60 +- kernel/x86_64/trsm_kernel_LT_4x4_core2.S | 82 +- kernel/x86_64/trsm_kernel_LT_4x4_penryn.S | 76 +- kernel/x86_64/trsm_kernel_LT_4x4_sse2.S | 82 +- kernel/x86_64/trsm_kernel_LT_4x4_sse3.S | 62 +- kernel/x86_64/trsm_kernel_LT_4x8_nehalem.S | 106 +- kernel/x86_64/trsm_kernel_LT_8x4_sse.S | 92 +- kernel/x86_64/trsm_kernel_RT_2x8_nehalem.S | 68 +- kernel/x86_64/trsm_kernel_RT_4x2_atom.S | 42 +- kernel/x86_64/trsm_kernel_RT_4x4_barcelona.S | 60 +- kernel/x86_64/trsm_kernel_RT_4x4_core2.S | 80 +- kernel/x86_64/trsm_kernel_RT_4x4_penryn.S | 76 +- kernel/x86_64/trsm_kernel_RT_4x4_sse2.S | 78 +- kernel/x86_64/trsm_kernel_RT_4x4_sse3.S | 62 +- kernel/x86_64/trsm_kernel_RT_4x8_nehalem.S | 106 +- kernel/x86_64/trsm_kernel_RT_8x4_sse.S | 90 +- kernel/x86_64/xdot.S | 2 +- kernel/x86_64/xgemm3m_kernel_2x2.S | 40 +- kernel/x86_64/xgemm_kernel_1x1.S | 28 +- kernel/x86_64/xgemv_n.S | 6 +- kernel/x86_64/xgemv_t.S | 4 +- kernel/x86_64/xtrsm_kernel_LT_1x1.S | 30 +- kernel/x86_64/zamax.S | 50 +- kernel/x86_64/zamax_atom.S | 12 +- kernel/x86_64/zamax_sse.S | 16 +- kernel/x86_64/zamax_sse2.S | 10 +- kernel/x86_64/zasum.S | 6 +- kernel/x86_64/zasum_atom.S | 20 +- kernel/x86_64/zasum_sse.S | 16 +- kernel/x86_64/zasum_sse2.S | 14 +- kernel/x86_64/zaxpy.S | 6 +- kernel/x86_64/zaxpy_atom.S | 6 +- kernel/x86_64/zaxpy_sse.S | 12 +- kernel/x86_64/zaxpy_sse2.S | 18 +- kernel/x86_64/zcopy.S | 2 +- kernel/x86_64/zcopy_sse.S | 4 +- kernel/x86_64/zdot.S | 2 +- kernel/x86_64/zdot_sse.S | 488 +++--- kernel/x86_64/zdot_sse2.S | 236 +-- kernel/x86_64/zgemm3m_kernel_2x8_nehalem.S | 72 +- kernel/x86_64/zgemm3m_kernel_4x2_atom.S | 16 +- kernel/x86_64/zgemm3m_kernel_4x4_barcelona.S | 108 +- kernel/x86_64/zgemm3m_kernel_4x4_core2.S | 116 +- kernel/x86_64/zgemm3m_kernel_4x4_penryn.S | 82 +- kernel/x86_64/zgemm3m_kernel_4x4_sse2.S | 108 +- kernel/x86_64/zgemm3m_kernel_4x4_sse3.S | 86 +- kernel/x86_64/zgemm3m_kernel_4x8_nehalem.S | 112 +- kernel/x86_64/zgemm3m_kernel_8x4_barcelona.S | 118 +- kernel/x86_64/zgemm3m_kernel_8x4_core2.S | 128 +- kernel/x86_64/zgemm3m_kernel_8x4_penryn.S | 106 +- kernel/x86_64/zgemm3m_kernel_8x4_sse.S | 124 +- kernel/x86_64/zgemm3m_kernel_8x4_sse3.S | 116 +- kernel/x86_64/zgemm_beta.S | 2 +- kernel/x86_64/zgemm_kernel_1x4_nehalem.S | 30 +- kernel/x86_64/zgemm_kernel_2x1_atom.S | 28 +- kernel/x86_64/zgemm_kernel_2x2_barcelona.S | 42 +- kernel/x86_64/zgemm_kernel_2x2_core2.S | 56 +- kernel/x86_64/zgemm_kernel_2x2_penryn.S | 44 +- kernel/x86_64/zgemm_kernel_2x2_sse2.S | 68 +- kernel/x86_64/zgemm_kernel_2x2_sse3.S | 48 +- kernel/x86_64/zgemm_kernel_2x4_nehalem.S | 66 +- kernel/x86_64/zgemm_kernel_4x2_barcelona.S | 78 +- kernel/x86_64/zgemm_kernel_4x2_core2.S | 72 +- kernel/x86_64/zgemm_kernel_4x2_penryn.S | 64 +- kernel/x86_64/zgemm_kernel_4x2_sse.S | 80 +- kernel/x86_64/zgemm_kernel_4x2_sse3.S | 84 +- kernel/x86_64/zgemm_kernel_4x4_sandy.S | 92 +- kernel/x86_64/zgemm_ncopy_1.S | 2 +- kernel/x86_64/zgemm_ncopy_2.S | 2 +- kernel/x86_64/zgemm_tcopy_1.S | 4 +- kernel/x86_64/zgemm_tcopy_2.S | 2 +- kernel/x86_64/zgemv_n.S | 22 +- kernel/x86_64/zgemv_n_atom.S | 8 +- kernel/x86_64/zgemv_n_dup.S | 8 +- kernel/x86_64/zgemv_t.S | 22 +- kernel/x86_64/zgemv_t_atom.S | 10 +- kernel/x86_64/zgemv_t_dup.S | 16 +- kernel/x86_64/znrm2.S | 8 +- kernel/x86_64/znrm2_sse.S | 10 +- kernel/x86_64/zrot.S | 6 +- kernel/x86_64/zrot_sse.S | 6 +- kernel/x86_64/zrot_sse2.S | 4 +- kernel/x86_64/zscal_atom.S | 6 +- kernel/x86_64/zscal_sse.S | 6 +- kernel/x86_64/zscal_sse2.S | 18 +- kernel/x86_64/zswap.S | 2 +- kernel/x86_64/zswap_sse.S | 6 +- kernel/x86_64/zsymv_L_sse.S | 8 +- kernel/x86_64/zsymv_L_sse2.S | 8 +- kernel/x86_64/zsymv_U_sse.S | 10 +- kernel/x86_64/zsymv_U_sse2.S | 8 +- kernel/x86_64/ztrsm_kernel_LN_2x1_atom.S | 24 +- kernel/x86_64/ztrsm_kernel_LN_2x2_core2.S | 46 +- kernel/x86_64/ztrsm_kernel_LN_2x2_penryn.S | 40 +- kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S | 50 +- kernel/x86_64/ztrsm_kernel_LN_2x2_sse3.S | 38 +- kernel/x86_64/ztrsm_kernel_LN_2x4_nehalem.S | 56 +- kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S | 52 +- kernel/x86_64/ztrsm_kernel_LT_1x4_nehalem.S | 32 +- kernel/x86_64/ztrsm_kernel_LT_2x1_atom.S | 26 +- kernel/x86_64/ztrsm_kernel_LT_2x2_core2.S | 50 +- kernel/x86_64/ztrsm_kernel_LT_2x2_penryn.S | 40 +- kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S | 48 +- kernel/x86_64/ztrsm_kernel_LT_2x2_sse3.S | 40 +- kernel/x86_64/ztrsm_kernel_LT_2x4_nehalem.S | 62 +- kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S | 56 +- kernel/x86_64/ztrsm_kernel_RT_1x4_nehalem.S | 32 +- kernel/x86_64/ztrsm_kernel_RT_2x2_core2.S | 48 +- kernel/x86_64/ztrsm_kernel_RT_2x2_penryn.S | 40 +- kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S | 50 +- kernel/x86_64/ztrsm_kernel_RT_2x2_sse3.S | 40 +- kernel/x86_64/ztrsm_kernel_RT_2x4_nehalem.S | 62 +- kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S | 56 +- lapack-devel.log | 14 +- lapack/getf2/getf2_k.c | 8 +- lapack/getf2/zgetf2_k.c | 10 +- lapack/getrf/getrf_parallel.c | 190 +-- lapack/getrf/getrf_parallel_omp.c | 36 +- lapack/getrf/getrf_single.c | 38 +- lapack/getrs/getrs_parallel.c | 8 +- lapack/getrs/zgetrs_parallel.c | 2 +- lapack/getrs/zgetrs_single.c | 4 +- lapack/laswp/generic/Makefile | 2 +- lapack/laswp/generic/laswp_k_1.c | 48 +- lapack/laswp/generic/laswp_k_2.c | 98 +- lapack/laswp/generic/laswp_k_4.c | 132 +- lapack/laswp/generic/laswp_k_8.c | 180 +- lapack/laswp/generic/zlaswp_k_1.c | 52 +- lapack/laswp/generic/zlaswp_k_2.c | 90 +- lapack/laswp/generic/zlaswp_k_4.c | 140 +- lapack/lauu2/lauu2_L.c | 10 +- lapack/lauu2/lauu2_U.c | 10 +- lapack/lauu2/zlauu2_L.c | 8 +- lapack/lauu2/zlauu2_U.c | 10 +- lapack/lauum/lauum_L_parallel.c | 12 +- lapack/lauum/lauum_L_single.c | 80 +- lapack/lauum/lauum_U_parallel.c | 10 +- lapack/lauum/lauum_U_single.c | 76 +- lapack/potf2/potf2_L.c | 6 +- lapack/potf2/potf2_U.c | 6 +- lapack/potf2/zpotf2_L.c | 4 +- lapack/potf2/zpotf2_U.c | 6 +- lapack/potrf/potrf_L_parallel.c | 14 +- lapack/potrf/potrf_L_single.c | 24 +- lapack/potrf/potrf_U_parallel.c | 14 +- lapack/potrf/potrf_U_single.c | 42 +- lapack/potrf/potrf_parallel.c | 114 +- lapack/trti2/trti2_L.c | 4 +- lapack/trti2/trti2_U.c | 8 +- lapack/trti2/ztrti2_L.c | 6 +- lapack/trti2/ztrti2_U.c | 10 +- lapack/trtri/trtri_L_parallel.c | 8 +- lapack/trtri/trtri_U_parallel.c | 8 +- param.h | 44 +- reference/Makefile | 8 +- reference/cspmvf.f | 2 +- reference/ctpmvf.f | 2 +- reference/sgetrff.f | 2 +- reference/sgetrsf.f | 2 +- reference/spotrff.f | 2 +- reference/strtrif.f | 2 +- reference/ztpmvf.f | 2 +- reference/ztrmvf.f | 2 +- segfaults.patch | 2 +- symcopy.h | 352 ++-- test/Makefile | 2 +- test/get_threading_model.c | 24 +- test/sblat2.f | 2 +- utest/Makefile | 2 +- utest/common_utest.h | 22 +- utest/main.c | 42 +- utest/test_amax.c | 24 +- utest/test_axpy.c | 22 +- utest/test_dotu.c | 26 +- utest/test_dsdot.c | 26 +- utest/test_fork.c | 8 +- utest/test_rot.c | 22 +- utest/test_rotmg.c | 24 +- version.h | 22 +- 1423 files changed, 21229 insertions(+), 21229 deletions(-) diff --git a/.travis.yml b/.travis.yml index 46d70a075..7d625c9dc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,7 +11,7 @@ env: before_install: - sudo apt-get update -qq - - sudo apt-get install -qq gfortran + - sudo apt-get install -qq gfortran - if [[ "$TARGET_BOX" == "WIN64" ]]; then sudo apt-get install -qq binutils-mingw-w64-x86-64 gcc-mingw-w64-x86-64 gfortran-mingw-w64-x86-64; fi - if [[ "$TARGET_BOX" == "LINUX32" ]]; then sudo apt-get install -qq gcc-multilib gfortran-multilib; fi diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 32c30e477..58748ea1c 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -31,7 +31,7 @@ * Improve the windows build. * Chen Shaohu - * Optimize GEMV on the Loongson 3A processor. + * Optimize GEMV on the Loongson 3A processor. * Luo Wen * Intern. Test Level-2 BLAS. @@ -53,11 +53,11 @@ In chronological order: * [2012-05-19] Fix building bug on FreeBSD and NetBSD. * Sylvestre Ledru - * [2012-07-01] Improve the detection of sparc. Fix building bug under + * [2012-07-01] Improve the detection of sparc. Fix building bug under Hurd and kfreebsd. * Jameson Nash - * [2012-08-20] Provide support for passing CFLAGS, FFLAGS, PFLAGS, FPFLAGS to + * [2012-08-20] Provide support for passing CFLAGS, FFLAGS, PFLAGS, FPFLAGS to make on the command line. * Alexander Nasonov @@ -80,7 +80,7 @@ In chronological order: * [2013-06-30] Add Intel Haswell support (using sandybridge optimizations). * grisuthedragon - * [2013-07-11] create openblas_get_parallel to retrieve information which parallelization + * [2013-07-11] create openblas_get_parallel to retrieve information which parallelization model is used by OpenBLAS. * Elliot Saba diff --git a/Changelog.txt b/Changelog.txt index 195d98b35..f531356d8 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -55,25 +55,25 @@ Version 0.2.7 common: * Support LSB (Linux Standard Base) 4.1. e.g. make CC=lsbcc - * Include LAPACK 3.4.2 source codes to the repo. + * Include LAPACK 3.4.2 source codes to the repo. Avoid downloading at compile time. * Add NO_PARALLEL_MAKE flag to disable parallel make. - * Create openblas_get_parallel to retrieve information which + * Create openblas_get_parallel to retrieve information which parallelization model is used by OpenBLAS. (Thank grisuthedragon) * Detect LLVM/Clang compiler. The default compiler is Clang on Mac OS X. * Change LIBSUFFIX from .lib to .a on windows. * A work-around for dtrti_U single thread bug. Replace it with LAPACK codes. (#191) x86/x86-64: - * Optimize c/zgemm, trsm, dgemv_n, ddot, daxpy, dcopy on + * Optimize c/zgemm, trsm, dgemv_n, ddot, daxpy, dcopy on AMD Bulldozer. (Thank Werner Saar) * Add Intel Haswell support (using Sandybridge optimizations). (Thank Dan Luu) * Add AMD Piledriver support (using Bulldozer optimizations). - * Fix the computational error in zgemm avx kernel on + * Fix the computational error in zgemm avx kernel on Sandybridge. (#237) * Fix the overflow bug in gemv. - * Fix the overflow bug in multi-threaded BLAS3, getrf when NUM_THREADS + * Fix the overflow bug in multi-threaded BLAS3, getrf when NUM_THREADS is very large.(#214, #221, #246). MIPS64: * Support loongcc (Open64 based) compiler for ICT Loongson 3A/B. @@ -110,7 +110,7 @@ common: * Fixed NetBSD build. (#155) * Fixed compilation with TARGET=GENERIC. (#160) x86/x86-64: - * Restore the original CPU affinity when calling + * Restore the original CPU affinity when calling openblas_set_num_threads(1) (#153) * Fixed a SEGFAULT bug in dgemv_t when m is very large.(#154) MIPS64: @@ -120,13 +120,13 @@ Version 0.2.4 8-Oct-2012 common: * Upgraded LAPACK to 3.4.2 version. (#145) - * Provided support for passing CFLAGS, FFLAGS, PFLAGS, + * Provided support for passing CFLAGS, FFLAGS, PFLAGS, FPFLAGS to make. (#137) - * f77blas.h:compatibility for compilers without C99 complex + * f77blas.h:compatibility for compilers without C99 complex number support. (#141) x86/x86-64: * Added NO_AVX flag. Check OS supporting AVX on runtime. (#139) - * Fixed zdot incompatibility ABI issue with GCC 4.7 on + * Fixed zdot incompatibility ABI issue with GCC 4.7 on Windows 32-bit. (#140) MIPS64: * Fixed the generation of shared library bug. @@ -136,14 +136,14 @@ Version 0.2.3 20-Aug-2012 common: * Fixed LAPACK unstable bug about ?laswp. (#130) - * Fixed the shared library bug about unloading the library on + * Fixed the shared library bug about unloading the library on Linux (#132). * Fixed the compilation failure on BlueGene/P (TARGET=PPC440FP2) Please use gcc and IBM xlf. (#134) x86/x86-64: - * Supported goto_set_num_threads and openblas_set_num_threads + * Supported goto_set_num_threads and openblas_set_num_threads APIs in Windows. They can set the number of threads on runtime. - + ==================================================================== Version 0.2.2 6-July-2012 @@ -191,14 +191,14 @@ x86/x86_64: * Auto-detect Intel Sandy Bridge Core i7-3xxx & Xeon E7 Westmere-EX. * Test alpha=Nan in dscale. * Fixed a SEGFAULT bug in samax on x86 windows. - + ==================================================================== Version 0.1.0 23-Mar-2012 common: * Set soname of shared library on Linux. - * Added LIBNAMESUFFIX flag in Makefile.rule. The user can use - this flag to control the library name, e.g. libopenblas.a, + * Added LIBNAMESUFFIX flag in Makefile.rule. The user can use + this flag to control the library name, e.g. libopenblas.a, libopenblas_ifort.a or libopenblas_omp.a. * Added GEMM_MULTITHREAD_THRESHOLD flag in Makefile.rule. The lib use single thread in GEMM function with small matrices. @@ -229,7 +229,7 @@ x86/x86_64: Version 0.1 alpha2.4 18-Sep-2011 common: - * Fixed a bug about installation. The header file "fblas77.h" + * Fixed a bug about installation. The header file "fblas77.h" works fine now. * Fixed #61 a building bug about setting TARGET and DYNAMIC_ARCH. * Try to handle absolute path of shared library in OSX. (#57) @@ -238,16 +238,16 @@ common: $(PREFIX)/lib x86/x86_64: - * Fixed #58 zdot/xdot SEGFAULT bug with GCC-4.6 on x86. According - to i386 calling convention, The callee should remove the first - hidden parameter.Thank Mr. John for this patch. + * Fixed #58 zdot/xdot SEGFAULT bug with GCC-4.6 on x86. According + to i386 calling convention, The callee should remove the first + hidden parameter.Thank Mr. John for this patch. ==================================================================== Version 0.1 alpha2.3 5-Sep-2011 x86/x86_64: - * Added DTB_ENTRIES into dynamic arch setting parameters. Now, + * Added DTB_ENTRIES into dynamic arch setting parameters. Now, it can read DTB_ENTRIES on runtime. (Refs issue #55 on github) ==================================================================== @@ -255,7 +255,7 @@ Version 0.1 alpha2.2 14-Jul-2011 common: - * Fixed a building bug when DYNAMIC_ARCH=1 & INTERFACE64=1. + * Fixed a building bug when DYNAMIC_ARCH=1 & INTERFACE64=1. (Refs issue #44 on github) ==================================================================== @@ -263,7 +263,7 @@ Version 0.1 alpha2.1 28-Jun-2011 common: - * Stop the build and output the error message when detecting + * Stop the build and output the error message when detecting fortran compiler failed. (Refs issue #42 on github) ==================================================================== @@ -271,16 +271,16 @@ Version 0.1 alpha2 23-Jun-2011 common: - * Fixed blasint undefined bug in file. Other software + * Fixed blasint undefined bug in file. Other software could include this header successfully(Refs issue #13 on github) - * Fixed the SEGFAULT bug on 64 cores. On SMP server, the number - of CPUs or cores should be less than or equal to 64.(Refs issue #14 + * Fixed the SEGFAULT bug on 64 cores. On SMP server, the number + of CPUs or cores should be less than or equal to 64.(Refs issue #14 on github) * Support "void goto_set_num_threads(int num_threads)" and "void openblas_set_num_threads(int num_threads)" when USE_OPENMP=1 - * Added extern "C" to support C++. Thank Tasio for the patch(Refs + * Added extern "C" to support C++. Thank Tasio for the patch(Refs issue #21 on github) - * Provided an error message when the arch is not supported.(Refs + * Provided an error message when the arch is not supported.(Refs issue #19 on github) * Fixed issue #23. Fixed a bug of f_check script about generating link flags. * Added openblas_set_num_threads for Fortran. @@ -298,7 +298,7 @@ x86/x86_64: * Work-around #27 the low performance axpy issue with small imput size & multithreads. MIPS64: - * Fixed #28 a wrong result of dsdot on Loongson3A/MIPS64. + * Fixed #28 a wrong result of dsdot on Loongson3A/MIPS64. * Optimized single/double precision BLAS Level3 on Loongson3A/MIPS64. (Refs #2) * Optimized single/double precision axpy function on Loongson3A/MIPS64. (Refs #3) @@ -307,9 +307,9 @@ Version 0.1 alpha1 20-Mar-2011 common: - * Support "make NO_LAPACK=1" to build the library without + * Support "make NO_LAPACK=1" to build the library without LAPACK functions. - * Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34. + * Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34. Thank Mr.Ei-ji Nakama providing this patch. (Refs issue #12 on github) * Added DEBUG=1 rule in Makefile.rule to build debug version. * Disable compiling quad precision in reference BLAS library(netlib BLAS). @@ -318,15 +318,15 @@ common: * Imported GotoBLAS2 1.13 BSD version x86/x86_64: - * On x86 32bits, fixed a bug in zdot_sse2.S line 191. This would casue + * On x86 32bits, fixed a bug in zdot_sse2.S line 191. This would casue zdotu & zdotc failures. Instead, work-around it. (Refs issue #8 #9 on github) - * Modified ?axpy functions to return same netlib BLAS results + * Modified ?axpy functions to return same netlib BLAS results when incx==0 or incy==0 (Refs issue #7 on github) - * Modified ?swap functions to return same netlib BLAS results + * Modified ?swap functions to return same netlib BLAS results when incx==0 or incy==0 (Refs issue #6 on github) - * Modified ?rot functions to return same netlib BLAS results + * Modified ?rot functions to return same netlib BLAS results when incx==0 or incy==0 (Refs issue #4 on github) - * Detect Intel Westmere,Intel Clarkdale and Intel Arrandale + * Detect Intel Westmere,Intel Clarkdale and Intel Arrandale to use Nehalem codes. * Fixed a typo bug about compiling dynamic ARCH library. MIPS64: diff --git a/GotoBLAS_01Readme.txt b/GotoBLAS_01Readme.txt index fdde1e3c7..8635ceb88 100644 --- a/GotoBLAS_01Readme.txt +++ b/GotoBLAS_01Readme.txt @@ -83,7 +83,7 @@ 4. Suported precision Now x86/x86_64 version support 80bit FP precision in addition to -normal double presicion and single precision. Currently only +normal double presicion and single precision. Currently only gfortran supports 80bit FP with "REAL*10". diff --git a/GotoBLAS_02QuickInstall.txt b/GotoBLAS_02QuickInstall.txt index abf380741..330c5857c 100644 --- a/GotoBLAS_02QuickInstall.txt +++ b/GotoBLAS_02QuickInstall.txt @@ -32,9 +32,9 @@ GotoBLAS2 build complete. - OS ... Linux - Architecture ... x86_64 - BINARY ... 64bit + OS ... Linux + Architecture ... x86_64 + BINARY ... 64bit C compiler ... GCC (command line : gcc) Fortran compiler ... PATHSCALE (command line : pathf90) Library Name ... libgoto_barcelonap-r1.27.a (Multi threaded; Max diff --git a/GotoBLAS_03FAQ.txt b/GotoBLAS_03FAQ.txt index be623d608..f3189ce71 100644 --- a/GotoBLAS_03FAQ.txt +++ b/GotoBLAS_03FAQ.txt @@ -56,7 +56,7 @@ 1.6 Q I use OpenMP compiler. How can I use GotoBLAS2 with it? - A Please understand that OpenMP is a compromised method to use + A Please understand that OpenMP is a compromised method to use thread. If you want to use OpenMP based code with GotoBLAS2, you should enable "USE_OPENMP=1" in Makefile.rule. diff --git a/GotoBLAS_05LargePage.txt b/GotoBLAS_05LargePage.txt index fb7de6bba..ec5106fcd 100644 --- a/GotoBLAS_05LargePage.txt +++ b/GotoBLAS_05LargePage.txt @@ -43,7 +43,7 @@ F) Other aarchitecture which doesn't have Large TLB enhancement If you have root permission, please install device driver which - located in drivers/mapper. + located in drivers/mapper. $shell> cd drivers/mapper $shell> make diff --git a/GotoBLAS_06WeirdPerformance.txt b/GotoBLAS_06WeirdPerformance.txt index 804626763..05766e17b 100644 --- a/GotoBLAS_06WeirdPerformance.txt +++ b/GotoBLAS_06WeirdPerformance.txt @@ -4,7 +4,7 @@ probably you created too many threads or process. Basically GotoBLAS assumes that available cores that you specify are exclusively for BLAS computation. Even one small thread/process conflicts with BLAS - threads, performance will become worse. + threads, performance will become worse. The best solution is to reduce your number of threads or insert some synchronization mechanism and suspend your threads until BLAS @@ -19,4 +19,4 @@ Anyway, if you see any weird performance loss, it means your code or -algorithm is not optimal. +algorithm is not optimal. diff --git a/LICENSE b/LICENSE index 1e93a6a73..d15634e8a 100644 --- a/LICENSE +++ b/LICENSE @@ -12,17 +12,17 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/Makefile b/Makefile index ad6925d1c..5b0ca0dc5 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ include ./Makefile.system BLASDIRS = interface driver/level2 driver/level3 driver/others ifneq ($(DYNAMIC_ARCH), 1) -BLASDIRS += kernel +BLASDIRS += kernel endif ifdef UTEST_CHECK @@ -153,7 +153,7 @@ endif ifeq ($(EXPRECISION), 1) @echo "#define EXPRECISION">> config_last.h endif -## +## ifeq ($(DYNAMIC_ARCH), 1) @$(MAKE) -C kernel commonlibs || exit 1 @for d in $(DYNAMIC_CORE) ; \ @@ -187,7 +187,7 @@ blas : fi; \ done -hpl : +hpl : ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) for d in $(BLASDIRS) ../laswp exports ; \ do if test -d $$d; then \ @@ -210,7 +210,7 @@ hpl_p : done ifeq ($(NO_LAPACK), 1) -netlib : +netlib : else netlib : lapack_prebuild @@ -255,7 +255,7 @@ endif -@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc endif -large.tgz : +large.tgz : ifndef NOFORTRAN if [ ! -a $< ]; then -wget http://www.netlib.org/lapack/timing/large.tgz; diff --git a/Makefile.alpha b/Makefile.alpha index 2305483d7..bd4f4d58b 100644 --- a/Makefile.alpha +++ b/Makefile.alpha @@ -50,7 +50,7 @@ endif ifndef SMP LIBCXML = -lcxml -lots -lm -LIBATLAS = -L/usr/lib/atlas3.7.8 -lf77blas -latlas -lm +LIBATLAS = -L/usr/lib/atlas3.7.8 -lf77blas -latlas -lm else LIBCXML = -lcxmlp -lots -lm LIBATLAS = -L/usr/lib/atlas3.7.8p -llapack -lptcblas -lptf77blas -latlas -lpthread -lm diff --git a/Makefile.ia64 b/Makefile.ia64 index 7ffcd1dbf..cdf3f7c6c 100644 --- a/Makefile.ia64 +++ b/Makefile.ia64 @@ -16,7 +16,7 @@ LIBMLIB = ../../level1/others/libmisc.a -L/opt/intel/fc/ia64/9.1.040/lib -L/opt LIBSCSL = -L/opt/scsl/1.4.1.0/lib -Wl,-rpath,/opt/scsl/1.4.1.0/lib -lscs ifndef SMP -LIBATLAS = -L/usr/lib/atlas3.6.0 -lf77blas -latlas -lm +LIBATLAS = -L/usr/lib/atlas3.6.0 -lf77blas -latlas -lm else LIBATLAS = -L$(HOME)/misc/lib -L/usr/lib/atlas3.6.0p -llapack -lptcblas -lptf77blas -latlas -lpthread -lm endif diff --git a/Makefile.install b/Makefile.install index 9fc8d7ad6..9a6f16dd8 100644 --- a/Makefile.install +++ b/Makefile.install @@ -22,7 +22,7 @@ install : lib.grd @-mkdir -p $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @-mkdir -p $(DESTDIR)$(OPENBLAS_BINARY_DIR) @echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) -#for inc +#for inc @echo \#ifndef OPENBLAS_CONFIG_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h @echo \#define OPENBLAS_CONFIG_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h @awk 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h @@ -50,14 +50,14 @@ ifndef NO_LAPACKE @-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h endif -#for install static library +#for install static library ifndef NO_STATIC @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @install -pm644 $(LIBNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) endif -#for install shared library +#for install shared library ifndef NO_SHARED @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ifeq ($(OSNAME), Linux) @@ -76,7 +76,7 @@ ifeq ($(OSNAME), NetBSD) @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so endif -ifeq ($(OSNAME), Darwin) +ifeq ($(OSNAME), Darwin) @-cp $(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @-install_name_tool -id $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) @-ln -fs $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dylib diff --git a/Makefile.power b/Makefile.power index c6d6aeb50..7e2b47386 100644 --- a/Makefile.power +++ b/Makefile.power @@ -5,7 +5,7 @@ FLAMEPATH = $(HOME)/flame/lib #ifeq ($(CORE), CELL) #CELL_SDK_ROOT = /opt/IBM/cell-sdk-1.1/sysroot/usr #SPU_CC = spu-gcc -#EXTRALIB += -lspe +#EXTRALIB += -lspe #endif ifeq ($(OSNAME), Linux) @@ -38,7 +38,7 @@ ASFLAGS = -a32 endif endif -# CCOMMON_OPT += -maltivec -mabi=altivec +# CCOMMON_OPT += -maltivec -mabi=altivec LIBFLAME = -L$(FLAMEPATH) -llapack2flame -lflame-lapack -lflame-base $(LIBS) @@ -57,7 +57,7 @@ endif LIBVECLIB = -framework VecLib ifndef SMP -LIBATLAS = -L/usr/lib/atlas3.7.11 -lf77blas -latlas -lg2c -lm +LIBATLAS = -L/usr/lib/atlas3.7.11 -lf77blas -latlas -lg2c -lm LIBESSL = -lessl $(ESSLPATH) ../../level1/others/libmisc.a -lm else LIBATLAS = -L/usr/lib/atlas3.7.11p -lptf77blas -latlas -lm -lpthread @@ -73,7 +73,7 @@ endif LIBVECLIB = /System/Library/Frameworks/vecLib.framework/Versions/Current/vecLib ifndef SMP -LIBATLAS = -L/usr/lib64/atlas3.7.11 -lf77blas -latlas -lg2c -lm +LIBATLAS = -L/usr/lib64/atlas3.7.11 -lf77blas -latlas -lg2c -lm LIBESSL = -lessl $(ESSLPATH) -lm else LIBATLAS = -L/usr/lib64/atlas3.7.11p -lptf77blas -latlas -lm -lpthread diff --git a/Makefile.rule b/Makefile.rule index 43697f404..fa2384045 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -1,12 +1,12 @@ # -# Beginning of user configuration +# Beginning of user configuration # # This library's version VERSION = 0.2.9 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a -# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library +# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library # is libopenblas_$(LIBNAMESUFFIX).so.0. # LIBNAMESUFFIX = omp @@ -57,11 +57,11 @@ VERSION = 0.2.9 # If you don't need CBLAS interface, please comment it in. # NO_CBLAS = 1 -# If you only want CBLAS interface without installing Fortran compiler, +# If you only want CBLAS interface without installing Fortran compiler, # please comment it in. # ONLY_CBLAS = 1 -# If you don't need LAPACK, please comment it in. +# If you don't need LAPACK, please comment it in. # If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1. # NO_LAPACK = 1 @@ -84,7 +84,7 @@ NO_WARMUP = 1 # If you want to disable CPU/Memory affinity on Linux. NO_AFFINITY = 1 -# Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers +# Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers # and OS. However, the performance is low. # NO_AVX = 1 @@ -112,8 +112,8 @@ NO_AFFINITY = 1 # If you need to synchronize FP CSR between threads (for x86/x86_64 only). # CONSISTENT_FPCSR = 1 -# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute -# with single thread. You can use this flag to avoid the overhead of multi-threading +# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute +# with single thread. You can use this flag to avoid the overhead of multi-threading # in small matrix sizes. The default value is 4. # GEMM_MULTITHREAD_THRESHOLD = 4 @@ -128,7 +128,7 @@ NO_AFFINITY = 1 # The installation directory. # PREFIX = /opt/OpenBLAS -# Common Optimization Flag; +# Common Optimization Flag; # The default -O2 is enough. # COMMON_OPT = -O2 @@ -143,5 +143,5 @@ COMMON_PROF = -pg # DEBUG = 1 # -# End of user configuration +# End of user configuration # diff --git a/Makefile.sparc b/Makefile.sparc index c58c77e1a..8895b96dd 100644 --- a/Makefile.sparc +++ b/Makefile.sparc @@ -27,7 +27,7 @@ LIBNAME = $(LIBPREFIX).a ifndef SMP LIBCXML = -L/opt/SUNWspro/lib/v9 -LIBATLAS = -L$(HOME)/misc/lib -lf77blas -latlas -lm +LIBATLAS = -L$(HOME)/misc/lib -lf77blas -latlas -lm else LIBCXML = -lcxmlp -lots -lm endif diff --git a/Makefile.system b/Makefile.system index 5bdeeaf42..1f9e8618f 100644 --- a/Makefile.system +++ b/Makefile.system @@ -35,7 +35,7 @@ include $(TOPDIR)/$(MAKEFILE_RULE) endif # -# Beginning of system configuration +# Beginning of system configuration # ifndef HOSTCC @@ -99,7 +99,7 @@ endif ifndef GEMM_MULTITHREAD_THRESHOLD GEMM_MULTITHREAD_THRESHOLD=4 endif -GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD) +GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD) ifeq ($(NO_AVX), 1) GETARCH_FLAGS += -DNO_AVX @@ -230,14 +230,14 @@ GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) ifeq ($(GCCVERSIONGT4), 1) # GCC Majar version > 4 -# It is compatible with MSVC ABI. +# It is compatible with MSVC ABI. CCOMMON_OPT += -DMS_ABI endif ifeq ($(GCCVERSIONGTEQ4), 1) ifeq ($(GCCMINORVERSIONGTEQ7), 1) # GCC Version >=4.7 -# It is compatible with MSVC ABI. +# It is compatible with MSVC ABI. CCOMMON_OPT += -DMS_ABI endif endif @@ -317,7 +317,7 @@ FCOMMON_OPT += -m128bit-long-double endif ifeq ($(C_COMPILER), CLANG) EXPRECISION = 1 -CCOMMON_OPT += -DEXPRECISION +CCOMMON_OPT += -DEXPRECISION FCOMMON_OPT += -m128bit-long-double endif endif @@ -335,7 +335,7 @@ FCOMMON_OPT += -m128bit-long-double endif ifeq ($(C_COMPILER), CLANG) EXPRECISION = 1 -CCOMMON_OPT += -DEXPRECISION +CCOMMON_OPT += -DEXPRECISION FCOMMON_OPT += -m128bit-long-double endif endif @@ -455,12 +455,12 @@ endif BINARY_DEFINED = 1 endif -ifeq ($(CORE), LOONGSON3A) +ifeq ($(CORE), LOONGSON3A) CCOMMON_OPT += -march=mips64 FCOMMON_OPT += -march=mips64 endif -ifeq ($(CORE), LOONGSON3B) +ifeq ($(CORE), LOONGSON3B) CCOMMON_OPT += -march=mips64 FCOMMON_OPT += -march=mips64 endif @@ -530,7 +530,7 @@ CCOMMON_OPT += -DF_INTERFACE_GFORT FCOMMON_OPT += -Wall #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc ifneq ($(NO_LAPACK), 1) -EXTRALIB += -lgfortran +EXTRALIB += -lgfortran endif ifdef NO_BINARY_MODE ifeq ($(ARCH), mips64) @@ -657,11 +657,11 @@ FCOMMON_OPT += -n32 else FCOMMON_OPT += -n64 endif -ifeq ($(CORE), LOONGSON3A) +ifeq ($(CORE), LOONGSON3A) FCOMMON_OPT += -loongson3 -static endif -ifeq ($(CORE), LOONGSON3B) +ifeq ($(CORE), LOONGSON3B) FCOMMON_OPT += -loongson3 -static endif @@ -687,11 +687,11 @@ CCOMMON_OPT += -n32 else CCOMMON_OPT += -n64 endif -ifeq ($(CORE), LOONGSON3A) +ifeq ($(CORE), LOONGSON3A) CCOMMON_OPT += -loongson3 -static endif -ifeq ($(CORE), LOONGSON3B) +ifeq ($(CORE), LOONGSON3B) CCOMMON_OPT += -loongson3 -static endif @@ -736,7 +736,7 @@ endif ifdef BINARY64 ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) -CCOMMON_OPT += +CCOMMON_OPT += #-DUSE64BITINT endif endif @@ -744,14 +744,14 @@ endif ifeq ($(NEED_PIC), 1) ifeq ($(C_COMPILER), IBM) -CCOMMON_OPT += -qpic=large +CCOMMON_OPT += -qpic=large else -CCOMMON_OPT += -fPIC +CCOMMON_OPT += -fPIC endif ifeq ($(F_COMPILER), SUN) FCOMMON_OPT += -pic else -FCOMMON_OPT += -fPIC +FCOMMON_OPT += -fPIC endif endif @@ -929,7 +929,7 @@ LAPACK_FPFLAGS := $(FPFLAGS) endif LAPACK_CFLAGS = $(CFLAGS) -LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H +LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) LAPACK_CFLAGS += -DLAPACK_ILP64 diff --git a/Makefile.tail b/Makefile.tail index 56f8d820c..2adede1a5 100644 --- a/Makefile.tail +++ b/Makefile.tail @@ -57,7 +57,7 @@ commonlibs :: $(COMMONOBJS) commonprof :: $(COMMONOBJS_P) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME_P) $^ -quick : +quick : $(MAKE) -C $(TOPDIR) libs bms.$(SUFFIX):bm.c $(TOPDIR)/../bench/bmcommon.h @@ -386,7 +386,7 @@ kbench_rank_k: kbench_rank_k.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(OBJS) $(LIBS smallbench: smallbench.$(SUFFIX) $(BLASOBJS) $(COMMONOBJS) $(OBJS) $(LIBS) $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) -smallbench.mkl: smallbench.$(SUFFIX) +smallbench.mkl: smallbench.$(SUFFIX) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB) bench.sun: bench.$(SUFFIX) $(OBJS) @@ -410,7 +410,7 @@ bench.acml: bench.$(SUFFIX) $(OBJS) bench.flame: bench.$(SUFFIX) $(OBJS) $(CC) -o $(@F) $(CFLAGS) $^ $(LIBFLAME) $(EXTRALIB) $(CEXTRALIB) -kbench.mkl: kbench.$(SUFFIX) $(OBJS) +kbench.mkl: kbench.$(SUFFIX) $(OBJS) $(CC) -static -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB) bench.mkl: bench.$(SUFFIX) $(OBJS) @@ -537,10 +537,10 @@ params.$(SUFFIX):param.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -c $< -o $(@F) paramd.$(SUFFIX):param.c $(TOPDIR)/../bench/bmcommon.h - $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -c $< -o $(@F) + $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -c $< -o $(@F) paramq.$(SUFFIX):param.c $(TOPDIR)/../bench/bmcommon.h - $(CC) $(CFLAGS) -DXDOUBLE -UCOMPLEX -c $< -o $(@F) + $(CC) $(CFLAGS) -DXDOUBLE -UCOMPLEX -c $< -o $(@F) paramc.$(SUFFIX):paramz.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(CFLAGS) -UDOUBLE -DCOMPLEX -c $< -o $(@F) @@ -555,10 +555,10 @@ params-ex.$(SUFFIX):param-ex.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -c $< -o $(@F) paramd-ex.$(SUFFIX):param-ex.c $(TOPDIR)/../bench/bmcommon.h - $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -c $< -o $(@F) + $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -c $< -o $(@F) paramq-ex.$(SUFFIX):param-ex.c $(TOPDIR)/../bench/bmcommon.h - $(CC) $(CFLAGS) -DXDOUBLE -UCOMPLEX -c $< -o $(@F) + $(CC) $(CFLAGS) -DXDOUBLE -UCOMPLEX -c $< -o $(@F) paramc-ex.$(SUFFIX):paramz-ex.c $(TOPDIR)/../bench/bmcommon.h $(CC) $(CFLAGS) -UDOUBLE -DCOMPLEX -c $< -o $(@F) diff --git a/Makefile.x86 b/Makefile.x86 index cd7cc9f90..a6196d365 100644 --- a/Makefile.x86 +++ b/Makefile.x86 @@ -14,7 +14,7 @@ endif # LIBMKL = -L$(MKLPATH)/32 -lmkl_lapack -lmkl_ia32 -lguide -lpthread -lm ifndef SMP -LIBATLAS = -L$(ATLAS) -lf77blas -latlas -lg2c -lm +LIBATLAS = -L$(ATLAS) -lf77blas -latlas -lg2c -lm else LIBATLAS = -L$(ATLAS) -lptf77blas -latlas -lpthread -lg2c -lm endif @@ -50,7 +50,7 @@ LIBSUNPERF = -L/opt/SUNWspro/lib/sse2 -Wl,-R,/opt/SUNWspro/lib/sse2 -lsunperf LIBVECLIB = /System/Library/Frameworks/vecLib.framework/Versions/Current/vecLib ifndef SMP -LIBATLAS = -L$(ATLASPATH)/32 -lcblas -lf77blas -latlas -lm +LIBATLAS = -L$(ATLASPATH)/32 -lcblas -lf77blas -latlas -lm else LIBATLAS = -L$(ATLASPATH)/32 -lptf77blas -lptatlas -lpthread -lm endif diff --git a/Makefile.x86_64 b/Makefile.x86_64 index c8d4b237b..1ba63278a 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -28,7 +28,7 @@ endif ifndef SMP -LIBATLAS = -L$(ATLASPATH)64 -llapack -lcblas -lf77blas -latlas -lm +LIBATLAS = -L$(ATLASPATH)64 -llapack -lcblas -lf77blas -latlas -lm else LIBATLAS = -L$(ATLASPATH)64 -llapack -lptcblas -lptf77blas -latlas -lpthread -lm endif diff --git a/README.md b/README.md index 4ae969643..2e85117c9 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.png?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS) ## Introduction -OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. +OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. Please read the documents on OpenBLAS wiki pages . @@ -74,7 +74,7 @@ Please read GotoBLAS_01Readme.txt ## Usages Link with libopenblas.a or -lopenblas for shared library. -### Set the number of threads with environment variables. +### Set the number of threads with environment variables. Examples: @@ -84,7 +84,7 @@ Examples: export GOTO_NUM_THREADS=4 - or + or export OMP_NUM_THREADS=4 @@ -92,7 +92,7 @@ The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS. If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1. -### Set the number of threads on runtime. +### Set the number of threads on runtime. We provided the below functions to control the number of threads on runtime. @@ -116,12 +116,12 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve * Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first. * Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD. * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code. -* The number of CPUs/Cores should less than or equal to 256. +* The number of CPUs/Cores should less than or equal to 256. * On Linux, OpenBLAS sets the processor affinity by default. This may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). You can build the library with NO_AFFINITY=1. -* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. +* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. ## Contributing -1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue to start a discussion around a feature idea or a bug. +1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue to start a discussion around a feature idea or a bug. 1. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes. 1. Write a test which shows that the bug was fixed or that the feature works as expected. 1. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`. diff --git a/benchmark/cholesky.c b/benchmark/cholesky.c index a40cdd211..1ae3748bb 100644 --- a/benchmark/cholesky.c +++ b/benchmark/cholesky.c @@ -78,29 +78,29 @@ int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; - + if (NULL != tv) { GetSystemTimeAsFileTime(&ft); - + tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; - + /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; + tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } - + return 0; } #endif static __inline double getmflops(int ratio, int m, double secs){ - + double mm = (double)m; double mulflops, addflops; @@ -137,7 +137,7 @@ int MAIN__(int argc, char *argv[]){ struct timeval start, stop; double time1; - argc--;argv++; + argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} @@ -148,17 +148,17 @@ int MAIN__(int argc, char *argv[]){ if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } - + if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } - + for(m = from; m <= to; m += step){ - + fprintf(stderr, "M = %6d : ", (int)m); - + for (uplos = 0; uplos < 2; uplos ++) { - + #ifndef COMPLEX if (uplos & 1) { for (j = 0; j < m; j++) { @@ -219,11 +219,11 @@ int MAIN__(int argc, char *argv[]){ fprintf(stderr, "Info = %d\n", info); exit(1); } - + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; maxerr = 0.; - + if (!(uplos & 1)) { for (j = 0; j < m; j++) { for(i = 0; i <= j; i++) { @@ -247,8 +247,8 @@ int MAIN__(int argc, char *argv[]){ } } } - - fprintf(stderr, + + fprintf(stderr, #ifdef XDOUBLE " %Le %10.3f MFlops", maxerr, #else diff --git a/benchmark/linpack.c b/benchmark/linpack.c index 02618599d..98a874208 100644 --- a/benchmark/linpack.c +++ b/benchmark/linpack.c @@ -83,22 +83,22 @@ int gettimeofday(struct timeval *tv, void *tz){ FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag; - + if (NULL != tv) { GetSystemTimeAsFileTime(&ft); - + tmpres |= ft.dwHighDateTime; tmpres <<= 32; tmpres |= ft.dwLowDateTime; - + /*converting file time to unix epoch*/ tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; + tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); } - + return 0; } @@ -154,7 +154,7 @@ int MAIN__(int argc, char *argv[]){ struct timeval start, stop; double time1, time2; - argc--;argv++; + argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} @@ -165,15 +165,15 @@ int MAIN__(int argc, char *argv[]){ if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } - + if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } - + if (( ipiv = (blasint *)malloc(sizeof(blasint) * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } - + #ifdef linux srandom(getpid()); #endif @@ -181,7 +181,7 @@ int MAIN__(int argc, char *argv[]){ fprintf(stderr, " SIZE Residual Decompose Solve Total\n"); for(m = from; m <= to; m += step){ - + fprintf(stderr, " %6d : ", (int)m); for(j = 0; j < m; j++){ @@ -189,9 +189,9 @@ int MAIN__(int argc, char *argv[]){ a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } - + for (i = 0; i < m * COMPSIZE; ++i) b[i] = 0.; - + for (j = 0; j < m; ++j) { for (i = 0; i < m * COMPSIZE; ++i) { b[i] += a[i + j * m * COMPSIZE]; @@ -208,7 +208,7 @@ int MAIN__(int argc, char *argv[]){ fprintf(stderr, "Matrix is not singular .. %d\n", info); exit(1); } - + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; gettimeofday( &start, (struct timezone *)0); @@ -221,7 +221,7 @@ int MAIN__(int argc, char *argv[]){ fprintf(stderr, "Matrix is not singular .. %d\n", info); exit(1); } - + time2 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; maxerr = 0.; @@ -239,7 +239,7 @@ int MAIN__(int argc, char *argv[]){ #endif #endif } - + #ifdef XDOUBLE fprintf(stderr," %Le ", maxerr); #else @@ -247,7 +247,7 @@ int MAIN__(int argc, char *argv[]){ #endif fprintf(stderr, - " %10.2f MFlops %10.2f MFlops %10.2f MFlops\n", + " %10.2f MFlops %10.2f MFlops %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. / 3. * (double)m * (double)m * (double)m / time1 * 1.e-6, COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / time2 * 1.e-6, COMPSIZE * COMPSIZE * (2. / 3. * (double)m * (double)m * (double)m + 2. * (double)m * (double)m) / (time1 + time2) * 1.e-6); diff --git a/c_check b/c_check index 0828a5bba..94b0bf36e 100644 --- a/c_check +++ b/c_check @@ -180,9 +180,9 @@ $linker_a = ""; { $link = `$compiler_name -c ctest2.c -o ctest2.o 2>&1 && $compiler_name $openmp -v ctest2.o -o ctest2 2>&1 && rm -f ctest2.o ctest2 ctest2.exe`; - + $link =~ s/\-Y\sP\,/\-Y/g; - + @flags = split(/[\s\,\n]/, $link); # remove leading and trailing quotes from each flag. @flags = map {s/^['"]|['"]$//g; $_} @flags; @@ -193,15 +193,15 @@ $linker_a = ""; && ($flags !~ /^-LIST:/) && ($flags !~ /^-LANG:/) ) { - $linker_L .= $flags . " " + $linker_L .= $flags . " " } if ($flags =~ /^\-Y/) { - $linker_L .= "-Wl,". $flags . " " + $linker_L .= "-Wl,". $flags . " " } - + if ( - ($flags =~ /^\-l/) + ($flags =~ /^\-l/) && ($flags !~ /gfortranbegin/) && ($flags !~ /frtbegin/) && ($flags !~ /pathfstart/) @@ -213,7 +213,7 @@ $linker_a = ""; && ($flags !~ /advapi32/) && ($flags !~ /shell32/) ) { - $linker_l .= $flags . " " + $linker_l .= $flags . " " } $linker_a .= $flags . " " if $flags =~ /\.a$/; @@ -250,9 +250,9 @@ print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64; print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne ""; if ($os eq "LINUX") { - + # @pthread = split(/\s+/, `nm /lib/libpthread.so* | grep _pthread_create`); - + # if ($pthread[2] ne "") { # print CONFFILE "#define PTHREAD_CREATE_FUNC $pthread[2]\n"; # } else { diff --git a/cblas.h b/cblas.h index 20445858c..841ad6330 100644 --- a/cblas.h +++ b/cblas.h @@ -17,13 +17,13 @@ void goto_set_num_threads(int num_threads); char* openblas_get_config(void); /* Get the parallelization type which is used by OpenBLAS */ -int openblas_get_parallel(void); +int openblas_get_parallel(void); /* OpenBLAS is compiled for sequential use */ #define OPENBLAS_SEQUENTIAL 0 /* OpenBLAS is compiled using normal threading model */ -#define OPENBLAS_THREAD 1 +#define OPENBLAS_THREAD 1 /* OpenBLAS is compiled using OpenMP threading model */ -#define OPENBLAS_OPENMP 2 +#define OPENBLAS_OPENMP 2 /* diff --git a/cblas_noconst.h b/cblas_noconst.h index 002c46b76..1f79e8188 100644 --- a/cblas_noconst.h +++ b/cblas_noconst.h @@ -17,13 +17,13 @@ void goto_set_num_threads(int num_threads); char* openblas_get_config(void); /* Get the parallelization type which is used by OpenBLAS */ -int openblas_get_parallel(void); +int openblas_get_parallel(void); /* OpenBLAS is compiled for sequential use */ #define OPENBLAS_SEQUENTIAL 0 /* OpenBLAS is compiled using normal threading model */ -#define OPENBLAS_THREAD 1 +#define OPENBLAS_THREAD 1 /* OpenBLAS is compiled using OpenMP threading model */ -#define OPENBLAS_OPENMP 2 +#define OPENBLAS_OPENMP 2 #define CBLAS_INDEX size_t diff --git a/common.h b/common.h index 49e2946e7..62d03a7df 100644 --- a/common.h +++ b/common.h @@ -531,7 +531,7 @@ static __inline void compinv(FLOAT *b, FLOAT ar, FLOAT ai){ #ifndef UNIT FLOAT ratio, den; - + if ( #ifdef XDOUBLE (fabsl(ar)) >= (fabsl(ai)) @@ -557,7 +557,7 @@ static __inline void compinv(FLOAT *b, FLOAT ar, FLOAT ai){ b[0] = ONE; b[1] = ZERO; #endif - + } #endif @@ -693,7 +693,7 @@ extern int gotoblas_profile; #ifdef __cplusplus } - + #endif /* __cplusplus */ #endif diff --git a/common_arm.h b/common_arm.h index 8c9752d9f..130100035 100644 --- a/common_arm.h +++ b/common_arm.h @@ -13,19 +13,19 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ @@ -94,7 +94,7 @@ static void __inline blas_lock(volatile BLASULONG *address){ "mov %0 , r3 \n\t" : "=r"(ret), "=r"(address) : "1"(address) - : "memory", "r2" , "r3" + : "memory", "r2" , "r3" ); @@ -143,7 +143,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ .func REALNAME ;\ REALNAME: -#define EPILOGUE +#define EPILOGUE #define PROFCODE diff --git a/common_arm64.h b/common_arm64.h index 2da0d894c..8a66a1702 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -13,19 +13,19 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ @@ -94,7 +94,7 @@ static void __inline blas_lock(volatile BLASULONG *address){ "mov %0 , r3 \n\t" : "=r"(ret), "=r"(address) : "1"(address) - : "memory", "r2" , "r3" + : "memory", "r2" , "r3" ); @@ -143,7 +143,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ .func REALNAME ;\ REALNAME: -#define EPILOGUE +#define EPILOGUE #define PROFCODE diff --git a/common_ia64.h b/common_ia64.h index 79b3c8167..8e92b5992 100644 --- a/common_ia64.h +++ b/common_ia64.h @@ -58,10 +58,10 @@ static __inline void blas_lock(volatile unsigned long *address){ unsigned long ret; - + do { while (*address) {YIELDING;}; - + __asm__ __volatile__ ("mov ar.ccv=r0\n;;\n" "cmpxchg4.acq %0=[%2],%1,ar.ccv\n" : "=r"(ret) : "r"(1), "r"(address) diff --git a/common_interface.h b/common_interface.h index 2cc1619ff..6ab3450a0 100644 --- a/common_interface.h +++ b/common_interface.h @@ -238,17 +238,17 @@ void BLASFUNC(xgeru)(blasint *, blasint *, xdouble *, xdouble *, blasint *, void BLASFUNC(xgerc)(blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); -void BLASFUNC(sgemv)(char *, blasint *, blasint *, float *, float *, blasint *, +void BLASFUNC(sgemv)(char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); -void BLASFUNC(dgemv)(char *, blasint *, blasint *, double *, double *, blasint *, +void BLASFUNC(dgemv)(char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *, double *, double *, blasint *); -void BLASFUNC(qgemv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, +void BLASFUNC(qgemv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); -void BLASFUNC(cgemv)(char *, blasint *, blasint *, float *, float *, blasint *, +void BLASFUNC(cgemv)(char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); -void BLASFUNC(zgemv)(char *, blasint *, blasint *, double *, double *, blasint *, +void BLASFUNC(zgemv)(char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *, double *, double *, blasint *); -void BLASFUNC(xgemv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, +void BLASFUNC(xgemv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(strsv) (char *, char *, char *, blasint *, float *, blasint *, @@ -257,24 +257,24 @@ void BLASFUNC(dtrsv) (char *, char *, char *, blasint *, double *, blasint *, double *, blasint *); void BLASFUNC(qtrsv) (char *, char *, char *, blasint *, xdouble *, blasint *, xdouble *, blasint *); -void BLASFUNC(ctrsv) (char *, char *, char *, blasint *, float *, blasint *, +void BLASFUNC(ctrsv) (char *, char *, char *, blasint *, float *, blasint *, float *, blasint *); -void BLASFUNC(ztrsv) (char *, char *, char *, blasint *, double *, blasint *, +void BLASFUNC(ztrsv) (char *, char *, char *, blasint *, double *, blasint *, double *, blasint *); -void BLASFUNC(xtrsv) (char *, char *, char *, blasint *, xdouble *, blasint *, +void BLASFUNC(xtrsv) (char *, char *, char *, blasint *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(strmv) (char *, char *, char *, blasint *, float *, blasint *, float *, blasint *); -void BLASFUNC(dtrmv) (char *, char *, char *, blasint *, double *, blasint *, +void BLASFUNC(dtrmv) (char *, char *, char *, blasint *, double *, blasint *, double *, blasint *); -void BLASFUNC(qtrmv) (char *, char *, char *, blasint *, xdouble *, blasint *, +void BLASFUNC(qtrmv) (char *, char *, char *, blasint *, xdouble *, blasint *, xdouble *, blasint *); -void BLASFUNC(ctrmv) (char *, char *, char *, blasint *, float *, blasint *, +void BLASFUNC(ctrmv) (char *, char *, char *, blasint *, float *, blasint *, float *, blasint *); -void BLASFUNC(ztrmv) (char *, char *, char *, blasint *, double *, blasint *, +void BLASFUNC(ztrmv) (char *, char *, char *, blasint *, double *, blasint *, double *, blasint *); -void BLASFUNC(xtrmv) (char *, char *, char *, blasint *, xdouble *, blasint *, +void BLASFUNC(xtrmv) (char *, char *, char *, blasint *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(stpsv) (char *, char *, char *, blasint *, float *, float *, blasint *); @@ -305,24 +305,24 @@ void BLASFUNC(ctbsv) (char *, char *, char *, blasint *, blasint *, float *, bl void BLASFUNC(ztbsv) (char *, char *, char *, blasint *, blasint *, double *, blasint *, double *, blasint *); void BLASFUNC(xtbsv) (char *, char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *); -void BLASFUNC(ssymv) (char *, blasint *, float *, float *, blasint *, +void BLASFUNC(ssymv) (char *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); -void BLASFUNC(dsymv) (char *, blasint *, double *, double *, blasint *, +void BLASFUNC(dsymv) (char *, blasint *, double *, double *, blasint *, double *, blasint *, double *, double *, blasint *); -void BLASFUNC(qsymv) (char *, blasint *, xdouble *, xdouble *, blasint *, +void BLASFUNC(qsymv) (char *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); -void BLASFUNC(csymv) (char *, blasint *, float *, float *, blasint *, +void BLASFUNC(csymv) (char *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); -void BLASFUNC(zsymv) (char *, blasint *, double *, double *, blasint *, +void BLASFUNC(zsymv) (char *, blasint *, double *, double *, blasint *, double *, blasint *, double *, double *, blasint *); -void BLASFUNC(xsymv) (char *, blasint *, xdouble *, xdouble *, blasint *, +void BLASFUNC(xsymv) (char *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(sspmv) (char *, blasint *, float *, float *, float *, blasint *, float *, float *, blasint *); -void BLASFUNC(dspmv) (char *, blasint *, double *, double *, +void BLASFUNC(dspmv) (char *, blasint *, double *, double *, double *, blasint *, double *, double *, blasint *); -void BLASFUNC(qspmv) (char *, blasint *, xdouble *, xdouble *, +void BLASFUNC(qspmv) (char *, blasint *, xdouble *, xdouble *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(cspmv) (char *, blasint *, float *, float *, float *, blasint *, float *, float *, blasint *); @@ -344,17 +344,17 @@ void BLASFUNC(zsyr) (char *, blasint *, double *, double *, blasint *, void BLASFUNC(xsyr) (char *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *); -void BLASFUNC(ssyr2) (char *, blasint *, float *, +void BLASFUNC(ssyr2) (char *, blasint *, float *, float *, blasint *, float *, blasint *, float *, blasint *); -void BLASFUNC(dsyr2) (char *, blasint *, double *, +void BLASFUNC(dsyr2) (char *, blasint *, double *, double *, blasint *, double *, blasint *, double *, blasint *); -void BLASFUNC(qsyr2) (char *, blasint *, xdouble *, +void BLASFUNC(qsyr2) (char *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); -void BLASFUNC(csyr2) (char *, blasint *, float *, +void BLASFUNC(csyr2) (char *, blasint *, float *, float *, blasint *, float *, blasint *, float *, blasint *); -void BLASFUNC(zsyr2) (char *, blasint *, double *, +void BLASFUNC(zsyr2) (char *, blasint *, double *, double *, blasint *, double *, blasint *, double *, blasint *); -void BLASFUNC(xsyr2) (char *, blasint *, xdouble *, +void BLASFUNC(xsyr2) (char *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); void BLASFUNC(sspr) (char *, blasint *, float *, float *, blasint *, @@ -370,17 +370,17 @@ void BLASFUNC(zspr) (char *, blasint *, double *, double *, blasint *, void BLASFUNC(xspr) (char *, blasint *, xdouble *, xdouble *, blasint *, xdouble *); -void BLASFUNC(sspr2) (char *, blasint *, float *, +void BLASFUNC(sspr2) (char *, blasint *, float *, float *, blasint *, float *, blasint *, float *); -void BLASFUNC(dspr2) (char *, blasint *, double *, +void BLASFUNC(dspr2) (char *, blasint *, double *, double *, blasint *, double *, blasint *, double *); -void BLASFUNC(qspr2) (char *, blasint *, xdouble *, +void BLASFUNC(qspr2) (char *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *); -void BLASFUNC(cspr2) (char *, blasint *, float *, +void BLASFUNC(cspr2) (char *, blasint *, float *, float *, blasint *, float *, blasint *, float *); -void BLASFUNC(zspr2) (char *, blasint *, double *, +void BLASFUNC(zspr2) (char *, blasint *, double *, double *, blasint *, double *, blasint *, double *); -void BLASFUNC(xspr2) (char *, blasint *, xdouble *, +void BLASFUNC(xspr2) (char *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *); void BLASFUNC(cher) (char *, blasint *, float *, float *, blasint *, @@ -394,25 +394,25 @@ void BLASFUNC(chpr) (char *, blasint *, float *, float *, blasint *, float * void BLASFUNC(zhpr) (char *, blasint *, double *, double *, blasint *, double *); void BLASFUNC(xhpr) (char *, blasint *, xdouble *, xdouble *, blasint *, xdouble *); -void BLASFUNC(cher2) (char *, blasint *, float *, +void BLASFUNC(cher2) (char *, blasint *, float *, float *, blasint *, float *, blasint *, float *, blasint *); -void BLASFUNC(zher2) (char *, blasint *, double *, +void BLASFUNC(zher2) (char *, blasint *, double *, double *, blasint *, double *, blasint *, double *, blasint *); -void BLASFUNC(xher2) (char *, blasint *, xdouble *, +void BLASFUNC(xher2) (char *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); -void BLASFUNC(chpr2) (char *, blasint *, float *, +void BLASFUNC(chpr2) (char *, blasint *, float *, float *, blasint *, float *, blasint *, float *); -void BLASFUNC(zhpr2) (char *, blasint *, double *, +void BLASFUNC(zhpr2) (char *, blasint *, double *, double *, blasint *, double *, blasint *, double *); -void BLASFUNC(xhpr2) (char *, blasint *, xdouble *, +void BLASFUNC(xhpr2) (char *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *); -void BLASFUNC(chemv) (char *, blasint *, float *, float *, blasint *, +void BLASFUNC(chemv) (char *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); -void BLASFUNC(zhemv) (char *, blasint *, double *, double *, blasint *, +void BLASFUNC(zhemv) (char *, blasint *, double *, double *, blasint *, double *, blasint *, double *, double *, blasint *); -void BLASFUNC(xhemv) (char *, blasint *, xdouble *, xdouble *, blasint *, +void BLASFUNC(xhemv) (char *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); void BLASFUNC(chpmv) (char *, blasint *, float *, float *, @@ -427,37 +427,37 @@ int BLASFUNC(dnorm)(char *, blasint *, blasint *, double *, blasint *); int BLASFUNC(cnorm)(char *, blasint *, blasint *, float *, blasint *); int BLASFUNC(znorm)(char *, blasint *, blasint *, double *, blasint *); -void BLASFUNC(sgbmv)(char *, blasint *, blasint *, blasint *, blasint *, float *, float *, blasint *, +void BLASFUNC(sgbmv)(char *, blasint *, blasint *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); -void BLASFUNC(dgbmv)(char *, blasint *, blasint *, blasint *, blasint *, double *, double *, blasint *, +void BLASFUNC(dgbmv)(char *, blasint *, blasint *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *, double *, double *, blasint *); -void BLASFUNC(qgbmv)(char *, blasint *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *, +void BLASFUNC(qgbmv)(char *, blasint *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); -void BLASFUNC(cgbmv)(char *, blasint *, blasint *, blasint *, blasint *, float *, float *, blasint *, +void BLASFUNC(cgbmv)(char *, blasint *, blasint *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); -void BLASFUNC(zgbmv)(char *, blasint *, blasint *, blasint *, blasint *, double *, double *, blasint *, +void BLASFUNC(zgbmv)(char *, blasint *, blasint *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *, double *, double *, blasint *); -void BLASFUNC(xgbmv)(char *, blasint *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *, +void BLASFUNC(xgbmv)(char *, blasint *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); -void BLASFUNC(ssbmv)(char *, blasint *, blasint *, float *, float *, blasint *, +void BLASFUNC(ssbmv)(char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); -void BLASFUNC(dsbmv)(char *, blasint *, blasint *, double *, double *, blasint *, +void BLASFUNC(dsbmv)(char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *, double *, double *, blasint *); -void BLASFUNC(qsbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, +void BLASFUNC(qsbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); -void BLASFUNC(csbmv)(char *, blasint *, blasint *, float *, float *, blasint *, +void BLASFUNC(csbmv)(char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); -void BLASFUNC(zsbmv)(char *, blasint *, blasint *, double *, double *, blasint *, +void BLASFUNC(zsbmv)(char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *, double *, double *, blasint *); -void BLASFUNC(xsbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, +void BLASFUNC(xsbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); -void BLASFUNC(chbmv)(char *, blasint *, blasint *, float *, float *, blasint *, +void BLASFUNC(chbmv)(char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); -void BLASFUNC(zhbmv)(char *, blasint *, blasint *, double *, double *, blasint *, +void BLASFUNC(zhbmv)(char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *, double *, double *, blasint *); -void BLASFUNC(xhbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, +void BLASFUNC(xhbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); /* Level 3 routines */ @@ -606,18 +606,18 @@ int BLASFUNC(sgemt)(char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *); int BLASFUNC(dgemt)(char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *); -int BLASFUNC(cgemt)(char *, blasint *, blasint *, float *, float *, blasint *, +int BLASFUNC(cgemt)(char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *); int BLASFUNC(zgemt)(char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *); -int BLASFUNC(sgema)(char *, char *, blasint *, blasint *, float *, +int BLASFUNC(sgema)(char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint *, float *, blasint *); int BLASFUNC(dgema)(char *, char *, blasint *, blasint *, double *, double *, blasint *, double*, double *, blasint *, double*, blasint *); int BLASFUNC(cgema)(char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint *, float *, blasint *); -int BLASFUNC(zgema)(char *, char *, blasint *, blasint *, double *, +int BLASFUNC(zgema)(char *, char *, blasint *, blasint *, double *, double *, blasint *, double*, double *, blasint *, double*, blasint *); int BLASFUNC(sgems)(char *, char *, blasint *, blasint *, float *, @@ -776,7 +776,7 @@ void BLASFUNC(zimatcopy) (char *, char *, blasint *, blasint *, double *, do #ifdef __cplusplus } - + #endif /* __cplusplus */ #endif diff --git a/common_level1.h b/common_level1.h index a45eec134..2a1b4f1cf 100644 --- a/common_level1.h +++ b/common_level1.h @@ -54,11 +54,11 @@ double _Complex zdotu_k (BLASLONG, double *, BLASLONG, double *, BLASLONG); xdouble _Complex xdotc_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); xdouble _Complex xdotu_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); -int saxpy_k (BLASLONG, BLASLONG, BLASLONG, float, +int saxpy_k (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); -int daxpy_k (BLASLONG, BLASLONG, BLASLONG, double, +int daxpy_k (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); -int qaxpy_k (BLASLONG, BLASLONG, BLASLONG, xdouble, +int qaxpy_k (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int caxpy_k (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -80,11 +80,11 @@ int ccopy_k(BLASLONG, float *, BLASLONG, float *, BLASLONG); int zcopy_k(BLASLONG, double *, BLASLONG, double *, BLASLONG); int xcopy_k(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); -int sswap_k (BLASLONG, BLASLONG, BLASLONG, float, +int sswap_k (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); -int dswap_k (BLASLONG, BLASLONG, BLASLONG, double, +int dswap_k (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double*, BLASLONG); -int qswap_k (BLASLONG, BLASLONG, BLASLONG, xdouble, +int qswap_k (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble*, BLASLONG); int cswap_k (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -156,11 +156,11 @@ BLASLONG icmin_k(BLASLONG, float *, BLASLONG); BLASLONG izmin_k(BLASLONG, double *, BLASLONG); BLASLONG ixmin_k(BLASLONG, xdouble *, BLASLONG); -int sscal_k(BLASLONG, BLASLONG, BLASLONG, float, +int sscal_k(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); -int dscal_k(BLASLONG, BLASLONG, BLASLONG, double, +int dscal_k(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); -int qscal_k(BLASLONG, BLASLONG, BLASLONG, xdouble, +int qscal_k(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int cscal_k(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); diff --git a/common_level2.h b/common_level2.h index 2ab682a02..640d4a073 100644 --- a/common_level2.h +++ b/common_level2.h @@ -986,24 +986,24 @@ int cnorm_t(BLASLONG, BLASLONG, float *a, BLASLONG); int znorm_n(BLASLONG, BLASLONG, double *a, BLASLONG); int znorm_t(BLASLONG, BLASLONG, double *a, BLASLONG); -void sgbmv_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, +void sgbmv_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); void sgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); void dgbmv_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); -void dgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, +void dgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); void qgbmv_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); -void qgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, +void qgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); void cgbmv_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); -void cgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float, +void cgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); void cgbmv_r(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); @@ -1052,24 +1052,24 @@ void xgbmv_s(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, void xgbmv_d(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); -int sgbmv_thread_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, +int sgbmv_thread_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); int sgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); int dgbmv_thread_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); -int dgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, +int dgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); int qgbmv_thread_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); -int qgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, +int qgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); int cgbmv_thread_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); -int cgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *, +int cgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); int cgbmv_thread_r(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); diff --git a/common_level3.h b/common_level3.h index 7e1756e67..0babd45b7 100644 --- a/common_level3.h +++ b/common_level3.h @@ -47,9 +47,9 @@ __global__ void cuda_dgemm_kernel(int, int, int, double *, double *, double *); extern "C" { #endif -int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, +int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); -int dgemm_beta(BLASLONG, BLASLONG, BLASLONG, double, +int dgemm_beta(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int cgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -57,12 +57,12 @@ int zgemm_beta(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); #ifdef EXPRECISION -int qgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble, +int qgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int xgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); #else -int qgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble *, +int qgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int xgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); diff --git a/common_linux.h b/common_linux.h index afc77b4a2..cab5e5f7b 100644 --- a/common_linux.h +++ b/common_linux.h @@ -75,7 +75,7 @@ static inline int my_mbind(void *addr, unsigned long len, int mode, // https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482 return 0; #else -#if defined (LOONGSON3B) +#if defined (LOONGSON3B) #if defined (__64BIT__) return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); #else @@ -99,9 +99,9 @@ static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned #endif } -static inline int my_gettid(void) { +static inline int my_gettid(void) { #ifdef SYS_gettid -return syscall(SYS_gettid); +return syscall(SYS_gettid); #else return getpid(); #endif diff --git a/common_mips64.h b/common_mips64.h index d9cdc498f..aa85ff213 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -13,19 +13,19 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ @@ -111,9 +111,9 @@ static inline unsigned int rpcc(void){ ".set pop": "=r"(ret):: "memory"); #else - __asm__ __volatile__(".set push \n" - ".set mips32r2\n" - "rdhwr %0, $30 \n" + __asm__ __volatile__(".set push \n" + ".set mips32r2\n" + "rdhwr %0, $30 \n" ".set pop" : "=r"(ret) : : "memory"); #endif return ret; @@ -191,13 +191,13 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define CMPEQ c.eq.s #define CMPLE c.le.s #define CMPLT c.lt.s -#define PLU plu.ps -#define PLL pll.ps -#define PUU puu.ps -#define PUL pul.ps -#define MADPS madd.ps -#define CVTU cvt.s.pu -#define CVTL cvt.s.pl +#define PLU plu.ps +#define PLL pll.ps +#define PUU puu.ps +#define PUL pul.ps +#define MADPS madd.ps +#define CVTU cvt.s.pu +#define CVTL cvt.s.pl #define NEG neg.s #endif @@ -279,9 +279,9 @@ REALNAME: ;\ #if defined(LOONGSON3A) || defined(LOONGSON3B) #define PREFETCHD_(x) ld $0, x -#define PREFETCHD(x) PREFETCHD_(x) +#define PREFETCHD(x) PREFETCHD_(x) #else -#define PREFETCHD(x) +#define PREFETCHD(x) #endif #endif diff --git a/common_param.h b/common_param.h index 14dbc7e7d..863216406 100644 --- a/common_param.h +++ b/common_param.h @@ -87,12 +87,12 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*sgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - + int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - + int (*strsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); @@ -114,7 +114,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*strmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - + int (*strmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*strmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*strmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); @@ -131,7 +131,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*strmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*strmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*strmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - + int (*ssymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ssymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ssymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); @@ -176,12 +176,12 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); int (*dgemm_itcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*dgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - + int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - + int (*dtrsm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*dtrsm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*dtrsm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); @@ -203,7 +203,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); int (*dtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); - + int (*dtrmm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*dtrmm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*dtrmm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); @@ -220,7 +220,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); int (*dtrmm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*dtrmm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*dtrmm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - + int (*dsymm_iutcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*dsymm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*dsymm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); @@ -267,12 +267,12 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); int (*qgemm_itcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*qgemm_oncopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*qgemm_otcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - + int (*qtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*qtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*qtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*qtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - + int (*qtrsm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*qtrsm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*qtrsm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); @@ -294,7 +294,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); int (*qtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*qtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*qtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - + int (*qtrmm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*qtrmm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*qtrmm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); @@ -311,7 +311,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); int (*qtrmm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*qtrmm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*qtrmm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - + int (*qsymm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*qsymm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*qsymm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); @@ -372,7 +372,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*cgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - + int (*ctrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -381,7 +381,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*ctrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - + int (*ctrsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); @@ -407,7 +407,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*ctrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); - + int (*ctrmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ctrmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ctrmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); @@ -424,7 +424,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*ctrmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ctrmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ctrmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - + int (*csymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*csymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*csymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); @@ -443,7 +443,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*cgemm3m_itcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm3m_itcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm3m_itcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float *); - + int (*cgemm3m_oncopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); int (*cgemm3m_oncopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); int (*cgemm3m_oncopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); @@ -457,21 +457,21 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*csymm3m_ilcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*csymm3m_iucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*csymm3m_ilcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - + int (*csymm3m_oucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*csymm3m_olcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*csymm3m_oucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*csymm3m_olcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*csymm3m_oucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*csymm3m_olcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - + int (*chemm3m_iucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*chemm3m_ilcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*chemm3m_iucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*chemm3m_ilcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*chemm3m_iucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*chemm3m_ilcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - + int (*chemm3m_oucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*chemm3m_olcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*chemm3m_oucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); @@ -532,7 +532,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); int (*zgemm_itcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*zgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*zgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - + int (*ztrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); @@ -541,7 +541,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); int (*ztrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - + int (*ztrsm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*ztrsm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); int (*ztrsm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); @@ -567,7 +567,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); int (*ztrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); - + int (*ztrmm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*ztrmm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*ztrmm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); @@ -584,7 +584,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); int (*ztrmm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*ztrmm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*ztrmm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - + int (*zsymm_iutcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zsymm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zsymm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); @@ -603,7 +603,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); int (*zgemm3m_itcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*zgemm3m_itcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*zgemm3m_itcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double *); - + int (*zgemm3m_oncopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); int (*zgemm3m_oncopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); int (*zgemm3m_oncopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); @@ -617,28 +617,28 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); int (*zsymm3m_ilcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zsymm3m_iucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zsymm3m_ilcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - + int (*zsymm3m_oucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); int (*zsymm3m_olcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); int (*zsymm3m_oucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); int (*zsymm3m_olcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); int (*zsymm3m_oucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); int (*zsymm3m_olcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - + int (*zhemm3m_iucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zhemm3m_ilcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zhemm3m_iucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zhemm3m_ilcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zhemm3m_iucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*zhemm3m_ilcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); - + int (*zhemm3m_oucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); int (*zhemm3m_olcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); int (*zhemm3m_oucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); int (*zhemm3m_olcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); int (*zhemm3m_oucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); int (*zhemm3m_olcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); - + int (*zneg_tcopy) (BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*zlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); @@ -694,7 +694,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*xgemm_itcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xgemm_oncopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xgemm_otcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - + int (*xtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*xtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*xtrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); @@ -703,7 +703,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*xtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*xtrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*xtrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - + int (*xtrsm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*xtrsm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); int (*xtrsm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); @@ -729,7 +729,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*xtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*xtrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); int (*xtrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); - + int (*xtrmm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xtrmm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xtrmm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); @@ -746,7 +746,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*xtrmm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xtrmm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xtrmm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - + int (*xsymm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xsymm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xsymm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); @@ -765,7 +765,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*xgemm3m_itcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xgemm3m_itcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*xgemm3m_itcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); - + int (*xgemm3m_oncopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); int (*xgemm3m_oncopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); int (*xgemm3m_oncopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); @@ -779,21 +779,21 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*xsymm3m_ilcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xsymm3m_iucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xsymm3m_ilcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - + int (*xsymm3m_oucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); int (*xsymm3m_olcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); int (*xsymm3m_oucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); int (*xsymm3m_olcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); int (*xsymm3m_oucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); int (*xsymm3m_olcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); - + int (*xhemm3m_iucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xhemm3m_ilcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xhemm3m_iucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xhemm3m_ilcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xhemm3m_iucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); int (*xhemm3m_ilcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); - + int (*xhemm3m_oucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); int (*xhemm3m_olcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); int (*xhemm3m_oucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); @@ -902,7 +902,7 @@ extern gotoblas_t *gotoblas; #else -#define DTB_ENTRIES DTB_DEFAULT_ENTRIES +#define DTB_ENTRIES DTB_DEFAULT_ENTRIES #define GEMM_OFFSET_A GEMM_DEFAULT_OFFSET_A #define GEMM_OFFSET_B GEMM_DEFAULT_OFFSET_B diff --git a/common_power.h b/common_power.h index 34a61539d..f88f527bd 100644 --- a/common_power.h +++ b/common_power.h @@ -114,7 +114,7 @@ static inline unsigned long getstackaddr(void){ __asm__ __volatile__ ("mr %0, 1" : "=r"(addr) : : "memory"); - return addr; + return addr; }; #if defined(OS_LINUX) || defined(OS_AIX) diff --git a/common_reference.h b/common_reference.h index be151e0d6..75bae1faa 100644 --- a/common_reference.h +++ b/common_reference.h @@ -13,19 +13,19 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ @@ -65,5 +65,5 @@ void BLASFUNC_REF(drotmg)(double *, double *, double *, double *, double *); double BLASFUNC_REF(dsdot)(blasint *, float *, blasint *, float *, blasint*); FLOATRET BLASFUNC_REF(samax) (blasint *, float *, blasint *); - + #endif diff --git a/common_sparc.h b/common_sparc.h index daa2e49b0..87ef75276 100644 --- a/common_sparc.h +++ b/common_sparc.h @@ -130,7 +130,7 @@ static __inline int blas_quickdivide(blasint x, blasint y){ #define FSQRT fsqrts #define FDIV fdivs #endif - + #define HALT prefetch [%g0], 5 #define FMADDS(rs1, rs2, rs3, rd) \ @@ -170,19 +170,19 @@ static __inline int blas_quickdivide(blasint x, blasint y){ .word ((2 << 30) | ((rd) << 25) | ( 0x36 << 19) | ( 0x7e << 5)) #ifndef DOUBLE -#define FCLR(a) FCLRS(a) -#define FONE(a) FONES(a) -#define FMADD(a, b, c, d) FMADDS(a, b, c, d) -#define FMSUB(a, b, c, d) FMSUBS(a, b, c, d) -#define FNMADD(a, b, c, d) FNMADDS(a, b, c, d) -#define FNMSUB(a, b, c, d) FNMSUBS(a, b, c, d) +#define FCLR(a) FCLRS(a) +#define FONE(a) FONES(a) +#define FMADD(a, b, c, d) FMADDS(a, b, c, d) +#define FMSUB(a, b, c, d) FMSUBS(a, b, c, d) +#define FNMADD(a, b, c, d) FNMADDS(a, b, c, d) +#define FNMSUB(a, b, c, d) FNMSUBS(a, b, c, d) #else -#define FCLR(a) FCLRD(a) -#define FONE(a) FONED(a) -#define FMADD(a, b, c, d) FMADDD(a, b, c, d) -#define FMSUB(a, b, c, d) FMSUBD(a, b, c, d) -#define FNMADD(a, b, c, d) FNMADDD(a, b, c, d) -#define FNMSUB(a, b, c, d) FNMSUBD(a, b, c, d) +#define FCLR(a) FCLRD(a) +#define FONE(a) FONED(a) +#define FMADD(a, b, c, d) FMADDD(a, b, c, d) +#define FMSUB(a, b, c, d) FMSUBD(a, b, c, d) +#define FNMADD(a, b, c, d) FNMADDD(a, b, c, d) +#define FNMSUB(a, b, c, d) FNMSUBD(a, b, c, d) #endif #ifndef F_INTERFACE diff --git a/common_thread.h b/common_thread.h index ad386a440..bd964445e 100644 --- a/common_thread.h +++ b/common_thread.h @@ -176,7 +176,7 @@ int exec_blas(BLASLONG num_cpu, blas_param_t *param, void *buffer); int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, - void *b, BLASLONG ldb, + void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int threads); int gemm_thread_m (int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG); @@ -187,14 +187,14 @@ int gemm_thread_mn(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*functio int gemm_thread_variable(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG, BLASLONG); -int trsm_thread(int mode, BLASLONG m, BLASLONG n, +int trsm_thread(int mode, BLASLONG m, BLASLONG n, double alpha_r, double alpha_i, void *a, BLASLONG lda, void *c, BLASLONG ldc, int (*function)(), void *buffer); int syrk_thread(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG); -int beta_thread(int mode, BLASLONG m, BLASLONG n, +int beta_thread(int mode, BLASLONG m, BLASLONG n, double alpha_r, double alpha_i, void *c, BLASLONG ldc, int (*fuction)()); diff --git a/common_x86.h b/common_x86.h index 5f42843be..f97fd348a 100644 --- a/common_x86.h +++ b/common_x86.h @@ -55,7 +55,7 @@ static void __inline blas_lock(volatile BLASULONG *address){ do { while (*address) {YIELDING;}; - + __asm__ __volatile__( "xchgl %0, %1\n" : "=r"(ret), "=m"(*address) @@ -70,8 +70,8 @@ static __inline unsigned long long rpcc(void){ unsigned int a, d; __asm__ __volatile__ ("rdtsc" : "=a" (a), "=d" (d)); - - return ((unsigned long long)a + ((unsigned long long)d << 32)); + + return ((unsigned long long)a + ((unsigned long long)d << 32)); }; static __inline unsigned long getstackaddr(void){ @@ -80,7 +80,7 @@ static __inline unsigned long getstackaddr(void){ __asm__ __volatile__ ("mov %%esp, %0" : "=r"(addr) : : "memory"); - return addr; + return addr; }; @@ -365,9 +365,9 @@ REALNAME: #ifndef ALIGN_6 #define ALIGN_6 .align 64 #endif -// ffreep %st(0). +// ffreep %st(0). // Because Clang didn't support ffreep, we directly use the opcode. -// Please check out http://www.sandpile.org/x86/opc_fpu.htm +// Please check out http://www.sandpile.org/x86/opc_fpu.htm #ifndef ffreep #define ffreep .byte 0xdf, 0xc0 # #endif diff --git a/common_x86_64.h b/common_x86_64.h index 39e5a5eb1..0f842ee94 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -60,7 +60,7 @@ static void __inline blas_lock(volatile BLASULONG *address){ do { while (*address) {YIELDING;}; - + __asm__ __volatile__( "xchgl %0, %1\n" : "=r"(ret), "=m"(*address) @@ -74,8 +74,8 @@ static __inline BLASULONG rpcc(void){ BLASULONG a, d; __asm__ __volatile__ ("rdtsc" : "=a" (a), "=d" (d)); - - return ((BLASULONG)a + ((BLASULONG)d << 32)); + + return ((BLASULONG)a + ((BLASULONG)d << 32)); } #define RPCC64BIT @@ -86,7 +86,7 @@ static __inline BLASULONG getstackaddr(void){ __asm__ __volatile__ ("movq %%rsp, %0" : "=r"(addr) : : "memory"); - return addr; + return addr; } static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ @@ -457,9 +457,9 @@ REALNAME: #define ALIGN_6 .align 64 #endif -// ffreep %st(0). +// ffreep %st(0). // Because Clang didn't support ffreep, we directly use the opcode. -// Please check out http://www.sandpile.org/x86/opc_fpu.htm +// Please check out http://www.sandpile.org/x86/opc_fpu.htm #ifndef ffreep #define ffreep .byte 0xdf, 0xc0 # #endif diff --git a/cpuid.S b/cpuid.S index 3f7bf5f90..851fe34d2 100644 --- a/cpuid.S +++ b/cpuid.S @@ -39,10 +39,10 @@ #if defined(__APPLE__) && defined(__i386__) /* Quick hack for Darwin/x86 */ - + .text .globl _cpuid -_cpuid: +_cpuid: pushl %esi pushl %ebx diff --git a/cpuid_alpha.c b/cpuid_alpha.c index adcc314c3..58dccdefc 100644 --- a/cpuid_alpha.c +++ b/cpuid_alpha.c @@ -50,7 +50,7 @@ int implver(void){ #endif return arch; } - + void get_architecture(void){ printf("ALPHA"); } @@ -67,7 +67,7 @@ void get_cpuconfig(void){ printf("#define EV%d\n", implver() + 4); switch (implver()){ - case 0: + case 0: printf("#define L1_DATA_SIZE 16384\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 2097152\n"); @@ -76,7 +76,7 @@ void get_cpuconfig(void){ printf("#define DTB_SIZE 8192\n"); break; - case 1: + case 1: printf("#define L1_DATA_SIZE 16384\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 2097152\n"); @@ -85,7 +85,7 @@ void get_cpuconfig(void){ printf("#define DTB_SIZE 8192\n"); break; - case 2: + case 2: printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_LINESIZE 64\n"); printf("#define L2_SIZE 4194304\n"); diff --git a/cpuid_arm.c b/cpuid_arm.c index efd1369b4..809ef3d3a 100644 --- a/cpuid_arm.c +++ b/cpuid_arm.c @@ -67,7 +67,7 @@ int get_feature(char *search) t = strtok(p," "); while( t = strtok(NULL," ")) - { + { if (!strcmp(t, search)) { return(1); } } @@ -102,7 +102,7 @@ int detect(void) if(p != NULL) { - if (strstr(p, "ARMv7")) + if (strstr(p, "ARMv7")) { if ( get_feature("vfpv4")) return CPU_ARMV7; @@ -116,7 +116,7 @@ int detect(void) } - if (strstr(p, "ARMv6")) + if (strstr(p, "ARMv6")) { if ( get_feature("vfp")) return CPU_ARMV6; @@ -248,7 +248,7 @@ void get_features(void) t = strtok(p," "); while( t = strtok(NULL," ")) - { + { if (!strcmp(t, "vfp")) { printf("HAVE_VFP=1\n"); continue; } if (!strcmp(t, "vfpv3")) { printf("HAVE_VFPV3=1\n"); continue; } if (!strcmp(t, "vfpv4")) { printf("HAVE_VFPV4=1\n"); continue; } diff --git a/cpuid_ia64.c b/cpuid_ia64.c index d372182a7..e7e200c75 100644 --- a/cpuid_ia64.c +++ b/cpuid_ia64.c @@ -45,7 +45,7 @@ #include #endif -static inline unsigned long cpuid(unsigned long regnum){ +static inline unsigned long cpuid(unsigned long regnum){ unsigned long value; #ifdef __ECC @@ -65,7 +65,7 @@ int get_vendor(void){ cpuid0 = cpuid(0); cpuid1 = cpuid(1); - + *(unsigned long *)(&vendor[0]) = cpuid0; *(unsigned long *)(&vendor[8]) = cpuid1; vendor[17] = (char)0; @@ -79,7 +79,7 @@ int get_cputype(int gettype){ unsigned long cpuid3; cpuid3 = cpuid(3); - + switch (gettype) { case GET_ARCHREV : return BITMASK(cpuid3, 32, 0xff); diff --git a/cpuid_mips.c b/cpuid_mips.c index 45171da5e..fad105747 100644 --- a/cpuid_mips.c +++ b/cpuid_mips.c @@ -13,19 +13,19 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ diff --git a/cpuid_power.c b/cpuid_power.c index 9fd9ec9f4..2fc333dd2 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -134,7 +134,7 @@ int detect(void){ if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7450) return CPUTYPE_PPCG4; if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_970) return CPUTYPE_PPC970; - + return CPUTYPE_PPC970; #endif } diff --git a/cpuid_x86.c b/cpuid_x86.c index a15bb11a3..b7355df92 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -93,7 +93,7 @@ void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int * if ((current < start) || (current > stop)) current = start; while ((count > 0) && (idlist[current].id != op)) { - + current ++; if (current > stop) current = start; count --; @@ -134,7 +134,7 @@ int support_avx(){ #ifndef NO_AVX int eax, ebx, ecx, edx; int ret=0; - + cpuid(1, &eax, &ebx, &ecx, &edx); if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){ xgetbv(0, &eax, &edx); @@ -154,7 +154,7 @@ int get_vendor(void){ char vendor[13]; cpuid(0, &eax, &ebx, &ecx, &edx); - + *(int *)(&vendor[0]) = ebx; *(int *)(&vendor[4]) = edx; *(int *)(&vendor[8]) = ecx; @@ -175,7 +175,7 @@ int get_vendor(void){ return VENDOR_UNKNOWN; } - + int get_cputype(int gettype){ int eax, ebx, ecx, edx; int extend_family, family; @@ -184,7 +184,7 @@ int get_cputype(int gettype){ int feature = 0; cpuid(1, &eax, &ebx, &ecx, &edx); - + switch (gettype) { case GET_EXFAMILY : return BITMASK(eax, 20, 0xff); @@ -254,12 +254,12 @@ int get_cputype(int gettype){ } return feature; } - + int get_cacheinfo(int type, cache_info_t *cacheinfo){ int eax, ebx, ecx, edx, cpuid_level; int info[15]; int i; - cache_info_t LC1, LD1, L2, L3, + cache_info_t LC1, LD1, L2, L3, ITB, DTB, LITB, LDTB, L2ITB, L2DTB, L2LITB, L2LDTB; @@ -285,22 +285,22 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ info[ 0] = BITMASK(eax, 8, 0xff); info[ 1] = BITMASK(eax, 16, 0xff); info[ 2] = BITMASK(eax, 24, 0xff); - + info[ 3] = BITMASK(ebx, 0, 0xff); info[ 4] = BITMASK(ebx, 8, 0xff); info[ 5] = BITMASK(ebx, 16, 0xff); info[ 6] = BITMASK(ebx, 24, 0xff); - + info[ 7] = BITMASK(ecx, 0, 0xff); info[ 8] = BITMASK(ecx, 8, 0xff); info[ 9] = BITMASK(ecx, 16, 0xff); info[10] = BITMASK(ecx, 24, 0xff); - + info[11] = BITMASK(edx, 0, 0xff); info[12] = BITMASK(edx, 8, 0xff); info[13] = BITMASK(edx, 16, 0xff); info[14] = BITMASK(edx, 24, 0xff); - + for (i = 0; i < 15; i++){ switch (info[i]){ @@ -866,7 +866,7 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ LITB.associative = BITMASK(eax, 8, 0xff); if (LITB.associative == 0xff) LITB.associative = 0; LITB.linesize = BITMASK(eax, 0, 0xff); - + DTB.size = 4; DTB.associative = BITMASK(ebx, 24, 0xff); if (DTB.associative == 0xff) DTB.associative = 0; @@ -898,7 +898,7 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ L2LITB.associative = BITMASK(eax, 8, 0xff); if (L2LITB.associative == 0xff) L2LITB.associative = 0; L2LITB.linesize = BITMASK(eax, 0, 0xff); - + L2DTB.size = 4; L2DTB.associative = BITMASK(ebx, 24, 0xff); if (L2DTB.associative == 0xff) L2DTB.associative = 0; @@ -922,7 +922,7 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ } switch (type) { - + case CACHE_INFO_L1_I : *cacheinfo = LC1; break; @@ -984,7 +984,7 @@ int get_cpuname(void){ return CPUTYPE_PENTIUM; case 0x6: switch (exmodel) { - case 0: + case 0: switch (model) { case 1: case 3: @@ -1024,8 +1024,8 @@ int get_cpuname(void){ case 2: switch (model) { case 5: - //Intel Core (Clarkdale) / Core (Arrandale) - // Pentium (Clarkdale) / Pentium Mobile (Arrandale) + //Intel Core (Clarkdale) / Core (Arrandale) + // Pentium (Clarkdale) / Pentium Mobile (Arrandale) // Xeon (Clarkdale), 32nm return CPUTYPE_NEHALEM; case 10: @@ -1076,7 +1076,7 @@ int get_cpuname(void){ else return CPUTYPE_NEHALEM; } - break; + break; } break; case 0x7: @@ -1121,7 +1121,7 @@ int get_cpuname(void){ if(support_avx()) return CPUTYPE_PILEDRIVER; else - return CPUTYPE_BARCELONA; //OS don't support AVX. + return CPUTYPE_BARCELONA; //OS don't support AVX. } break; case 5: @@ -1305,7 +1305,7 @@ static char *lowercpuname[] = { static char *corename[] = { "UNKOWN", - "80486", + "80486", "P5", "P6", "KATMAI", @@ -1333,7 +1333,7 @@ static char *corename[] = { static char *corename_lower[] = { "unknown", - "80486", + "80486", "p5", "p6", "katmai", @@ -1434,8 +1434,8 @@ int get_coretype(void){ case 2: switch (model) { case 5: - //Intel Core (Clarkdale) / Core (Arrandale) - // Pentium (Clarkdale) / Pentium Mobile (Arrandale) + //Intel Core (Clarkdale) / Core (Arrandale) + // Pentium (Clarkdale) / Pentium Mobile (Arrandale) // Xeon (Clarkdale), 32nm return CORE_NEHALEM; case 10: @@ -1485,7 +1485,7 @@ int get_coretype(void){ else return CORE_NEHALEM; } - break; + break; } break; @@ -1499,8 +1499,8 @@ int get_coretype(void){ if (family <= 0x5) return CORE_80486; if (family <= 0xe) return CORE_ATHLON; if (family == 0xf){ - if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; - else if (exfamily == 5) return CORE_BOBCAT; + if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; + else if (exfamily == 5) return CORE_BOBCAT; else if (exfamily == 6) { switch (model) { case 1: @@ -1513,7 +1513,7 @@ int get_coretype(void){ if(support_avx()) return CORE_PILEDRIVER; else - return CORE_BARCELONA; //OS don't support AVX. + return CORE_BARCELONA; //OS don't support AVX. } }else return CORE_BARCELONA; } @@ -1547,14 +1547,14 @@ void get_cpuconfig(void){ printf("#define L1_CODE_ASSOCIATIVE %d\n", info.associative); printf("#define L1_CODE_LINESIZE %d\n", info.linesize); } - + get_cacheinfo(CACHE_INFO_L1_D, &info); if (info.size > 0) { printf("#define L1_DATA_SIZE %d\n", info.size * 1024); printf("#define L1_DATA_ASSOCIATIVE %d\n", info.associative); printf("#define L1_DATA_LINESIZE %d\n", info.linesize); } - + get_cacheinfo(CACHE_INFO_L2, &info); if (info.size > 0) { printf("#define L2_SIZE %d\n", info.size * 1024); @@ -1567,21 +1567,21 @@ void get_cpuconfig(void){ printf("#define L2_LINESIZE 64\n"); } - + get_cacheinfo(CACHE_INFO_L3, &info); if (info.size > 0) { printf("#define L3_SIZE %d\n", info.size * 1024); printf("#define L3_ASSOCIATIVE %d\n", info.associative); printf("#define L3_LINESIZE %d\n", info.linesize); } - + get_cacheinfo(CACHE_INFO_L1_ITB, &info); if (info.size > 0) { printf("#define ITB_SIZE %d\n", info.size * 1024); printf("#define ITB_ASSOCIATIVE %d\n", info.associative); printf("#define ITB_ENTRIES %d\n", info.linesize); } - + get_cacheinfo(CACHE_INFO_L1_DTB, &info); if (info.size > 0) { printf("#define DTB_SIZE %d\n", info.size * 1024); @@ -1591,7 +1591,7 @@ void get_cpuconfig(void){ //fall back for some virtual machines. printf("#define DTB_DEFAULT_ENTRIES 32\n"); } - + features = get_cputype(GET_FEATURE); if (features & HAVE_CMOV ) printf("#define HAVE_CMOV\n"); @@ -1614,7 +1614,7 @@ void get_cpuconfig(void){ if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n"); if (features & HAVE_128BITFPU) printf("#define HAVE_128BITFPU\n"); if (features & HAVE_FASTMOVU) printf("#define HAVE_FASTMOVU\n"); - + printf("#define NUM_SHAREDCACHE %d\n", get_cputype(GET_NUMSHARE) + 1); printf("#define NUM_CORES %d\n", get_cputype(GET_NUMCORES) + 1); diff --git a/ctest/Makefile b/ctest/Makefile index 099116895..70d3f9712 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -1,5 +1,5 @@ # -# The Makefile compiles c wrappers and testers for CBLAS. +# The Makefile compiles c wrappers and testers for CBLAS. # TOPDIR = .. @@ -27,13 +27,13 @@ ctestl2o = c_cblas2.o c_c2chke.o auxiliary.o c_xerbla.o constant.o ctestl3o = c_cblas3.o c_c3chke.o auxiliary.o c_xerbla.o constant.o -ztestl1o = c_zblas1.o +ztestl1o = c_zblas1.o ztestl2o = c_zblas2.o c_z2chke.o auxiliary.o c_xerbla.o constant.o ztestl3o = c_zblas3.o c_z3chke.o auxiliary.o c_xerbla.o constant.o -all :: all1 all2 all3 +all :: all1 all2 all3 all1: xscblat1 xdcblat1 xccblat1 xzcblat1 ifeq ($(USE_OPENMP), 1) @@ -75,10 +75,10 @@ else endif clean :: - rm -f x* + rm -f x* FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) -CEXTRALIB = +CEXTRALIB = # Single real xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME) @@ -94,7 +94,7 @@ xdcblat2: $(dtestl2o) c_dblat2.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xdcblat2 c_dblat2.o $(dtestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) xdcblat3: $(dtestl3o) c_dblat3.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xdcblat3 c_dblat3.o $(dtestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) - + # Single complex xccblat1: $(ctestl1o) c_cblat1.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xccblat1 c_cblat1.o $(ctestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) @@ -103,12 +103,12 @@ xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME) xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) -# Double complex +# Double complex xzcblat1: $(ztestl1o) c_zblat1.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xzcblat1 c_zblat1.o $(ztestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) - + include $(TOPDIR)/Makefile.tail diff --git a/ctest/c_c2chke.c b/ctest/c_c2chke.c index 611cc215d..eb5b99008 100644 --- a/ctest/c_c2chke.c +++ b/ctest/c_c2chke.c @@ -26,11 +26,11 @@ void chkxer(void) { void F77_c2chke(char *rout) { char *sf = ( rout ) ; - float A[2] = {0.0,0.0}, - X[2] = {0.0,0.0}, - Y[2] = {0.0,0.0}, + float A[2] = {0.0,0.0}, + X[2] = {0.0,0.0}, + Y[2] = {0.0,0.0}, ALPHA[2] = {0.0,0.0}, - BETA[2] = {0.0,0.0}, + BETA[2] = {0.0,0.0}, RALPHA = 0.0; extern int cblas_info, cblas_lerr, cblas_ok; extern int RowMajorStrg; @@ -48,588 +48,588 @@ void F77_c2chke(char *rout) { if (strncmp( sf,"cblas_cgemv",11)==0) { cblas_rout = "cblas_cgemv"; cblas_info = 1; - cblas_cgemv(INVALID, CblasNoTrans, 0, 0, + cblas_cgemv(INVALID, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_cgemv(CblasColMajor, INVALID, 0, 0, + cblas_cgemv(CblasColMajor, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_cgemv(CblasColMajor, CblasNoTrans, INVALID, 0, + cblas_cgemv(CblasColMajor, CblasNoTrans, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; - cblas_cgemv(CblasColMajor, CblasNoTrans, 0, INVALID, + cblas_cgemv(CblasColMajor, CblasNoTrans, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; - cblas_cgemv(CblasColMajor, CblasNoTrans, 2, 0, + cblas_cgemv(CblasColMajor, CblasNoTrans, 2, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; - cblas_cgemv(CblasColMajor, CblasNoTrans, 0, 0, + cblas_cgemv(CblasColMajor, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; - cblas_cgemv(CblasColMajor, CblasNoTrans, 0, 0, + cblas_cgemv(CblasColMajor, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; RowMajorStrg = TRUE; - cblas_cgemv(CblasRowMajor, INVALID, 0, 0, + cblas_cgemv(CblasRowMajor, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_cgemv(CblasRowMajor, CblasNoTrans, INVALID, 0, + cblas_cgemv(CblasRowMajor, CblasNoTrans, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; - cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, INVALID, + cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; - cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, 2, + cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; - cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, 0, + cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; - cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, 0, + cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_cgbmv",11)==0) { cblas_rout = "cblas_cgbmv"; cblas_info = 1; RowMajorStrg = FALSE; - cblas_cgbmv(INVALID, CblasNoTrans, 0, 0, 0, 0, + cblas_cgbmv(INVALID, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_cgbmv(CblasColMajor, INVALID, 0, 0, 0, 0, + cblas_cgbmv(CblasColMajor, INVALID, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_cgbmv(CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + cblas_cgbmv(CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; - cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, INVALID, 0, 0, + cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; - cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, INVALID, 0, + cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; - cblas_cgbmv(CblasColMajor, CblasNoTrans, 2, 0, 0, INVALID, + cblas_cgbmv(CblasColMajor, CblasNoTrans, 2, 0, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; - cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, 1, 0, + cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, 1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; - cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, + cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; - cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, + cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; - cblas_cgbmv(CblasRowMajor, INVALID, 0, 0, 0, 0, + cblas_cgbmv(CblasRowMajor, INVALID, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_cgbmv(CblasRowMajor, CblasNoTrans, INVALID, 0, 0, 0, + cblas_cgbmv(CblasRowMajor, CblasNoTrans, INVALID, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; - cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, INVALID, 0, 0, + cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; - cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, INVALID, 0, + cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; - cblas_cgbmv(CblasRowMajor, CblasNoTrans, 2, 0, 0, INVALID, + cblas_cgbmv(CblasRowMajor, CblasNoTrans, 2, 0, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; - cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 1, 0, + cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; - cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, + cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; - cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, + cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_chemv",11)==0) { cblas_rout = "cblas_chemv"; cblas_info = 1; RowMajorStrg = FALSE; - cblas_chemv(INVALID, CblasUpper, 0, + cblas_chemv(INVALID, CblasUpper, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_chemv(CblasColMajor, INVALID, 0, + cblas_chemv(CblasColMajor, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_chemv(CblasColMajor, CblasUpper, INVALID, + cblas_chemv(CblasColMajor, CblasUpper, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; - cblas_chemv(CblasColMajor, CblasUpper, 2, + cblas_chemv(CblasColMajor, CblasUpper, 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; - cblas_chemv(CblasColMajor, CblasUpper, 0, + cblas_chemv(CblasColMajor, CblasUpper, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; - cblas_chemv(CblasColMajor, CblasUpper, 0, + cblas_chemv(CblasColMajor, CblasUpper, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; - cblas_chemv(CblasRowMajor, INVALID, 0, + cblas_chemv(CblasRowMajor, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_chemv(CblasRowMajor, CblasUpper, INVALID, + cblas_chemv(CblasRowMajor, CblasUpper, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; - cblas_chemv(CblasRowMajor, CblasUpper, 2, + cblas_chemv(CblasRowMajor, CblasUpper, 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; - cblas_chemv(CblasRowMajor, CblasUpper, 0, + cblas_chemv(CblasRowMajor, CblasUpper, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; - cblas_chemv(CblasRowMajor, CblasUpper, 0, + cblas_chemv(CblasRowMajor, CblasUpper, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_chbmv",11)==0) { cblas_rout = "cblas_chbmv"; cblas_info = 1; RowMajorStrg = FALSE; - cblas_chbmv(INVALID, CblasUpper, 0, 0, + cblas_chbmv(INVALID, CblasUpper, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_chbmv(CblasColMajor, INVALID, 0, 0, + cblas_chbmv(CblasColMajor, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_chbmv(CblasColMajor, CblasUpper, INVALID, 0, + cblas_chbmv(CblasColMajor, CblasUpper, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; - cblas_chbmv(CblasColMajor, CblasUpper, 0, INVALID, + cblas_chbmv(CblasColMajor, CblasUpper, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; - cblas_chbmv(CblasColMajor, CblasUpper, 0, 1, + cblas_chbmv(CblasColMajor, CblasUpper, 0, 1, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; - cblas_chbmv(CblasColMajor, CblasUpper, 0, 0, + cblas_chbmv(CblasColMajor, CblasUpper, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; - cblas_chbmv(CblasColMajor, CblasUpper, 0, 0, + cblas_chbmv(CblasColMajor, CblasUpper, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; - cblas_chbmv(CblasRowMajor, INVALID, 0, 0, + cblas_chbmv(CblasRowMajor, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_chbmv(CblasRowMajor, CblasUpper, INVALID, 0, + cblas_chbmv(CblasRowMajor, CblasUpper, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; - cblas_chbmv(CblasRowMajor, CblasUpper, 0, INVALID, + cblas_chbmv(CblasRowMajor, CblasUpper, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; - cblas_chbmv(CblasRowMajor, CblasUpper, 0, 1, + cblas_chbmv(CblasRowMajor, CblasUpper, 0, 1, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; - cblas_chbmv(CblasRowMajor, CblasUpper, 0, 0, + cblas_chbmv(CblasRowMajor, CblasUpper, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; - cblas_chbmv(CblasRowMajor, CblasUpper, 0, 0, + cblas_chbmv(CblasRowMajor, CblasUpper, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_chpmv",11)==0) { cblas_rout = "cblas_chpmv"; cblas_info = 1; RowMajorStrg = FALSE; - cblas_chpmv(INVALID, CblasUpper, 0, + cblas_chpmv(INVALID, CblasUpper, 0, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_chpmv(CblasColMajor, INVALID, 0, + cblas_chpmv(CblasColMajor, INVALID, 0, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_chpmv(CblasColMajor, CblasUpper, INVALID, + cblas_chpmv(CblasColMajor, CblasUpper, INVALID, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; - cblas_chpmv(CblasColMajor, CblasUpper, 0, + cblas_chpmv(CblasColMajor, CblasUpper, 0, ALPHA, A, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; - cblas_chpmv(CblasColMajor, CblasUpper, 0, + cblas_chpmv(CblasColMajor, CblasUpper, 0, ALPHA, A, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; - cblas_chpmv(CblasRowMajor, INVALID, 0, + cblas_chpmv(CblasRowMajor, INVALID, 0, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_chpmv(CblasRowMajor, CblasUpper, INVALID, + cblas_chpmv(CblasRowMajor, CblasUpper, INVALID, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; - cblas_chpmv(CblasRowMajor, CblasUpper, 0, + cblas_chpmv(CblasRowMajor, CblasUpper, 0, ALPHA, A, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; - cblas_chpmv(CblasRowMajor, CblasUpper, 0, + cblas_chpmv(CblasRowMajor, CblasUpper, 0, ALPHA, A, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_ctrmv",11)==0) { cblas_rout = "cblas_ctrmv"; cblas_info = 1; RowMajorStrg = FALSE; - cblas_ctrmv(INVALID, CblasUpper, CblasNoTrans, + cblas_ctrmv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_ctrmv(CblasColMajor, INVALID, CblasNoTrans, + cblas_ctrmv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_ctrmv(CblasColMajor, CblasUpper, INVALID, + cblas_ctrmv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; - cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; - cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; - cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 2, A, 1, X, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; - cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; - cblas_ctrmv(CblasRowMajor, INVALID, CblasNoTrans, + cblas_ctrmv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_ctrmv(CblasRowMajor, CblasUpper, INVALID, + cblas_ctrmv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; - cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; - cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; - cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 2, A, 1, X, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; - cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_ctbmv",11)==0) { cblas_rout = "cblas_ctbmv"; cblas_info = 1; RowMajorStrg = FALSE; - cblas_ctbmv(INVALID, CblasUpper, CblasNoTrans, + cblas_ctbmv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_ctbmv(CblasColMajor, INVALID, CblasNoTrans, + cblas_ctbmv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_ctbmv(CblasColMajor, CblasUpper, INVALID, + cblas_ctbmv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; - cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; - cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; - cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; - cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 1, A, 1, X, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; - cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; - cblas_ctbmv(CblasRowMajor, INVALID, CblasNoTrans, + cblas_ctbmv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_ctbmv(CblasRowMajor, CblasUpper, INVALID, + cblas_ctbmv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; - cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; - cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; - cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; - cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 1, A, 1, X, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; - cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_ctpmv",11)==0) { cblas_rout = "cblas_ctpmv"; cblas_info = 1; RowMajorStrg = FALSE; - cblas_ctpmv(INVALID, CblasUpper, CblasNoTrans, + cblas_ctpmv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_ctpmv(CblasColMajor, INVALID, CblasNoTrans, + cblas_ctpmv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_ctpmv(CblasColMajor, CblasUpper, INVALID, + cblas_ctpmv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; - cblas_ctpmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ctpmv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; - cblas_ctpmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ctpmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; - cblas_ctpmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ctpmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; - cblas_ctpmv(CblasRowMajor, INVALID, CblasNoTrans, + cblas_ctpmv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_ctpmv(CblasRowMajor, CblasUpper, INVALID, + cblas_ctpmv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; - cblas_ctpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ctpmv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; - cblas_ctpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ctpmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; - cblas_ctpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ctpmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_ctrsv",11)==0) { cblas_rout = "cblas_ctrsv"; cblas_info = 1; RowMajorStrg = FALSE; - cblas_ctrsv(INVALID, CblasUpper, CblasNoTrans, + cblas_ctrsv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_ctrsv(CblasColMajor, INVALID, CblasNoTrans, + cblas_ctrsv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_ctrsv(CblasColMajor, CblasUpper, INVALID, + cblas_ctrsv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; - cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; - cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; - cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 2, A, 1, X, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; - cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; - cblas_ctrsv(CblasRowMajor, INVALID, CblasNoTrans, + cblas_ctrsv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_ctrsv(CblasRowMajor, CblasUpper, INVALID, + cblas_ctrsv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; - cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; - cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; - cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 2, A, 1, X, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; - cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_ctbsv",11)==0) { cblas_rout = "cblas_ctbsv"; cblas_info = 1; RowMajorStrg = FALSE; - cblas_ctbsv(INVALID, CblasUpper, CblasNoTrans, + cblas_ctbsv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_ctbsv(CblasColMajor, INVALID, CblasNoTrans, + cblas_ctbsv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_ctbsv(CblasColMajor, CblasUpper, INVALID, + cblas_ctbsv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; - cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; - cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; - cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; - cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 1, A, 1, X, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; - cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; - cblas_ctbsv(CblasRowMajor, INVALID, CblasNoTrans, + cblas_ctbsv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_ctbsv(CblasRowMajor, CblasUpper, INVALID, + cblas_ctbsv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; - cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; - cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; - cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; - cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 1, A, 1, X, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; - cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_ctpsv",11)==0) { cblas_rout = "cblas_ctpsv"; cblas_info = 1; RowMajorStrg = FALSE; - cblas_ctpsv(INVALID, CblasUpper, CblasNoTrans, + cblas_ctpsv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_ctpsv(CblasColMajor, INVALID, CblasNoTrans, + cblas_ctpsv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_ctpsv(CblasColMajor, CblasUpper, INVALID, + cblas_ctpsv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; - cblas_ctpsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ctpsv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; - cblas_ctpsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ctpsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; - cblas_ctpsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ctpsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; - cblas_ctpsv(CblasRowMajor, INVALID, CblasNoTrans, + cblas_ctpsv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_ctpsv(CblasRowMajor, CblasUpper, INVALID, + cblas_ctpsv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; - cblas_ctpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ctpsv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; - cblas_ctpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ctpsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; - cblas_ctpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ctpsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_cgeru",10)==0) { @@ -818,7 +818,7 @@ void F77_c2chke(char *rout) { cblas_info = 6; RowMajorStrg = FALSE; cblas_chpr(CblasColMajor, CblasUpper, 0, RALPHA, X, 0, A ); chkxer(); - } + } if (cblas_ok == TRUE) printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); else diff --git a/ctest/c_c3chke.c b/ctest/c_c3chke.c index 29515527b..1c133fb9b 100644 --- a/ctest/c_c3chke.c +++ b/ctest/c_c3chke.c @@ -30,7 +30,7 @@ void F77_c3chke(char * rout) { B[4] = {0.0,0.0,0.0,0.0}, C[4] = {0.0,0.0,0.0,0.0}, ALPHA[2] = {0.0,0.0}, - BETA[2] = {0.0,0.0}, + BETA[2] = {0.0,0.0}, RALPHA = 0.0, RBETA = 0.0; extern int cblas_info, cblas_lerr, cblas_ok; extern int RowMajorStrg; @@ -49,15 +49,15 @@ void F77_c3chke(char * rout) { cblas_rout = "cblas_cgemm" ; cblas_info = 1; - cblas_cgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, + cblas_cgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; - cblas_cgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, + cblas_cgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; - cblas_cgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, + cblas_cgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; @@ -272,7 +272,7 @@ void F77_c3chke(char * rout) { cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); - + } else if (strncmp( sf,"cblas_chemm" ,11)==0) { cblas_rout = "cblas_chemm" ; @@ -1696,7 +1696,7 @@ void F77_c3chke(char * rout) { cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); - + } if (cblas_ok == 1 ) diff --git a/ctest/c_cblas1.c b/ctest/c_cblas1.c index f5ffc14bf..d723fd682 100644 --- a/ctest/c_cblas1.c +++ b/ctest/c_cblas1.c @@ -16,21 +16,21 @@ void F77_caxpy(const int *N, const void *alpha, void *X, return; } -void F77_ccopy(const int *N, void *X, const int *incX, +void F77_ccopy(const int *N, void *X, const int *incX, void *Y, const int *incY) { cblas_ccopy(*N, X, *incX, Y, *incY); return; } -void F77_cdotc(const int *N, void *X, const int *incX, +void F77_cdotc(const int *N, void *X, const int *incX, void *Y, const int *incY, void *dotc) { cblas_cdotc_sub(*N, X, *incX, Y, *incY, dotc); return; } -void F77_cdotu(const int *N, void *X, const int *incX, +void F77_cdotu(const int *N, void *X, const int *incX, void *Y, const int *incY,void *dotu) { cblas_cdotu_sub(*N, X, *incX, Y, *incY, dotu); diff --git a/ctest/c_cblas2.c b/ctest/c_cblas2.c index 7a886ac01..8fbe3b089 100644 --- a/ctest/c_cblas2.c +++ b/ctest/c_cblas2.c @@ -8,9 +8,9 @@ #include "common.h" #include "cblas_test.h" -void F77_cgemv(int *order, char *transp, int *m, int *n, +void F77_cgemv(int *order, char *transp, int *m, int *n, const void *alpha, - CBLAS_TEST_COMPLEX *a, int *lda, const void *x, int *incx, + CBLAS_TEST_COMPLEX *a, int *lda, const void *x, int *incx, const void *beta, void *y, int *incy) { CBLAS_TEST_COMPLEX *A; @@ -38,9 +38,9 @@ void F77_cgemv(int *order, char *transp, int *m, int *n, *m, *n, alpha, a, *lda, x, *incx, beta, y, *incy ); } -void F77_cgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, - CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, - CBLAS_TEST_COMPLEX *x, int *incx, +void F77_cgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *x, int *incx, CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *y, int *incy) { CBLAS_TEST_COMPLEX *A; @@ -85,8 +85,8 @@ void F77_cgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, *incx, beta, y, *incy ); } -void F77_cgeru(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, - CBLAS_TEST_COMPLEX *x, int *incx, CBLAS_TEST_COMPLEX *y, int *incy, +void F77_cgeru(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, + CBLAS_TEST_COMPLEX *x, int *incx, CBLAS_TEST_COMPLEX *y, int *incy, CBLAS_TEST_COMPLEX *a, int *lda){ CBLAS_TEST_COMPLEX *A; @@ -114,8 +114,8 @@ void F77_cgeru(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, cblas_cgeru( UNDEFINED, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); } -void F77_cgerc(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, - CBLAS_TEST_COMPLEX *x, int *incx, CBLAS_TEST_COMPLEX *y, int *incy, +void F77_cgerc(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, + CBLAS_TEST_COMPLEX *x, int *incx, CBLAS_TEST_COMPLEX *y, int *incy, CBLAS_TEST_COMPLEX *a, int *lda) { CBLAS_TEST_COMPLEX *A; int i,j,LDA; @@ -165,7 +165,7 @@ void F77_chemv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, free(A); } else if (*order == TEST_COL_MJR) - cblas_chemv( CblasColMajor, uplo, *n, alpha, a, *lda, x, *incx, + cblas_chemv( CblasColMajor, uplo, *n, alpha, a, *lda, x, *incx, beta, y, *incy ); else cblas_chemv( UNDEFINED, uplo, *n, alpha, a, *lda, x, *incx, @@ -173,7 +173,7 @@ void F77_chemv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, } void F77_chbmv(int *order, char *uplow, int *n, int *k, - CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, CBLAS_TEST_COMPLEX *x, int *incx, CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *y, int *incy){ @@ -186,7 +186,7 @@ int i,irow,j,jcol,LDA; if (*order == TEST_ROW_MJR) { if (uplo != CblasUpper && uplo != CblasLower ) - cblas_chbmv(CblasRowMajor, UNDEFINED, *n, *k, alpha, a, *lda, x, + cblas_chbmv(CblasRowMajor, UNDEFINED, *n, *k, alpha, a, *lda, x, *incx, beta, y, *incy ); else { LDA = *k+2; @@ -237,7 +237,7 @@ int i,irow,j,jcol,LDA; } void F77_chpmv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, - CBLAS_TEST_COMPLEX *ap, CBLAS_TEST_COMPLEX *x, int *incx, + CBLAS_TEST_COMPLEX *ap, CBLAS_TEST_COMPLEX *x, int *incx, CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *y, int *incy){ CBLAS_TEST_COMPLEX *A, *AP; @@ -247,7 +247,7 @@ void F77_chpmv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, get_uplo_type(uplow,&uplo); if (*order == TEST_ROW_MJR) { if (uplo != CblasUpper && uplo != CblasLower ) - cblas_chpmv(CblasRowMajor, UNDEFINED, *n, alpha, ap, x, *incx, + cblas_chpmv(CblasRowMajor, UNDEFINED, *n, alpha, ap, x, *incx, beta, y, *incy); else { LDA = *n; @@ -344,7 +344,7 @@ void F77_ctbmv(int *order, char *uplow, char *transp, char *diagn, } } } - cblas_ctbmv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA, x, + cblas_ctbmv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA, x, *incx); free(A); } @@ -371,7 +371,7 @@ void F77_ctbsv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { if (uplo != CblasUpper && uplo != CblasLower ) - cblas_ctbsv(CblasRowMajor, UNDEFINED, trans, diag, *n, *k, a, *lda, x, + cblas_ctbsv(CblasRowMajor, UNDEFINED, trans, diag, *n, *k, a, *lda, x, *incx); else { LDA = *k+2; @@ -408,7 +408,7 @@ void F77_ctbsv(int *order, char *uplow, char *transp, char *diagn, } } } - cblas_ctbsv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA, + cblas_ctbsv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA, x, *incx); free(A); } @@ -674,7 +674,7 @@ void F77_chpr2(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, if (*order == TEST_ROW_MJR) { if (uplo != CblasUpper && uplo != CblasLower ) - cblas_chpr2( CblasRowMajor, UNDEFINED, *n, alpha, x, *incx, y, + cblas_chpr2( CblasRowMajor, UNDEFINED, *n, alpha, x, *incx, y, *incy, ap ); else { LDA = *n; @@ -752,7 +752,7 @@ void F77_cher(int *order, char *uplow, int *n, float *alpha, LDA = *n+1; A=(CBLAS_TEST_COMPLEX*)malloc((*n)*LDA*sizeof( CBLAS_TEST_COMPLEX )); - for( i=0; i<*n; i++ ) + for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; @@ -786,7 +786,7 @@ void F77_cher2(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, LDA = *n+1; A= ( CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); - for( i=0; i<*n; i++ ) + for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; diff --git a/ctest/c_cblas3.c b/ctest/c_cblas3.c index 9f0da6cc7..0b2f6b966 100644 --- a/ctest/c_cblas3.c +++ b/ctest/c_cblas3.c @@ -12,9 +12,9 @@ #define TEST_ROW_MJR 1 #define UNDEFINED -1 -void F77_cgemm(int *order, char *transpa, char *transpb, int *m, int *n, +void F77_cgemm(int *order, char *transpa, char *transpb, int *m, int *n, int *k, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, - CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, + CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) { CBLAS_TEST_COMPLEX *A, *B, *C; @@ -134,7 +134,7 @@ void F77_chemm(int *order, char *rtlf, char *uplow, int *m, int *n, C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } - cblas_chemm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, + cblas_chemm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { @@ -146,10 +146,10 @@ void F77_chemm(int *order, char *rtlf, char *uplow, int *m, int *n, free(C); } else if (*order == TEST_COL_MJR) - cblas_chemm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + cblas_chemm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, beta, c, *ldc ); else - cblas_chemm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + cblas_chemm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, beta, c, *ldc ); } void F77_csymm(int *order, char *rtlf, char *uplow, int *m, int *n, @@ -190,7 +190,7 @@ void F77_csymm(int *order, char *rtlf, char *uplow, int *m, int *n, for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) C[i*LDC+j]=c[j*(*ldc)+i]; - cblas_csymm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, + cblas_csymm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) @@ -200,15 +200,15 @@ void F77_csymm(int *order, char *rtlf, char *uplow, int *m, int *n, free(C); } else if (*order == TEST_COL_MJR) - cblas_csymm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + cblas_csymm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, beta, c, *ldc ); else - cblas_csymm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + cblas_csymm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, beta, c, *ldc ); } void F77_cherk(int *order, char *uplow, char *transp, int *n, int *k, - float *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + float *alpha, CBLAS_TEST_COMPLEX *a, int *lda, float *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) { int i,j,LDA,LDC; @@ -245,7 +245,7 @@ void F77_cherk(int *order, char *uplow, char *transp, int *n, int *k, C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } - cblas_cherk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta, + cblas_cherk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*n; i++ ) { @@ -256,15 +256,15 @@ void F77_cherk(int *order, char *uplow, char *transp, int *n, int *k, free(C); } else if (*order == TEST_COL_MJR) - cblas_cherk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + cblas_cherk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta, c, *ldc ); else - cblas_cherk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + cblas_cherk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta, c, *ldc ); } void F77_csyrk(int *order, char *uplow, char *transp, int *n, int *k, - CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) { int i,j,LDA,LDC; @@ -301,7 +301,7 @@ void F77_csyrk(int *order, char *uplow, char *transp, int *n, int *k, C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } - cblas_csyrk(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, beta, + cblas_csyrk(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*n; i++ ) { @@ -312,10 +312,10 @@ void F77_csyrk(int *order, char *uplow, char *transp, int *n, int *k, free(C); } else if (*order == TEST_COL_MJR) - cblas_csyrk(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, beta, + cblas_csyrk(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, beta, c, *ldc ); else - cblas_csyrk(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, beta, + cblas_csyrk(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, beta, c, *ldc ); } void F77_cher2k(int *order, char *uplow, char *transp, int *n, int *k, @@ -364,7 +364,7 @@ void F77_cher2k(int *order, char *uplow, char *transp, int *n, int *k, C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } - cblas_cher2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, + cblas_cher2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, B, LDB, *beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*n; i++ ) { @@ -376,10 +376,10 @@ void F77_cher2k(int *order, char *uplow, char *transp, int *n, int *k, free(C); } else if (*order == TEST_COL_MJR) - cblas_cher2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, + cblas_cher2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, b, *ldb, *beta, c, *ldc ); else - cblas_cher2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, + cblas_cher2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, b, *ldb, *beta, c, *ldc ); } void F77_csyr2k(int *order, char *uplow, char *transp, int *n, int *k, @@ -428,7 +428,7 @@ void F77_csyr2k(int *order, char *uplow, char *transp, int *n, int *k, C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } - cblas_csyr2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, + cblas_csyr2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, B, LDB, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*n; i++ ) { @@ -440,14 +440,14 @@ void F77_csyr2k(int *order, char *uplow, char *transp, int *n, int *k, free(C); } else if (*order == TEST_COL_MJR) - cblas_csyr2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, + cblas_csyr2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, b, *ldb, beta, c, *ldc ); else - cblas_csyr2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, + cblas_csyr2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, b, *ldb, beta, c, *ldc ); } void F77_ctrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, - int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, + int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, CBLAS_TEST_COMPLEX *b, int *ldb) { int i,j,LDA,LDB; CBLAS_TEST_COMPLEX *A, *B; @@ -487,7 +487,7 @@ void F77_ctrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } - cblas_ctrmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, + cblas_ctrmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, A, LDA, B, LDB ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { @@ -498,15 +498,15 @@ void F77_ctrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, free(B); } else if (*order == TEST_COL_MJR) - cblas_ctrmm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, + cblas_ctrmm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, a, *lda, b, *ldb); else - cblas_ctrmm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, + cblas_ctrmm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, a, *lda, b, *ldb); } void F77_ctrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, - int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, + int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, CBLAS_TEST_COMPLEX *b, int *ldb) { int i,j,LDA,LDB; CBLAS_TEST_COMPLEX *A, *B; @@ -546,7 +546,7 @@ void F77_ctrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } - cblas_ctrsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, + cblas_ctrsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, A, LDA, B, LDB ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { @@ -557,9 +557,9 @@ void F77_ctrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, free(B); } else if (*order == TEST_COL_MJR) - cblas_ctrsm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, + cblas_ctrsm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, a, *lda, b, *ldb); else - cblas_ctrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, + cblas_ctrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, a, *lda, b, *ldb); } diff --git a/ctest/c_cblat2.f b/ctest/c_cblat2.f index 545ba4b9f..d934ebb49 100644 --- a/ctest/c_cblat2.f +++ b/ctest/c_cblat2.f @@ -348,13 +348,13 @@ 160 IF (CORDER) THEN CALL CCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, - $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, $ 0 ) END IF IF (RORDER) THEN CALL CCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, - $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, $ 1 ) END IF GO TO 200 @@ -581,7 +581,7 @@ CTRANS = ' CblasNoTrans' ELSE IF (TRANS.EQ.'T')THEN CTRANS = ' CblasTrans' - ELSE + ELSE CTRANS = 'CblasConjTrans' END IF TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' @@ -684,7 +684,7 @@ * * See what data changed inside subroutines. * -* IF(TRANS .NE. 'C' .OR. (INCX .GT. 0 .AND. INCY .GT. 0)) THEN +* IF(TRANS .NE. 'C' .OR. (INCX .GT. 0 .AND. INCY .GT. 0)) THEN ISAME( 1 ) = TRANS.EQ.TRANSS ISAME( 2 ) = MS.EQ.M ISAME( 3 ) = NS.EQ.N @@ -925,7 +925,7 @@ UPLO = ICH( IC: IC ) IF (UPLO.EQ.'U')THEN CUPLO = ' CblasUpper' - ELSE + ELSE CUPLO = ' CblasLower' END IF * @@ -1284,7 +1284,7 @@ UPLO = ICHU( ICU: ICU ) IF (UPLO.EQ.'U')THEN CUPLO = ' CblasUpper' - ELSE + ELSE CUPLO = ' CblasLower' END IF * @@ -1294,7 +1294,7 @@ CTRANS = ' CblasNoTrans' ELSE IF (TRANS.EQ.'T')THEN CTRANS = ' CblasTrans' - ELSE + ELSE CTRANS = 'CblasConjTrans' END IF * diff --git a/ctest/c_cblat3.f b/ctest/c_cblat3.f index b03d47916..7d1743b39 100644 --- a/ctest/c_cblat3.f +++ b/ctest/c_cblat3.f @@ -424,7 +424,7 @@ END SUBROUTINE CCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, - $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, $ IORDER ) * * Tests CGEMM. @@ -600,7 +600,7 @@ IF( REWI ) $ REWIND NTRA CALL CCGEMM( IORDER, TRANSA, TRANSB, M, N, - $ K, ALPHA, AA, LDA, BB, LDB, + $ K, ALPHA, AA, LDA, BB, LDB, $ BETA, CC, LDC ) * * Check if error-exit was taken incorrectly. @@ -688,7 +688,7 @@ * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME - CALL CPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, + CALL CPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, $ M, N, K, ALPHA, LDA, LDB, BETA, LDC) * 130 CONTINUE @@ -724,24 +724,24 @@ CHARACTER*1 TRANSA, TRANSB CHARACTER*12 SNAME CHARACTER*14 CRC, CTA,CTB - + IF (TRANSA.EQ.'N')THEN CTA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CTA = ' CblasTrans' - ELSE + ELSE CTA = 'CblasConjTrans' END IF IF (TRANSB.EQ.'N')THEN CTB = ' CblasNoTrans' ELSE IF (TRANSB.EQ.'T')THEN CTB = ' CblasTrans' - ELSE + ELSE CTB = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' - ELSE + ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CTA,CTB @@ -754,7 +754,7 @@ * SUBROUTINE CCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, - $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, $ IORDER ) * * Tests CHEMM and CSYMM. @@ -910,9 +910,9 @@ * Call the subroutine. * IF( TRACE ) - $ CALL CPRCN2(NTRA, NC, SNAME, IORDER, - $ SIDE, UPLO, M, N, ALPHA, LDA, LDB, - $ BETA, LDC) + $ CALL CPRCN2(NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, M, N, ALPHA, LDA, LDB, + $ BETA, LDC) IF( REWI ) $ REWIND NTRA IF( CONJ )THEN @@ -1015,7 +1015,7 @@ 110 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME CALL CPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, ALPHA, LDA, - $ LDB, BETA, LDC) + $ LDB, BETA, LDC) * 120 CONTINUE RETURN @@ -1050,20 +1050,20 @@ CHARACTER*1 SIDE, UPLO CHARACTER*12 SNAME CHARACTER*14 CRC, CS,CU - + IF (SIDE.EQ.'L')THEN CS = ' CblasLeft' - ELSE + ELSE CS = ' CblasRight' END IF IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' - ELSE + ELSE CU = ' CblasLower' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' - ELSE + ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU @@ -1401,22 +1401,22 @@ CHARACTER*1 SIDE, UPLO, TRANSA, DIAG CHARACTER*12 SNAME CHARACTER*14 CRC, CS, CU, CA, CD - + IF (SIDE.EQ.'L')THEN CS = ' CblasLeft' - ELSE + ELSE CS = ' CblasRight' END IF IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' - ELSE + ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' - ELSE + ELSE CA = 'CblasConjTrans' END IF IF (DIAG.EQ.'N')THEN @@ -1426,7 +1426,7 @@ END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' - ELSE + ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU @@ -1787,22 +1787,22 @@ CHARACTER*1 UPLO, TRANSA CHARACTER*12 SNAME CHARACTER*14 CRC, CU, CA - + IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' - ELSE + ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' - ELSE + ELSE CA = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' - ELSE + ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA @@ -1821,29 +1821,29 @@ CHARACTER*1 UPLO, TRANSA CHARACTER*12 SNAME CHARACTER*14 CRC, CU, CA - + IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' - ELSE + ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' - ELSE + ELSE CA = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' - ELSE + ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) - 9994 FORMAT( 10X, 2( I3, ',' ), + 9994 FORMAT( 10X, 2( I3, ',' ), $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ').' ) END * @@ -2040,7 +2040,7 @@ IF( REWI ) $ REWIND NTRA CALL CCSYR2K( IORDER, UPLO, TRANS, N, K, - $ ALPHA, AA, LDA, BB, LDB, BETA, + $ ALPHA, AA, LDA, BB, LDB, BETA, $ CC, LDC ) END IF * @@ -2240,22 +2240,22 @@ CHARACTER*1 UPLO, TRANSA CHARACTER*12 SNAME CHARACTER*14 CRC, CU, CA - + IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' - ELSE + ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' - ELSE + ELSE CA = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' - ELSE + ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA @@ -2275,22 +2275,22 @@ CHARACTER*1 UPLO, TRANSA CHARACTER*12 SNAME CHARACTER*14 CRC, CU, CA - + IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' - ELSE + ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' - ELSE + ELSE CA = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' - ELSE + ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA diff --git a/ctest/c_d2chke.c b/ctest/c_d2chke.c index 23de9a4e7..7cdd04135 100644 --- a/ctest/c_d2chke.c +++ b/ctest/c_d2chke.c @@ -26,9 +26,9 @@ void chkxer(void) { void F77_d2chke(char *rout) { char *sf = ( rout ) ; - double A[2] = {0.0,0.0}, - X[2] = {0.0,0.0}, - Y[2] = {0.0,0.0}, + double A[2] = {0.0,0.0}, + X[2] = {0.0,0.0}, + Y[2] = {0.0,0.0}, ALPHA=0.0, BETA=0.0; extern int cblas_info, cblas_lerr, cblas_ok; extern int RowMajorStrg; @@ -46,588 +46,588 @@ void F77_d2chke(char *rout) { if (strncmp( sf,"cblas_dgemv",11)==0) { cblas_rout = "cblas_dgemv"; cblas_info = 1; - cblas_dgemv(INVALID, CblasNoTrans, 0, 0, + cblas_dgemv(INVALID, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_dgemv(CblasColMajor, INVALID, 0, 0, + cblas_dgemv(CblasColMajor, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_dgemv(CblasColMajor, CblasNoTrans, INVALID, 0, + cblas_dgemv(CblasColMajor, CblasNoTrans, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; - cblas_dgemv(CblasColMajor, CblasNoTrans, 0, INVALID, + cblas_dgemv(CblasColMajor, CblasNoTrans, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; - cblas_dgemv(CblasColMajor, CblasNoTrans, 2, 0, + cblas_dgemv(CblasColMajor, CblasNoTrans, 2, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; - cblas_dgemv(CblasColMajor, CblasNoTrans, 0, 0, + cblas_dgemv(CblasColMajor, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; - cblas_dgemv(CblasColMajor, CblasNoTrans, 0, 0, + cblas_dgemv(CblasColMajor, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; RowMajorStrg = TRUE; - cblas_dgemv(CblasRowMajor, INVALID, 0, 0, + cblas_dgemv(CblasRowMajor, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_dgemv(CblasRowMajor, CblasNoTrans, INVALID, 0, + cblas_dgemv(CblasRowMajor, CblasNoTrans, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; - cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, INVALID, + cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; - cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, 2, + cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; - cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, 0, + cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; - cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, 0, + cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_dgbmv",11)==0) { cblas_rout = "cblas_dgbmv"; cblas_info = 1; RowMajorStrg = FALSE; - cblas_dgbmv(INVALID, CblasNoTrans, 0, 0, 0, 0, + cblas_dgbmv(INVALID, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_dgbmv(CblasColMajor, INVALID, 0, 0, 0, 0, + cblas_dgbmv(CblasColMajor, INVALID, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_dgbmv(CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + cblas_dgbmv(CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; - cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, INVALID, 0, 0, + cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; - cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, INVALID, 0, + cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; - cblas_dgbmv(CblasColMajor, CblasNoTrans, 2, 0, 0, INVALID, + cblas_dgbmv(CblasColMajor, CblasNoTrans, 2, 0, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; - cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, 1, 0, + cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, 1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; - cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, + cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; - cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, + cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; - cblas_dgbmv(CblasRowMajor, INVALID, 0, 0, 0, 0, + cblas_dgbmv(CblasRowMajor, INVALID, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_dgbmv(CblasRowMajor, CblasNoTrans, INVALID, 0, 0, 0, + cblas_dgbmv(CblasRowMajor, CblasNoTrans, INVALID, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; - cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, INVALID, 0, 0, + cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; - cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, INVALID, 0, + cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; - cblas_dgbmv(CblasRowMajor, CblasNoTrans, 2, 0, 0, INVALID, + cblas_dgbmv(CblasRowMajor, CblasNoTrans, 2, 0, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; - cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 1, 0, + cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; - cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, + cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; - cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, + cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_dsymv",11)==0) { cblas_rout = "cblas_dsymv"; cblas_info = 1; RowMajorStrg = FALSE; - cblas_dsymv(INVALID, CblasUpper, 0, + cblas_dsymv(INVALID, CblasUpper, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_dsymv(CblasColMajor, INVALID, 0, + cblas_dsymv(CblasColMajor, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_dsymv(CblasColMajor, CblasUpper, INVALID, + cblas_dsymv(CblasColMajor, CblasUpper, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; - cblas_dsymv(CblasColMajor, CblasUpper, 2, + cblas_dsymv(CblasColMajor, CblasUpper, 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; - cblas_dsymv(CblasColMajor, CblasUpper, 0, + cblas_dsymv(CblasColMajor, CblasUpper, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; - cblas_dsymv(CblasColMajor, CblasUpper, 0, + cblas_dsymv(CblasColMajor, CblasUpper, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; - cblas_dsymv(CblasRowMajor, INVALID, 0, + cblas_dsymv(CblasRowMajor, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_dsymv(CblasRowMajor, CblasUpper, INVALID, + cblas_dsymv(CblasRowMajor, CblasUpper, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; - cblas_dsymv(CblasRowMajor, CblasUpper, 2, + cblas_dsymv(CblasRowMajor, CblasUpper, 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; - cblas_dsymv(CblasRowMajor, CblasUpper, 0, + cblas_dsymv(CblasRowMajor, CblasUpper, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; - cblas_dsymv(CblasRowMajor, CblasUpper, 0, + cblas_dsymv(CblasRowMajor, CblasUpper, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_dsbmv",11)==0) { cblas_rout = "cblas_dsbmv"; cblas_info = 1; RowMajorStrg = FALSE; - cblas_dsbmv(INVALID, CblasUpper, 0, 0, + cblas_dsbmv(INVALID, CblasUpper, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_dsbmv(CblasColMajor, INVALID, 0, 0, + cblas_dsbmv(CblasColMajor, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_dsbmv(CblasColMajor, CblasUpper, INVALID, 0, + cblas_dsbmv(CblasColMajor, CblasUpper, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; - cblas_dsbmv(CblasColMajor, CblasUpper, 0, INVALID, + cblas_dsbmv(CblasColMajor, CblasUpper, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; - cblas_dsbmv(CblasColMajor, CblasUpper, 0, 1, + cblas_dsbmv(CblasColMajor, CblasUpper, 0, 1, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; - cblas_dsbmv(CblasColMajor, CblasUpper, 0, 0, + cblas_dsbmv(CblasColMajor, CblasUpper, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; - cblas_dsbmv(CblasColMajor, CblasUpper, 0, 0, + cblas_dsbmv(CblasColMajor, CblasUpper, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; - cblas_dsbmv(CblasRowMajor, INVALID, 0, 0, + cblas_dsbmv(CblasRowMajor, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_dsbmv(CblasRowMajor, CblasUpper, INVALID, 0, + cblas_dsbmv(CblasRowMajor, CblasUpper, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; - cblas_dsbmv(CblasRowMajor, CblasUpper, 0, INVALID, + cblas_dsbmv(CblasRowMajor, CblasUpper, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; - cblas_dsbmv(CblasRowMajor, CblasUpper, 0, 1, + cblas_dsbmv(CblasRowMajor, CblasUpper, 0, 1, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; - cblas_dsbmv(CblasRowMajor, CblasUpper, 0, 0, + cblas_dsbmv(CblasRowMajor, CblasUpper, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; - cblas_dsbmv(CblasRowMajor, CblasUpper, 0, 0, + cblas_dsbmv(CblasRowMajor, CblasUpper, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_dspmv",11)==0) { cblas_rout = "cblas_dspmv"; cblas_info = 1; RowMajorStrg = FALSE; - cblas_dspmv(INVALID, CblasUpper, 0, + cblas_dspmv(INVALID, CblasUpper, 0, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_dspmv(CblasColMajor, INVALID, 0, + cblas_dspmv(CblasColMajor, INVALID, 0, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_dspmv(CblasColMajor, CblasUpper, INVALID, + cblas_dspmv(CblasColMajor, CblasUpper, INVALID, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; - cblas_dspmv(CblasColMajor, CblasUpper, 0, + cblas_dspmv(CblasColMajor, CblasUpper, 0, ALPHA, A, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; - cblas_dspmv(CblasColMajor, CblasUpper, 0, + cblas_dspmv(CblasColMajor, CblasUpper, 0, ALPHA, A, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; - cblas_dspmv(CblasRowMajor, INVALID, 0, + cblas_dspmv(CblasRowMajor, INVALID, 0, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_dspmv(CblasRowMajor, CblasUpper, INVALID, + cblas_dspmv(CblasRowMajor, CblasUpper, INVALID, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; - cblas_dspmv(CblasRowMajor, CblasUpper, 0, + cblas_dspmv(CblasRowMajor, CblasUpper, 0, ALPHA, A, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; - cblas_dspmv(CblasRowMajor, CblasUpper, 0, + cblas_dspmv(CblasRowMajor, CblasUpper, 0, ALPHA, A, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_dtrmv",11)==0) { cblas_rout = "cblas_dtrmv"; cblas_info = 1; RowMajorStrg = FALSE; - cblas_dtrmv(INVALID, CblasUpper, CblasNoTrans, + cblas_dtrmv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_dtrmv(CblasColMajor, INVALID, CblasNoTrans, + cblas_dtrmv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_dtrmv(CblasColMajor, CblasUpper, INVALID, + cblas_dtrmv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; - cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; - cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; - cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 2, A, 1, X, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; - cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; - cblas_dtrmv(CblasRowMajor, INVALID, CblasNoTrans, + cblas_dtrmv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_dtrmv(CblasRowMajor, CblasUpper, INVALID, + cblas_dtrmv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; - cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; - cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; - cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 2, A, 1, X, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; - cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_dtbmv",11)==0) { cblas_rout = "cblas_dtbmv"; cblas_info = 1; RowMajorStrg = FALSE; - cblas_dtbmv(INVALID, CblasUpper, CblasNoTrans, + cblas_dtbmv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_dtbmv(CblasColMajor, INVALID, CblasNoTrans, + cblas_dtbmv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_dtbmv(CblasColMajor, CblasUpper, INVALID, + cblas_dtbmv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; - cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; - cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; - cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; - cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 1, A, 1, X, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; - cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; - cblas_dtbmv(CblasRowMajor, INVALID, CblasNoTrans, + cblas_dtbmv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_dtbmv(CblasRowMajor, CblasUpper, INVALID, + cblas_dtbmv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; - cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; - cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; - cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; - cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 1, A, 1, X, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; - cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_dtpmv",11)==0) { cblas_rout = "cblas_dtpmv"; cblas_info = 1; RowMajorStrg = FALSE; - cblas_dtpmv(INVALID, CblasUpper, CblasNoTrans, + cblas_dtpmv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_dtpmv(CblasColMajor, INVALID, CblasNoTrans, + cblas_dtpmv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_dtpmv(CblasColMajor, CblasUpper, INVALID, + cblas_dtpmv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; - cblas_dtpmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_dtpmv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; - cblas_dtpmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_dtpmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; - cblas_dtpmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_dtpmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; - cblas_dtpmv(CblasRowMajor, INVALID, CblasNoTrans, + cblas_dtpmv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_dtpmv(CblasRowMajor, CblasUpper, INVALID, + cblas_dtpmv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; - cblas_dtpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_dtpmv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; - cblas_dtpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_dtpmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; - cblas_dtpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_dtpmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_dtrsv",11)==0) { cblas_rout = "cblas_dtrsv"; cblas_info = 1; RowMajorStrg = FALSE; - cblas_dtrsv(INVALID, CblasUpper, CblasNoTrans, + cblas_dtrsv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_dtrsv(CblasColMajor, INVALID, CblasNoTrans, + cblas_dtrsv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_dtrsv(CblasColMajor, CblasUpper, INVALID, + cblas_dtrsv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; - cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; - cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; - cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 2, A, 1, X, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; - cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; - cblas_dtrsv(CblasRowMajor, INVALID, CblasNoTrans, + cblas_dtrsv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_dtrsv(CblasRowMajor, CblasUpper, INVALID, + cblas_dtrsv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; - cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; - cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; - cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 2, A, 1, X, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; - cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_dtbsv",11)==0) { cblas_rout = "cblas_dtbsv"; cblas_info = 1; RowMajorStrg = FALSE; - cblas_dtbsv(INVALID, CblasUpper, CblasNoTrans, + cblas_dtbsv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_dtbsv(CblasColMajor, INVALID, CblasNoTrans, + cblas_dtbsv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_dtbsv(CblasColMajor, CblasUpper, INVALID, + cblas_dtbsv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; - cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; - cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; - cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; - cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 1, A, 1, X, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; - cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; - cblas_dtbsv(CblasRowMajor, INVALID, CblasNoTrans, + cblas_dtbsv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_dtbsv(CblasRowMajor, CblasUpper, INVALID, + cblas_dtbsv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; - cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; - cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; - cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; - cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 1, A, 1, X, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; - cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_dtpsv",11)==0) { cblas_rout = "cblas_dtpsv"; cblas_info = 1; RowMajorStrg = FALSE; - cblas_dtpsv(INVALID, CblasUpper, CblasNoTrans, + cblas_dtpsv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_dtpsv(CblasColMajor, INVALID, CblasNoTrans, + cblas_dtpsv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_dtpsv(CblasColMajor, CblasUpper, INVALID, + cblas_dtpsv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; - cblas_dtpsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_dtpsv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; - cblas_dtpsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_dtpsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; - cblas_dtpsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_dtpsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; - cblas_dtpsv(CblasRowMajor, INVALID, CblasNoTrans, + cblas_dtpsv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_dtpsv(CblasRowMajor, CblasUpper, INVALID, + cblas_dtpsv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; - cblas_dtpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_dtpsv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; - cblas_dtpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_dtpsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; - cblas_dtpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_dtpsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_dger",10)==0) { @@ -781,7 +781,7 @@ void F77_d2chke(char *rout) { cblas_info = 6; RowMajorStrg = FALSE; cblas_dspr(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, A ); chkxer(); - } + } if (cblas_ok == TRUE) printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); else diff --git a/ctest/c_d3chke.c b/ctest/c_d3chke.c index 1149475ab..700cff28f 100644 --- a/ctest/c_d3chke.c +++ b/ctest/c_d3chke.c @@ -26,9 +26,9 @@ void chkxer(void) { void F77_d3chke(char *rout) { char *sf = ( rout ) ; - double A[2] = {0.0,0.0}, - B[2] = {0.0,0.0}, - C[2] = {0.0,0.0}, + double A[2] = {0.0,0.0}, + B[2] = {0.0,0.0}, + C[2] = {0.0,0.0}, ALPHA=0.0, BETA=0.0; extern int cblas_info, cblas_lerr, cblas_ok; extern int RowMajorStrg; @@ -39,7 +39,7 @@ void F77_d3chke(char *rout) { cblas_xerbla(cblas_info,cblas_rout,""); F77_xerbla(cblas_rout,&cblas_info); } - + cblas_ok = TRUE ; cblas_lerr = PASSED ; @@ -47,15 +47,15 @@ void F77_d3chke(char *rout) { cblas_rout = "cblas_dgemm" ; cblas_info = 1; - cblas_dgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, + cblas_dgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; - cblas_dgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, + cblas_dgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; - cblas_dgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, + cblas_dgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; diff --git a/ctest/c_dblas1.c b/ctest/c_dblas1.c index 2371d338b..764a75cdf 100644 --- a/ctest/c_dblas1.c +++ b/ctest/c_dblas1.c @@ -21,7 +21,7 @@ void F77_daxpy(const int *N, const double *alpha, const double *X, return; } -void F77_dcopy(const int *N, double *X, const int *incX, +void F77_dcopy(const int *N, double *X, const int *incX, double *Y, const int *incY) { cblas_dcopy(*N, X, *incX, Y, *incY); diff --git a/ctest/c_dblas2.c b/ctest/c_dblas2.c index ed68402d1..423a58748 100644 --- a/ctest/c_dblas2.c +++ b/ctest/c_dblas2.c @@ -8,8 +8,8 @@ #include "common.h" #include "cblas_test.h" -void F77_dgemv(int *order, char *transp, int *m, int *n, double *alpha, - double *a, int *lda, double *x, int *incx, double *beta, +void F77_dgemv(int *order, char *transp, int *m, int *n, double *alpha, + double *a, int *lda, double *x, int *incx, double *beta, double *y, int *incy ) { double *A; @@ -23,7 +23,7 @@ void F77_dgemv(int *order, char *transp, int *m, int *n, double *alpha, for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; - cblas_dgemv( CblasRowMajor, trans, + cblas_dgemv( CblasRowMajor, trans, *m, *n, *alpha, A, LDA, x, *incx, *beta, y, *incy ); free(A); } @@ -68,9 +68,9 @@ void F77_dtrmv(int *order, char *uplow, char *transp, char *diagn, enum CBLAS_UPLO uplo; enum CBLAS_DIAG diag; - get_transpose_type(transp,&trans); - get_uplo_type(uplow,&uplo); - get_diag_type(diagn,&diag); + get_transpose_type(transp,&trans); + get_uplo_type(uplow,&uplo); + get_diag_type(diagn,&diag); if (*order == TEST_ROW_MJR) { LDA = *n+1; @@ -88,7 +88,7 @@ void F77_dtrmv(int *order, char *uplow, char *transp, char *diagn, } } -void F77_dtrsv(int *order, char *uplow, char *transp, char *diagn, +void F77_dtrsv(int *order, char *uplow, char *transp, char *diagn, int *n, double *a, int *lda, double *x, int *incx ) { double *A; int i,j,LDA; @@ -112,7 +112,7 @@ void F77_dtrsv(int *order, char *uplow, char *transp, char *diagn, else cblas_dtrsv(CblasColMajor, uplo, trans, diag, *n, a, *lda, x, *incx ); } -void F77_dsymv(int *order, char *uplow, int *n, double *alpha, double *a, +void F77_dsymv(int *order, char *uplow, int *n, double *alpha, double *a, int *lda, double *x, int *incx, double *beta, double *y, int *incy) { double *A; @@ -136,7 +136,7 @@ void F77_dsymv(int *order, char *uplow, int *n, double *alpha, double *a, *beta, y, *incy ); } -void F77_dsyr(int *order, char *uplow, int *n, double *alpha, double *x, +void F77_dsyr(int *order, char *uplow, int *n, double *alpha, double *x, int *incx, double *a, int *lda) { double *A; int i,j,LDA; @@ -160,7 +160,7 @@ void F77_dsyr(int *order, char *uplow, int *n, double *alpha, double *x, cblas_dsyr(CblasColMajor, uplo, *n, *alpha, x, *incx, a, *lda); } -void F77_dsyr2(int *order, char *uplow, int *n, double *alpha, double *x, +void F77_dsyr2(int *order, char *uplow, int *n, double *alpha, double *x, int *incx, double *y, int *incy, double *a, int *lda) { double *A; int i,j,LDA; @@ -185,7 +185,7 @@ void F77_dsyr2(int *order, char *uplow, int *n, double *alpha, double *x, } void F77_dgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, - double *alpha, double *a, int *lda, double *x, int *incx, + double *alpha, double *a, int *lda, double *x, int *incx, double *beta, double *y, int *incy ) { double *A; @@ -213,7 +213,7 @@ void F77_dgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, for( j=jcol; j<(*n+*kl); j++ ) A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; } - cblas_dgbmv( CblasRowMajor, trans, *m, *n, *kl, *ku, *alpha, + cblas_dgbmv( CblasRowMajor, trans, *m, *n, *kl, *ku, *alpha, A, LDA, x, *incx, *beta, y, *incy ); free(A); } @@ -230,9 +230,9 @@ void F77_dtbmv(int *order, char *uplow, char *transp, char *diagn, enum CBLAS_UPLO uplo; enum CBLAS_DIAG diag; - get_transpose_type(transp,&trans); - get_uplo_type(uplow,&uplo); - get_diag_type(diagn,&diag); + get_transpose_type(transp,&trans); + get_uplo_type(uplow,&uplo); + get_diag_type(diagn,&diag); if (*order == TEST_ROW_MJR) { LDA = *k+1; @@ -276,9 +276,9 @@ void F77_dtbsv(int *order, char *uplow, char *transp, char *diagn, enum CBLAS_UPLO uplo; enum CBLAS_DIAG diag; - get_transpose_type(transp,&trans); - get_uplo_type(uplow,&uplo); - get_diag_type(diagn,&diag); + get_transpose_type(transp,&trans); + get_uplo_type(uplow,&uplo); + get_diag_type(diagn,&diag); if (*order == TEST_ROW_MJR) { LDA = *k+1; @@ -315,7 +315,7 @@ void F77_dtbsv(int *order, char *uplow, char *transp, char *diagn, } void F77_dsbmv(int *order, char *uplow, int *n, int *k, double *alpha, - double *a, int *lda, double *x, int *incx, double *beta, + double *a, int *lda, double *x, int *incx, double *beta, double *y, int *incy) { double *A; int i,j,irow,jcol,LDA; @@ -387,13 +387,13 @@ void F77_dspmv(int *order, char *uplow, int *n, double *alpha, double *ap, for( j=0; j= 9; i--) if (rout[i] == ' ') rout[i] = '\0'; - + /* We increment *info by 1 since the CBLAS interface adds one more * argument to all level 2 and 3 routines. */ diff --git a/ctest/c_z2chke.c b/ctest/c_z2chke.c index ac6097153..8767b5201 100644 --- a/ctest/c_z2chke.c +++ b/ctest/c_z2chke.c @@ -26,11 +26,11 @@ void chkxer(void) { void F77_z2chke(char *rout) { char *sf = ( rout ) ; - double A[2] = {0.0,0.0}, - X[2] = {0.0,0.0}, - Y[2] = {0.0,0.0}, + double A[2] = {0.0,0.0}, + X[2] = {0.0,0.0}, + Y[2] = {0.0,0.0}, ALPHA[2] = {0.0,0.0}, - BETA[2] = {0.0,0.0}, + BETA[2] = {0.0,0.0}, RALPHA = 0.0; extern int cblas_info, cblas_lerr, cblas_ok; extern int RowMajorStrg; @@ -48,588 +48,588 @@ void F77_z2chke(char *rout) { if (strncmp( sf,"cblas_zgemv",11)==0) { cblas_rout = "cblas_zgemv"; cblas_info = 1; - cblas_zgemv(INVALID, CblasNoTrans, 0, 0, + cblas_zgemv(INVALID, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_zgemv(CblasColMajor, INVALID, 0, 0, + cblas_zgemv(CblasColMajor, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_zgemv(CblasColMajor, CblasNoTrans, INVALID, 0, + cblas_zgemv(CblasColMajor, CblasNoTrans, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; - cblas_zgemv(CblasColMajor, CblasNoTrans, 0, INVALID, + cblas_zgemv(CblasColMajor, CblasNoTrans, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; - cblas_zgemv(CblasColMajor, CblasNoTrans, 2, 0, + cblas_zgemv(CblasColMajor, CblasNoTrans, 2, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; - cblas_zgemv(CblasColMajor, CblasNoTrans, 0, 0, + cblas_zgemv(CblasColMajor, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; - cblas_zgemv(CblasColMajor, CblasNoTrans, 0, 0, + cblas_zgemv(CblasColMajor, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; RowMajorStrg = TRUE; - cblas_zgemv(CblasRowMajor, INVALID, 0, 0, + cblas_zgemv(CblasRowMajor, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_zgemv(CblasRowMajor, CblasNoTrans, INVALID, 0, + cblas_zgemv(CblasRowMajor, CblasNoTrans, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; - cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, INVALID, + cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; - cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, 2, + cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; - cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, 0, + cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; - cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, 0, + cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_zgbmv",11)==0) { cblas_rout = "cblas_zgbmv"; cblas_info = 1; RowMajorStrg = FALSE; - cblas_zgbmv(INVALID, CblasNoTrans, 0, 0, 0, 0, + cblas_zgbmv(INVALID, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_zgbmv(CblasColMajor, INVALID, 0, 0, 0, 0, + cblas_zgbmv(CblasColMajor, INVALID, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_zgbmv(CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + cblas_zgbmv(CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; - cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, INVALID, 0, 0, + cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; - cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, INVALID, 0, + cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; - cblas_zgbmv(CblasColMajor, CblasNoTrans, 2, 0, 0, INVALID, + cblas_zgbmv(CblasColMajor, CblasNoTrans, 2, 0, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; - cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, 1, 0, + cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, 1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; - cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, + cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = FALSE; - cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, + cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; - cblas_zgbmv(CblasRowMajor, INVALID, 0, 0, 0, 0, + cblas_zgbmv(CblasRowMajor, INVALID, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_zgbmv(CblasRowMajor, CblasNoTrans, INVALID, 0, 0, 0, + cblas_zgbmv(CblasRowMajor, CblasNoTrans, INVALID, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; - cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, INVALID, 0, 0, + cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; - cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, INVALID, 0, + cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; - cblas_zgbmv(CblasRowMajor, CblasNoTrans, 2, 0, 0, INVALID, + cblas_zgbmv(CblasRowMajor, CblasNoTrans, 2, 0, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; - cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 1, 0, + cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; - cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, + cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 14; RowMajorStrg = TRUE; - cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, + cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_zhemv",11)==0) { cblas_rout = "cblas_zhemv"; cblas_info = 1; RowMajorStrg = FALSE; - cblas_zhemv(INVALID, CblasUpper, 0, + cblas_zhemv(INVALID, CblasUpper, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_zhemv(CblasColMajor, INVALID, 0, + cblas_zhemv(CblasColMajor, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_zhemv(CblasColMajor, CblasUpper, INVALID, + cblas_zhemv(CblasColMajor, CblasUpper, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; - cblas_zhemv(CblasColMajor, CblasUpper, 2, + cblas_zhemv(CblasColMajor, CblasUpper, 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; - cblas_zhemv(CblasColMajor, CblasUpper, 0, + cblas_zhemv(CblasColMajor, CblasUpper, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = FALSE; - cblas_zhemv(CblasColMajor, CblasUpper, 0, + cblas_zhemv(CblasColMajor, CblasUpper, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; - cblas_zhemv(CblasRowMajor, INVALID, 0, + cblas_zhemv(CblasRowMajor, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_zhemv(CblasRowMajor, CblasUpper, INVALID, + cblas_zhemv(CblasRowMajor, CblasUpper, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; - cblas_zhemv(CblasRowMajor, CblasUpper, 2, + cblas_zhemv(CblasRowMajor, CblasUpper, 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; - cblas_zhemv(CblasRowMajor, CblasUpper, 0, + cblas_zhemv(CblasRowMajor, CblasUpper, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 11; RowMajorStrg = TRUE; - cblas_zhemv(CblasRowMajor, CblasUpper, 0, + cblas_zhemv(CblasRowMajor, CblasUpper, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_zhbmv",11)==0) { cblas_rout = "cblas_zhbmv"; cblas_info = 1; RowMajorStrg = FALSE; - cblas_zhbmv(INVALID, CblasUpper, 0, 0, + cblas_zhbmv(INVALID, CblasUpper, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_zhbmv(CblasColMajor, INVALID, 0, 0, + cblas_zhbmv(CblasColMajor, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_zhbmv(CblasColMajor, CblasUpper, INVALID, 0, + cblas_zhbmv(CblasColMajor, CblasUpper, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; - cblas_zhbmv(CblasColMajor, CblasUpper, 0, INVALID, + cblas_zhbmv(CblasColMajor, CblasUpper, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; - cblas_zhbmv(CblasColMajor, CblasUpper, 0, 1, + cblas_zhbmv(CblasColMajor, CblasUpper, 0, 1, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; - cblas_zhbmv(CblasColMajor, CblasUpper, 0, 0, + cblas_zhbmv(CblasColMajor, CblasUpper, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = FALSE; - cblas_zhbmv(CblasColMajor, CblasUpper, 0, 0, + cblas_zhbmv(CblasColMajor, CblasUpper, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; - cblas_zhbmv(CblasRowMajor, INVALID, 0, 0, + cblas_zhbmv(CblasRowMajor, INVALID, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_zhbmv(CblasRowMajor, CblasUpper, INVALID, 0, + cblas_zhbmv(CblasRowMajor, CblasUpper, INVALID, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; - cblas_zhbmv(CblasRowMajor, CblasUpper, 0, INVALID, + cblas_zhbmv(CblasRowMajor, CblasUpper, 0, INVALID, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; - cblas_zhbmv(CblasRowMajor, CblasUpper, 0, 1, + cblas_zhbmv(CblasRowMajor, CblasUpper, 0, 1, ALPHA, A, 1, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; - cblas_zhbmv(CblasRowMajor, CblasUpper, 0, 0, + cblas_zhbmv(CblasRowMajor, CblasUpper, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 12; RowMajorStrg = TRUE; - cblas_zhbmv(CblasRowMajor, CblasUpper, 0, 0, + cblas_zhbmv(CblasRowMajor, CblasUpper, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_zhpmv",11)==0) { cblas_rout = "cblas_zhpmv"; cblas_info = 1; RowMajorStrg = FALSE; - cblas_zhpmv(INVALID, CblasUpper, 0, + cblas_zhpmv(INVALID, CblasUpper, 0, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_zhpmv(CblasColMajor, INVALID, 0, + cblas_zhpmv(CblasColMajor, INVALID, 0, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_zhpmv(CblasColMajor, CblasUpper, INVALID, + cblas_zhpmv(CblasColMajor, CblasUpper, INVALID, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; - cblas_zhpmv(CblasColMajor, CblasUpper, 0, + cblas_zhpmv(CblasColMajor, CblasUpper, 0, ALPHA, A, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; - cblas_zhpmv(CblasColMajor, CblasUpper, 0, + cblas_zhpmv(CblasColMajor, CblasUpper, 0, ALPHA, A, X, 1, BETA, Y, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; - cblas_zhpmv(CblasRowMajor, INVALID, 0, + cblas_zhpmv(CblasRowMajor, INVALID, 0, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_zhpmv(CblasRowMajor, CblasUpper, INVALID, + cblas_zhpmv(CblasRowMajor, CblasUpper, INVALID, ALPHA, A, X, 1, BETA, Y, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; - cblas_zhpmv(CblasRowMajor, CblasUpper, 0, + cblas_zhpmv(CblasRowMajor, CblasUpper, 0, ALPHA, A, X, 0, BETA, Y, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; - cblas_zhpmv(CblasRowMajor, CblasUpper, 0, + cblas_zhpmv(CblasRowMajor, CblasUpper, 0, ALPHA, A, X, 1, BETA, Y, 0 ); chkxer(); } else if (strncmp( sf,"cblas_ztrmv",11)==0) { cblas_rout = "cblas_ztrmv"; cblas_info = 1; RowMajorStrg = FALSE; - cblas_ztrmv(INVALID, CblasUpper, CblasNoTrans, + cblas_ztrmv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_ztrmv(CblasColMajor, INVALID, CblasNoTrans, + cblas_ztrmv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_ztrmv(CblasColMajor, CblasUpper, INVALID, + cblas_ztrmv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; - cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; - cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; - cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 2, A, 1, X, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; - cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; - cblas_ztrmv(CblasRowMajor, INVALID, CblasNoTrans, + cblas_ztrmv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_ztrmv(CblasRowMajor, CblasUpper, INVALID, + cblas_ztrmv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; - cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; - cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; - cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 2, A, 1, X, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; - cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_ztbmv",11)==0) { cblas_rout = "cblas_ztbmv"; cblas_info = 1; RowMajorStrg = FALSE; - cblas_ztbmv(INVALID, CblasUpper, CblasNoTrans, + cblas_ztbmv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_ztbmv(CblasColMajor, INVALID, CblasNoTrans, + cblas_ztbmv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_ztbmv(CblasColMajor, CblasUpper, INVALID, + cblas_ztbmv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; - cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; - cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; - cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; - cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 1, A, 1, X, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; - cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; - cblas_ztbmv(CblasRowMajor, INVALID, CblasNoTrans, + cblas_ztbmv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_ztbmv(CblasRowMajor, CblasUpper, INVALID, + cblas_ztbmv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; - cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; - cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; - cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; - cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 1, A, 1, X, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; - cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_ztpmv",11)==0) { cblas_rout = "cblas_ztpmv"; cblas_info = 1; RowMajorStrg = FALSE; - cblas_ztpmv(INVALID, CblasUpper, CblasNoTrans, + cblas_ztpmv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_ztpmv(CblasColMajor, INVALID, CblasNoTrans, + cblas_ztpmv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_ztpmv(CblasColMajor, CblasUpper, INVALID, + cblas_ztpmv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; - cblas_ztpmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ztpmv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; - cblas_ztpmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ztpmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; - cblas_ztpmv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ztpmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; - cblas_ztpmv(CblasRowMajor, INVALID, CblasNoTrans, + cblas_ztpmv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_ztpmv(CblasRowMajor, CblasUpper, INVALID, + cblas_ztpmv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; - cblas_ztpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ztpmv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; - cblas_ztpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ztpmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; - cblas_ztpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ztpmv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_ztrsv",11)==0) { cblas_rout = "cblas_ztrsv"; cblas_info = 1; RowMajorStrg = FALSE; - cblas_ztrsv(INVALID, CblasUpper, CblasNoTrans, + cblas_ztrsv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_ztrsv(CblasColMajor, INVALID, CblasNoTrans, + cblas_ztrsv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_ztrsv(CblasColMajor, CblasUpper, INVALID, + cblas_ztrsv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; - cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; - cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = FALSE; - cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 2, A, 1, X, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = FALSE; - cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; - cblas_ztrsv(CblasRowMajor, INVALID, CblasNoTrans, + cblas_ztrsv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_ztrsv(CblasRowMajor, CblasUpper, INVALID, + cblas_ztrsv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; - cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; - cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 7; RowMajorStrg = TRUE; - cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 2, A, 1, X, 1 ); chkxer(); cblas_info = 9; RowMajorStrg = TRUE; - cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, 1, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_ztbsv",11)==0) { cblas_rout = "cblas_ztbsv"; cblas_info = 1; RowMajorStrg = FALSE; - cblas_ztbsv(INVALID, CblasUpper, CblasNoTrans, + cblas_ztbsv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_ztbsv(CblasColMajor, INVALID, CblasNoTrans, + cblas_ztbsv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_ztbsv(CblasColMajor, CblasUpper, INVALID, + cblas_ztbsv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; - cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; - cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = FALSE; - cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; - cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 1, A, 1, X, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = FALSE; - cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; - cblas_ztbsv(CblasRowMajor, INVALID, CblasNoTrans, + cblas_ztbsv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_ztbsv(CblasRowMajor, CblasUpper, INVALID, + cblas_ztbsv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; - cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, 0, A, 1, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; - cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, 0, A, 1, X, 1 ); chkxer(); cblas_info = 6; RowMajorStrg = TRUE; - cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, INVALID, A, 1, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; - cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 1, A, 1, X, 1 ); chkxer(); cblas_info = 10; RowMajorStrg = TRUE; - cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, 0, A, 1, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_ztpsv",11)==0) { cblas_rout = "cblas_ztpsv"; cblas_info = 1; RowMajorStrg = FALSE; - cblas_ztpsv(INVALID, CblasUpper, CblasNoTrans, + cblas_ztpsv(INVALID, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 2; RowMajorStrg = FALSE; - cblas_ztpsv(CblasColMajor, INVALID, CblasNoTrans, + cblas_ztpsv(CblasColMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = FALSE; - cblas_ztpsv(CblasColMajor, CblasUpper, INVALID, + cblas_ztpsv(CblasColMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = FALSE; - cblas_ztpsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ztpsv(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = FALSE; - cblas_ztpsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ztpsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = FALSE; - cblas_ztpsv(CblasColMajor, CblasUpper, CblasNoTrans, + cblas_ztpsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 0 ); chkxer(); cblas_info = 2; RowMajorStrg = TRUE; - cblas_ztpsv(CblasRowMajor, INVALID, CblasNoTrans, + cblas_ztpsv(CblasRowMajor, INVALID, CblasNoTrans, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 3; RowMajorStrg = TRUE; - cblas_ztpsv(CblasRowMajor, CblasUpper, INVALID, + cblas_ztpsv(CblasRowMajor, CblasUpper, INVALID, CblasNonUnit, 0, A, X, 1 ); chkxer(); cblas_info = 4; RowMajorStrg = TRUE; - cblas_ztpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ztpsv(CblasRowMajor, CblasUpper, CblasNoTrans, INVALID, 0, A, X, 1 ); chkxer(); cblas_info = 5; RowMajorStrg = TRUE; - cblas_ztpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ztpsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, INVALID, A, X, 1 ); chkxer(); cblas_info = 8; RowMajorStrg = TRUE; - cblas_ztpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + cblas_ztpsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 0, A, X, 0 ); chkxer(); } else if (strncmp( sf,"cblas_zgeru",10)==0) { @@ -818,7 +818,7 @@ void F77_z2chke(char *rout) { cblas_info = 6; RowMajorStrg = FALSE; cblas_zhpr(CblasColMajor, CblasUpper, 0, RALPHA, X, 0, A ); chkxer(); - } + } if (cblas_ok == TRUE) printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); else diff --git a/ctest/c_z3chke.c b/ctest/c_z3chke.c index b58cb6217..df2513514 100644 --- a/ctest/c_z3chke.c +++ b/ctest/c_z3chke.c @@ -30,7 +30,7 @@ void F77_z3chke(char * rout) { B[4] = {0.0,0.0,0.0,0.0}, C[4] = {0.0,0.0,0.0,0.0}, ALPHA[2] = {0.0,0.0}, - BETA[2] = {0.0,0.0}, + BETA[2] = {0.0,0.0}, RALPHA = 0.0, RBETA = 0.0; extern int cblas_info, cblas_lerr, cblas_ok; extern int RowMajorStrg; @@ -49,15 +49,15 @@ void F77_z3chke(char * rout) { cblas_rout = "cblas_zgemm" ; cblas_info = 1; - cblas_zgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, + cblas_zgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; - cblas_zgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, + cblas_zgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; - cblas_zgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, + cblas_zgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); cblas_info = 1; @@ -272,7 +272,7 @@ void F77_z3chke(char * rout) { cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); - + } else if (strncmp( sf,"cblas_zhemm" ,11)==0) { cblas_rout = "cblas_zhemm" ; @@ -1696,7 +1696,7 @@ void F77_z3chke(char * rout) { cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); chkxer(); - + } if (cblas_ok == 1 ) diff --git a/ctest/c_zblas1.c b/ctest/c_zblas1.c index 0a36f33bd..160ef4ba8 100644 --- a/ctest/c_zblas1.c +++ b/ctest/c_zblas1.c @@ -16,21 +16,21 @@ void F77_zaxpy(const int *N, const void *alpha, void *X, return; } -void F77_zcopy(const int *N, void *X, const int *incX, +void F77_zcopy(const int *N, void *X, const int *incX, void *Y, const int *incY) { cblas_zcopy(*N, X, *incX, Y, *incY); return; } -void F77_zdotc(const int *N, const void *X, const int *incX, +void F77_zdotc(const int *N, const void *X, const int *incX, const void *Y, const int *incY,void *dotc) { cblas_zdotc_sub(*N, X, *incX, Y, *incY, dotc); return; } -void F77_zdotu(const int *N, void *X, const int *incX, +void F77_zdotu(const int *N, void *X, const int *incX, void *Y, const int *incY,void *dotu) { cblas_zdotu_sub(*N, X, *incX, Y, *incY, dotu); diff --git a/ctest/c_zblas2.c b/ctest/c_zblas2.c index 6291abe11..ab1bd79bd 100644 --- a/ctest/c_zblas2.c +++ b/ctest/c_zblas2.c @@ -8,9 +8,9 @@ #include "common.h" #include "cblas_test.h" -void F77_zgemv(int *order, char *transp, int *m, int *n, +void F77_zgemv(int *order, char *transp, int *m, int *n, const void *alpha, - CBLAS_TEST_ZOMPLEX *a, int *lda, const void *x, int *incx, + CBLAS_TEST_ZOMPLEX *a, int *lda, const void *x, int *incx, const void *beta, void *y, int *incy) { CBLAS_TEST_ZOMPLEX *A; @@ -38,9 +38,9 @@ void F77_zgemv(int *order, char *transp, int *m, int *n, *m, *n, alpha, a, *lda, x, *incx, beta, y, *incy ); } -void F77_zgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, - CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, - CBLAS_TEST_ZOMPLEX *x, int *incx, +void F77_zgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *x, int *incx, CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *y, int *incy) { CBLAS_TEST_ZOMPLEX *A; @@ -85,8 +85,8 @@ void F77_zgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, *incx, beta, y, *incy ); } -void F77_zgeru(int *order, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, - CBLAS_TEST_ZOMPLEX *x, int *incx, CBLAS_TEST_ZOMPLEX *y, int *incy, +void F77_zgeru(int *order, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, + CBLAS_TEST_ZOMPLEX *x, int *incx, CBLAS_TEST_ZOMPLEX *y, int *incy, CBLAS_TEST_ZOMPLEX *a, int *lda){ CBLAS_TEST_ZOMPLEX *A; @@ -114,8 +114,8 @@ void F77_zgeru(int *order, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, cblas_zgeru( UNDEFINED, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); } -void F77_zgerc(int *order, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, - CBLAS_TEST_ZOMPLEX *x, int *incx, CBLAS_TEST_ZOMPLEX *y, int *incy, +void F77_zgerc(int *order, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, + CBLAS_TEST_ZOMPLEX *x, int *incx, CBLAS_TEST_ZOMPLEX *y, int *incy, CBLAS_TEST_ZOMPLEX *a, int *lda) { CBLAS_TEST_ZOMPLEX *A; int i,j,LDA; @@ -165,7 +165,7 @@ void F77_zhemv(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha, free(A); } else if (*order == TEST_COL_MJR) - cblas_zhemv( CblasColMajor, uplo, *n, alpha, a, *lda, x, *incx, + cblas_zhemv( CblasColMajor, uplo, *n, alpha, a, *lda, x, *incx, beta, y, *incy ); else cblas_zhemv( UNDEFINED, uplo, *n, alpha, a, *lda, x, *incx, @@ -173,7 +173,7 @@ void F77_zhemv(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha, } void F77_zhbmv(int *order, char *uplow, int *n, int *k, - CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, CBLAS_TEST_ZOMPLEX *x, int *incx, CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *y, int *incy){ @@ -186,7 +186,7 @@ int i,irow,j,jcol,LDA; if (*order == TEST_ROW_MJR) { if (uplo != CblasUpper && uplo != CblasLower ) - cblas_zhbmv(CblasRowMajor, UNDEFINED, *n, *k, alpha, a, *lda, x, + cblas_zhbmv(CblasRowMajor, UNDEFINED, *n, *k, alpha, a, *lda, x, *incx, beta, y, *incy ); else { LDA = *k+2; @@ -237,7 +237,7 @@ int i,irow,j,jcol,LDA; } void F77_zhpmv(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha, - CBLAS_TEST_ZOMPLEX *ap, CBLAS_TEST_ZOMPLEX *x, int *incx, + CBLAS_TEST_ZOMPLEX *ap, CBLAS_TEST_ZOMPLEX *x, int *incx, CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *y, int *incy){ CBLAS_TEST_ZOMPLEX *A, *AP; @@ -247,7 +247,7 @@ void F77_zhpmv(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha, get_uplo_type(uplow,&uplo); if (*order == TEST_ROW_MJR) { if (uplo != CblasUpper && uplo != CblasLower ) - cblas_zhpmv(CblasRowMajor, UNDEFINED, *n, alpha, ap, x, *incx, + cblas_zhpmv(CblasRowMajor, UNDEFINED, *n, alpha, ap, x, *incx, beta, y, *incy); else { LDA = *n; @@ -344,7 +344,7 @@ void F77_ztbmv(int *order, char *uplow, char *transp, char *diagn, } } } - cblas_ztbmv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA, x, + cblas_ztbmv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA, x, *incx); free(A); } @@ -371,7 +371,7 @@ void F77_ztbsv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { if (uplo != CblasUpper && uplo != CblasLower ) - cblas_ztbsv(CblasRowMajor, UNDEFINED, trans, diag, *n, *k, a, *lda, x, + cblas_ztbsv(CblasRowMajor, UNDEFINED, trans, diag, *n, *k, a, *lda, x, *incx); else { LDA = *k+2; @@ -408,7 +408,7 @@ void F77_ztbsv(int *order, char *uplow, char *transp, char *diagn, } } } - cblas_ztbsv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA, + cblas_ztbsv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA, x, *incx); free(A); } @@ -674,7 +674,7 @@ void F77_zhpr2(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha, if (*order == TEST_ROW_MJR) { if (uplo != CblasUpper && uplo != CblasLower ) - cblas_zhpr2( CblasRowMajor, UNDEFINED, *n, alpha, x, *incx, y, + cblas_zhpr2( CblasRowMajor, UNDEFINED, *n, alpha, x, *incx, y, *incy, ap ); else { LDA = *n; @@ -752,7 +752,7 @@ void F77_zher(int *order, char *uplow, int *n, double *alpha, LDA = *n+1; A=(CBLAS_TEST_ZOMPLEX*)malloc((*n)*LDA*sizeof( CBLAS_TEST_ZOMPLEX )); - for( i=0; i<*n; i++ ) + for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; @@ -786,7 +786,7 @@ void F77_zher2(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha, LDA = *n+1; A= ( CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); - for( i=0; i<*n; i++ ) + for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; diff --git a/ctest/c_zblas3.c b/ctest/c_zblas3.c index 7f46365a2..ad744110b 100644 --- a/ctest/c_zblas3.c +++ b/ctest/c_zblas3.c @@ -11,9 +11,9 @@ #define TEST_ROW_MJR 1 #define UNDEFINED -1 -void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n, +void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n, int *k, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, - CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, + CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) { CBLAS_TEST_ZOMPLEX *A, *B, *C; @@ -133,7 +133,7 @@ void F77_zhemm(int *order, char *rtlf, char *uplow, int *m, int *n, C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } - cblas_zhemm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, + cblas_zhemm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { @@ -145,10 +145,10 @@ void F77_zhemm(int *order, char *rtlf, char *uplow, int *m, int *n, free(C); } else if (*order == TEST_COL_MJR) - cblas_zhemm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + cblas_zhemm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, beta, c, *ldc ); else - cblas_zhemm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + cblas_zhemm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, beta, c, *ldc ); } void F77_zsymm(int *order, char *rtlf, char *uplow, int *m, int *n, @@ -189,7 +189,7 @@ void F77_zsymm(int *order, char *rtlf, char *uplow, int *m, int *n, for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) C[i*LDC+j]=c[j*(*ldc)+i]; - cblas_zsymm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, + cblas_zsymm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) @@ -199,15 +199,15 @@ void F77_zsymm(int *order, char *rtlf, char *uplow, int *m, int *n, free(C); } else if (*order == TEST_COL_MJR) - cblas_zsymm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + cblas_zsymm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, beta, c, *ldc ); else - cblas_zsymm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + cblas_zsymm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, beta, c, *ldc ); } void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k, - double *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + double *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, double *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) { int i,j,LDA,LDC; @@ -244,7 +244,7 @@ void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k, C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } - cblas_zherk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta, + cblas_zherk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*n; i++ ) { @@ -255,15 +255,15 @@ void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k, free(C); } else if (*order == TEST_COL_MJR) - cblas_zherk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + cblas_zherk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta, c, *ldc ); else - cblas_zherk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + cblas_zherk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta, c, *ldc ); } void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k, - CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) { int i,j,LDA,LDC; @@ -300,7 +300,7 @@ void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k, C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } - cblas_zsyrk(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, beta, + cblas_zsyrk(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*n; i++ ) { @@ -311,10 +311,10 @@ void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k, free(C); } else if (*order == TEST_COL_MJR) - cblas_zsyrk(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, beta, + cblas_zsyrk(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, beta, c, *ldc ); else - cblas_zsyrk(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, beta, + cblas_zsyrk(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, beta, c, *ldc ); } void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k, @@ -363,7 +363,7 @@ void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k, C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } - cblas_zher2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, + cblas_zher2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, B, LDB, *beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*n; i++ ) { @@ -375,10 +375,10 @@ void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k, free(C); } else if (*order == TEST_COL_MJR) - cblas_zher2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, + cblas_zher2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, b, *ldb, *beta, c, *ldc ); else - cblas_zher2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, + cblas_zher2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, b, *ldb, *beta, c, *ldc ); } void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k, @@ -427,7 +427,7 @@ void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k, C[i*LDC+j].real=c[j*(*ldc)+i].real; C[i*LDC+j].imag=c[j*(*ldc)+i].imag; } - cblas_zsyr2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, + cblas_zsyr2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, B, LDB, beta, C, LDC ); for( j=0; j<*n; j++ ) for( i=0; i<*n; i++ ) { @@ -439,14 +439,14 @@ void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k, free(C); } else if (*order == TEST_COL_MJR) - cblas_zsyr2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, + cblas_zsyr2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, b, *ldb, beta, c, *ldc ); else - cblas_zsyr2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, + cblas_zsyr2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, b, *ldb, beta, c, *ldc ); } void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, - int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, + int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb) { int i,j,LDA,LDB; CBLAS_TEST_ZOMPLEX *A, *B; @@ -486,7 +486,7 @@ void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } - cblas_ztrmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, + cblas_ztrmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, A, LDA, B, LDB ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { @@ -497,15 +497,15 @@ void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, free(B); } else if (*order == TEST_COL_MJR) - cblas_ztrmm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, + cblas_ztrmm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, a, *lda, b, *ldb); else - cblas_ztrmm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, + cblas_ztrmm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, a, *lda, b, *ldb); } void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, - int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, + int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb) { int i,j,LDA,LDB; CBLAS_TEST_ZOMPLEX *A, *B; @@ -545,7 +545,7 @@ void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, B[i*LDB+j].real=b[j*(*ldb)+i].real; B[i*LDB+j].imag=b[j*(*ldb)+i].imag; } - cblas_ztrsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, + cblas_ztrsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, A, LDA, B, LDB ); for( j=0; j<*n; j++ ) for( i=0; i<*m; i++ ) { @@ -556,9 +556,9 @@ void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, free(B); } else if (*order == TEST_COL_MJR) - cblas_ztrsm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, + cblas_ztrsm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, a, *lda, b, *ldb); else - cblas_ztrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, + cblas_ztrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, a, *lda, b, *ldb); } diff --git a/ctest/c_zblat2.f b/ctest/c_zblat2.f index 236088ff3..5a7d83ff4 100644 --- a/ctest/c_zblat2.f +++ b/ctest/c_zblat2.f @@ -69,7 +69,7 @@ INTEGER NSUBS PARAMETER ( NSUBS = 17 ) COMPLEX*16 ZERO, ONE - PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO, RHALF, RONE PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 ) @@ -348,13 +348,13 @@ 160 IF (CORDER) THEN CALL ZCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, - $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, $ 0 ) END IF IF (RORDER) THEN CALL ZCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, - $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, $ 1 ) END IF GO TO 200 @@ -474,7 +474,7 @@ * * .. Parameters .. COMPLEX*16 ZERO, HALF - PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ HALF = ( 0.5D0, 0.0D0 ) ) DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) @@ -582,7 +582,7 @@ CTRANS = ' CblasNoTrans' ELSE IF (TRANS.EQ.'T')THEN CTRANS = ' CblasTrans' - ELSE + ELSE CTRANS = 'CblasConjTrans' END IF TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' @@ -685,7 +685,7 @@ * * See what data changed inside subroutines. * -* IF(TRANS .NE. 'C' .OR. (INCX .GT. 0 .AND. INCY .GT. 0)) THEN +* IF(TRANS .NE. 'C' .OR. (INCX .GT. 0 .AND. INCY .GT. 0)) THEN ISAME( 1 ) = TRANS.EQ.TRANSS ISAME( 2 ) = MS.EQ.M ISAME( 3 ) = NS.EQ.N @@ -927,7 +927,7 @@ UPLO = ICH( IC: IC ) IF (UPLO.EQ.'U')THEN CUPLO = ' CblasUpper' - ELSE + ELSE CUPLO = ' CblasLower' END IF * @@ -1287,7 +1287,7 @@ UPLO = ICHU( ICU: ICU ) IF (UPLO.EQ.'U')THEN CUPLO = ' CblasUpper' - ELSE + ELSE CUPLO = ' CblasLower' END IF * @@ -1297,7 +1297,7 @@ CTRANS = ' CblasNoTrans' ELSE IF (TRANS.EQ.'T')THEN CTRANS = ' CblasTrans' - ELSE + ELSE CTRANS = 'CblasConjTrans' END IF * @@ -1569,7 +1569,7 @@ * * .. Parameters .. COMPLEX*16 ZERO, HALF, ONE - PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ HALF = ( 0.5D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO @@ -1847,7 +1847,7 @@ * * .. Parameters .. COMPLEX*16 ZERO, HALF, ONE - PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ HALF = ( 0.5D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO @@ -2141,7 +2141,7 @@ * * .. Parameters .. COMPLEX*16 ZERO, HALF, ONE - PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ HALF = ( 0.5D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO @@ -2762,7 +2762,7 @@ * * .. Parameters .. COMPLEX*16 ZERO, ONE - PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) COMPLEX*16 ROGUE PARAMETER ( ROGUE = ( -1.0D10, 1.0D10 ) ) diff --git a/ctest/c_zblat3.f b/ctest/c_zblat3.f index 6e9dbbd8c..93b2b7736 100644 --- a/ctest/c_zblat3.f +++ b/ctest/c_zblat3.f @@ -51,7 +51,7 @@ INTEGER NSUBS PARAMETER ( NSUBS = 9 ) COMPLEX*16 ZERO, ONE - PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), $ ONE = ( 1.0D0, 0.0D0 ) ) DOUBLE PRECISION RZERO, RHALF, RONE PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 ) @@ -425,7 +425,7 @@ END SUBROUTINE ZCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, - $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, $ IORDER ) * * Tests ZGEMM. @@ -601,7 +601,7 @@ IF( REWI ) $ REWIND NTRA CALL CZGEMM( IORDER, TRANSA, TRANSB, M, N, - $ K, ALPHA, AA, LDA, BB, LDB, + $ K, ALPHA, AA, LDA, BB, LDB, $ BETA, CC, LDC ) * * Check if error-exit was taken incorrectly. @@ -689,7 +689,7 @@ * 120 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME - CALL ZPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, + CALL ZPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, $ M, N, K, ALPHA, LDA, LDB, BETA, LDC) * 130 CONTINUE @@ -725,24 +725,24 @@ CHARACTER*1 TRANSA, TRANSB CHARACTER*12 SNAME CHARACTER*14 CRC, CTA,CTB - + IF (TRANSA.EQ.'N')THEN CTA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CTA = ' CblasTrans' - ELSE + ELSE CTA = 'CblasConjTrans' END IF IF (TRANSB.EQ.'N')THEN CTB = ' CblasNoTrans' ELSE IF (TRANSB.EQ.'T')THEN CTB = ' CblasTrans' - ELSE + ELSE CTB = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' - ELSE + ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CTA,CTB @@ -755,7 +755,7 @@ * SUBROUTINE ZCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, - $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, $ IORDER ) * * Tests ZHEMM and ZSYMM. @@ -911,9 +911,9 @@ * Call the subroutine. * IF( TRACE ) - $ CALL ZPRCN2(NTRA, NC, SNAME, IORDER, - $ SIDE, UPLO, M, N, ALPHA, LDA, LDB, - $ BETA, LDC) + $ CALL ZPRCN2(NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, M, N, ALPHA, LDA, LDB, + $ BETA, LDC) IF( REWI ) $ REWIND NTRA IF( CONJ )THEN @@ -1016,7 +1016,7 @@ 110 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME CALL ZPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, ALPHA, LDA, - $ LDB, BETA, LDC) + $ LDB, BETA, LDC) * 120 CONTINUE RETURN @@ -1051,20 +1051,20 @@ CHARACTER*1 SIDE, UPLO CHARACTER*12 SNAME CHARACTER*14 CRC, CS,CU - + IF (SIDE.EQ.'L')THEN CS = ' CblasLeft' - ELSE + ELSE CS = ' CblasRight' END IF IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' - ELSE + ELSE CU = ' CblasLower' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' - ELSE + ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU @@ -1402,22 +1402,22 @@ CHARACTER*1 SIDE, UPLO, TRANSA, DIAG CHARACTER*12 SNAME CHARACTER*14 CRC, CS, CU, CA, CD - + IF (SIDE.EQ.'L')THEN CS = ' CblasLeft' - ELSE + ELSE CS = ' CblasRight' END IF IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' - ELSE + ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' - ELSE + ELSE CA = 'CblasConjTrans' END IF IF (DIAG.EQ.'N')THEN @@ -1427,7 +1427,7 @@ END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' - ELSE + ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU @@ -1788,22 +1788,22 @@ CHARACTER*1 UPLO, TRANSA CHARACTER*12 SNAME CHARACTER*14 CRC, CU, CA - + IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' - ELSE + ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' - ELSE + ELSE CA = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' - ELSE + ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA @@ -1822,29 +1822,29 @@ CHARACTER*1 UPLO, TRANSA CHARACTER*12 SNAME CHARACTER*14 CRC, CU, CA - + IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' - ELSE + ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' - ELSE + ELSE CA = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' - ELSE + ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) - 9994 FORMAT( 10X, 2( I3, ',' ), + 9994 FORMAT( 10X, 2( I3, ',' ), $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ').' ) END * @@ -2041,7 +2041,7 @@ IF( REWI ) $ REWIND NTRA CALL CZSYR2K( IORDER, UPLO, TRANS, N, K, - $ ALPHA, AA, LDA, BB, LDB, BETA, + $ ALPHA, AA, LDA, BB, LDB, BETA, $ CC, LDC ) END IF * @@ -2241,22 +2241,22 @@ CHARACTER*1 UPLO, TRANSA CHARACTER*12 SNAME CHARACTER*14 CRC, CU, CA - + IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' - ELSE + ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' - ELSE + ELSE CA = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' - ELSE + ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA @@ -2276,22 +2276,22 @@ CHARACTER*1 UPLO, TRANSA CHARACTER*12 SNAME CHARACTER*14 CRC, CU, CA - + IF (UPLO.EQ.'U')THEN CU = ' CblasUpper' - ELSE + ELSE CU = ' CblasLower' END IF IF (TRANSA.EQ.'N')THEN CA = ' CblasNoTrans' ELSE IF (TRANSA.EQ.'T')THEN CA = ' CblasTrans' - ELSE + ELSE CA = 'CblasConjTrans' END IF IF (IORDER.EQ.1)THEN CRC = ' CblasRowMajor' - ELSE + ELSE CRC = ' CblasColMajor' END IF WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA diff --git a/driver/level2/Makefile b/driver/level2/Makefile index 7043e52e1..79c4ca153 100644 --- a/driver/level2/Makefile +++ b/driver/level2/Makefile @@ -419,3200 +419,3200 @@ endif all :: -sgbmv_n.$(SUFFIX) sgbmv_n.$(PSUFFIX) : gbmv_k.c +sgbmv_n.$(SUFFIX) sgbmv_n.$(PSUFFIX) : gbmv_k.c $(CC) -c -UCOMPLEX -UDOUBLE -UTRANS $(CFLAGS) -o $(@F) $< -sgbmv_t.$(SUFFIX) sgbmv_t.$(PSUFFIX) : gbmv_k.c +sgbmv_t.$(SUFFIX) sgbmv_t.$(PSUFFIX) : gbmv_k.c $(CC) -c -UCOMPLEX -UDOUBLE -DTRANS $(CFLAGS) -o $(@F) $< -dgbmv_n.$(SUFFIX) dgbmv_n.$(PSUFFIX) : gbmv_k.c +dgbmv_n.$(SUFFIX) dgbmv_n.$(PSUFFIX) : gbmv_k.c $(CC) -c -UCOMPLEX -DDOUBLE -UTRANS $(CFLAGS) -o $(@F) $< -dgbmv_t.$(SUFFIX) dgbmv_t.$(PSUFFIX) : gbmv_k.c +dgbmv_t.$(SUFFIX) dgbmv_t.$(PSUFFIX) : gbmv_k.c $(CC) -c -UCOMPLEX -DDOUBLE -DTRANS $(CFLAGS) -o $(@F) $< -qgbmv_n.$(SUFFIX) qgbmv_n.$(PSUFFIX) : gbmv_k.c +qgbmv_n.$(SUFFIX) qgbmv_n.$(PSUFFIX) : gbmv_k.c $(CC) -c -UCOMPLEX -DXDOUBLE -UTRANS $(CFLAGS) -o $(@F) $< -qgbmv_t.$(SUFFIX) qgbmv_t.$(PSUFFIX) : gbmv_k.c +qgbmv_t.$(SUFFIX) qgbmv_t.$(PSUFFIX) : gbmv_k.c $(CC) -c -UCOMPLEX -DXDOUBLE -DTRANS $(CFLAGS) -o $(@F) $< -cgbmv_n.$(SUFFIX) cgbmv_n.$(PSUFFIX) : zgbmv_k.c +cgbmv_n.$(SUFFIX) cgbmv_n.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -UDOUBLE -UTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< -cgbmv_t.$(SUFFIX) cgbmv_t.$(PSUFFIX) : zgbmv_k.c +cgbmv_t.$(SUFFIX) cgbmv_t.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -UDOUBLE -DTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< -cgbmv_r.$(SUFFIX) cgbmv_r.$(PSUFFIX) : zgbmv_k.c +cgbmv_r.$(SUFFIX) cgbmv_r.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -UDOUBLE -UTRANS -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< -cgbmv_c.$(SUFFIX) cgbmv_c.$(PSUFFIX) : zgbmv_k.c +cgbmv_c.$(SUFFIX) cgbmv_c.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -UDOUBLE -DTRANS -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< -cgbmv_o.$(SUFFIX) cgbmv_o.$(PSUFFIX) : zgbmv_k.c +cgbmv_o.$(SUFFIX) cgbmv_o.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -UDOUBLE -UTRANS -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< -cgbmv_u.$(SUFFIX) cgbmv_u.$(PSUFFIX) : zgbmv_k.c +cgbmv_u.$(SUFFIX) cgbmv_u.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -UDOUBLE -DTRANS -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< -cgbmv_s.$(SUFFIX) cgbmv_s.$(PSUFFIX) : zgbmv_k.c +cgbmv_s.$(SUFFIX) cgbmv_s.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -UDOUBLE -UTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< -cgbmv_d.$(SUFFIX) cgbmv_d.$(PSUFFIX) : zgbmv_k.c +cgbmv_d.$(SUFFIX) cgbmv_d.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -UDOUBLE -DTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< -zgbmv_n.$(SUFFIX) zgbmv_n.$(PSUFFIX) : zgbmv_k.c +zgbmv_n.$(SUFFIX) zgbmv_n.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DDOUBLE -UTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< -zgbmv_t.$(SUFFIX) zgbmv_t.$(PSUFFIX) : zgbmv_k.c +zgbmv_t.$(SUFFIX) zgbmv_t.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DDOUBLE -DTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< -zgbmv_r.$(SUFFIX) zgbmv_r.$(PSUFFIX) : zgbmv_k.c +zgbmv_r.$(SUFFIX) zgbmv_r.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DDOUBLE -UTRANS -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< -zgbmv_c.$(SUFFIX) zgbmv_c.$(PSUFFIX) : zgbmv_k.c +zgbmv_c.$(SUFFIX) zgbmv_c.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DDOUBLE -DTRANS -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< -zgbmv_o.$(SUFFIX) zgbmv_o.$(PSUFFIX) : zgbmv_k.c +zgbmv_o.$(SUFFIX) zgbmv_o.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DDOUBLE -UTRANS -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< -zgbmv_u.$(SUFFIX) zgbmv_u.$(PSUFFIX) : zgbmv_k.c +zgbmv_u.$(SUFFIX) zgbmv_u.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DDOUBLE -DTRANS -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< -zgbmv_s.$(SUFFIX) zgbmv_s.$(PSUFFIX) : zgbmv_k.c +zgbmv_s.$(SUFFIX) zgbmv_s.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DDOUBLE -UTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< -zgbmv_d.$(SUFFIX) zgbmv_d.$(PSUFFIX) : zgbmv_k.c +zgbmv_d.$(SUFFIX) zgbmv_d.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DDOUBLE -DTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< -xgbmv_n.$(SUFFIX) xgbmv_n.$(PSUFFIX) : zgbmv_k.c +xgbmv_n.$(SUFFIX) xgbmv_n.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< -xgbmv_t.$(SUFFIX) xgbmv_t.$(PSUFFIX) : zgbmv_k.c +xgbmv_t.$(SUFFIX) xgbmv_t.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< -xgbmv_r.$(SUFFIX) xgbmv_r.$(PSUFFIX) : zgbmv_k.c +xgbmv_r.$(SUFFIX) xgbmv_r.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANS -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< -xgbmv_c.$(SUFFIX) xgbmv_c.$(PSUFFIX) : zgbmv_k.c +xgbmv_c.$(SUFFIX) xgbmv_c.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANS -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< -xgbmv_o.$(SUFFIX) xgbmv_o.$(PSUFFIX) : zgbmv_k.c +xgbmv_o.$(SUFFIX) xgbmv_o.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANS -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< -xgbmv_u.$(SUFFIX) xgbmv_u.$(PSUFFIX) : zgbmv_k.c +xgbmv_u.$(SUFFIX) xgbmv_u.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANS -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< -xgbmv_s.$(SUFFIX) xgbmv_s.$(PSUFFIX) : zgbmv_k.c +xgbmv_s.$(SUFFIX) xgbmv_s.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< -xgbmv_d.$(SUFFIX) xgbmv_d.$(PSUFFIX) : zgbmv_k.c +xgbmv_d.$(SUFFIX) xgbmv_d.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< -sgbmv_thread_n.$(SUFFIX) sgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c +sgbmv_thread_n.$(SUFFIX) sgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c $(CC) -c -UCOMPLEX -UDOUBLE -UTRANSA $(CFLAGS) -o $(@F) $< -sgbmv_thread_t.$(SUFFIX) sgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c +sgbmv_thread_t.$(SUFFIX) sgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c $(CC) -c -UCOMPLEX -UDOUBLE -DTRANSA $(CFLAGS) -o $(@F) $< -dgbmv_thread_n.$(SUFFIX) dgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c +dgbmv_thread_n.$(SUFFIX) dgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c $(CC) -c -UCOMPLEX -DDOUBLE -UTRANSA $(CFLAGS) -o $(@F) $< -dgbmv_thread_t.$(SUFFIX) dgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c +dgbmv_thread_t.$(SUFFIX) dgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c $(CC) -c -UCOMPLEX -DDOUBLE -DTRANSA $(CFLAGS) -o $(@F) $< -qgbmv_thread_n.$(SUFFIX) qgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c +qgbmv_thread_n.$(SUFFIX) qgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c $(CC) -c -UCOMPLEX -DXDOUBLE -UTRANSA $(CFLAGS) -o $(@F) $< -qgbmv_thread_t.$(SUFFIX) qgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c +qgbmv_thread_t.$(SUFFIX) qgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c $(CC) -c -UCOMPLEX -DXDOUBLE -DTRANSA $(CFLAGS) -o $(@F) $< -cgbmv_thread_n.$(SUFFIX) cgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c +cgbmv_thread_n.$(SUFFIX) cgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< -cgbmv_thread_t.$(SUFFIX) cgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c +cgbmv_thread_t.$(SUFFIX) cgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -UDOUBLE -DTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< -cgbmv_thread_r.$(SUFFIX) cgbmv_thread_r.$(PSUFFIX) : gbmv_thread.c +cgbmv_thread_r.$(SUFFIX) cgbmv_thread_r.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -UDOUBLE -UTRANSA -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< -cgbmv_thread_c.$(SUFFIX) cgbmv_thread_c.$(PSUFFIX) : gbmv_thread.c +cgbmv_thread_c.$(SUFFIX) cgbmv_thread_c.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -UDOUBLE -DTRANSA -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< -cgbmv_thread_o.$(SUFFIX) cgbmv_thread_o.$(PSUFFIX) : gbmv_thread.c +cgbmv_thread_o.$(SUFFIX) cgbmv_thread_o.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -UDOUBLE -UTRANSA -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< -cgbmv_thread_u.$(SUFFIX) cgbmv_thread_u.$(PSUFFIX) : gbmv_thread.c +cgbmv_thread_u.$(SUFFIX) cgbmv_thread_u.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -UDOUBLE -DTRANSA -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< -cgbmv_thread_s.$(SUFFIX) cgbmv_thread_s.$(PSUFFIX) : gbmv_thread.c +cgbmv_thread_s.$(SUFFIX) cgbmv_thread_s.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -UDOUBLE -UTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< -cgbmv_thread_d.$(SUFFIX) cgbmv_thread_d.$(PSUFFIX) : gbmv_thread.c +cgbmv_thread_d.$(SUFFIX) cgbmv_thread_d.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -UDOUBLE -DTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< -zgbmv_thread_n.$(SUFFIX) zgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c +zgbmv_thread_n.$(SUFFIX) zgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DDOUBLE -UTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< -zgbmv_thread_t.$(SUFFIX) zgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c +zgbmv_thread_t.$(SUFFIX) zgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DDOUBLE -DTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< -zgbmv_thread_r.$(SUFFIX) zgbmv_thread_r.$(PSUFFIX) : gbmv_thread.c +zgbmv_thread_r.$(SUFFIX) zgbmv_thread_r.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DDOUBLE -UTRANSA -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< -zgbmv_thread_c.$(SUFFIX) zgbmv_thread_c.$(PSUFFIX) : gbmv_thread.c +zgbmv_thread_c.$(SUFFIX) zgbmv_thread_c.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DDOUBLE -DTRANSA -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< -zgbmv_thread_o.$(SUFFIX) zgbmv_thread_o.$(PSUFFIX) : gbmv_thread.c +zgbmv_thread_o.$(SUFFIX) zgbmv_thread_o.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DDOUBLE -UTRANSA -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< -zgbmv_thread_u.$(SUFFIX) zgbmv_thread_u.$(PSUFFIX) : gbmv_thread.c +zgbmv_thread_u.$(SUFFIX) zgbmv_thread_u.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DDOUBLE -DTRANSA -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< -zgbmv_thread_s.$(SUFFIX) zgbmv_thread_s.$(PSUFFIX) : gbmv_thread.c +zgbmv_thread_s.$(SUFFIX) zgbmv_thread_s.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DDOUBLE -UTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< -zgbmv_thread_d.$(SUFFIX) zgbmv_thread_d.$(PSUFFIX) : gbmv_thread.c +zgbmv_thread_d.$(SUFFIX) zgbmv_thread_d.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DDOUBLE -DTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< -xgbmv_thread_n.$(SUFFIX) xgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c +xgbmv_thread_n.$(SUFFIX) xgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< -xgbmv_thread_t.$(SUFFIX) xgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c +xgbmv_thread_t.$(SUFFIX) xgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< -xgbmv_thread_r.$(SUFFIX) xgbmv_thread_r.$(PSUFFIX) : gbmv_thread.c +xgbmv_thread_r.$(SUFFIX) xgbmv_thread_r.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANSA -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< -xgbmv_thread_c.$(SUFFIX) xgbmv_thread_c.$(PSUFFIX) : gbmv_thread.c +xgbmv_thread_c.$(SUFFIX) xgbmv_thread_c.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANSA -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< -xgbmv_thread_o.$(SUFFIX) xgbmv_thread_o.$(PSUFFIX) : gbmv_thread.c +xgbmv_thread_o.$(SUFFIX) xgbmv_thread_o.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANSA -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< -xgbmv_thread_u.$(SUFFIX) xgbmv_thread_u.$(PSUFFIX) : gbmv_thread.c +xgbmv_thread_u.$(SUFFIX) xgbmv_thread_u.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANSA -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< -xgbmv_thread_s.$(SUFFIX) xgbmv_thread_s.$(PSUFFIX) : gbmv_thread.c +xgbmv_thread_s.$(SUFFIX) xgbmv_thread_s.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< -xgbmv_thread_d.$(SUFFIX) xgbmv_thread_d.$(PSUFFIX) : gbmv_thread.c +xgbmv_thread_d.$(SUFFIX) xgbmv_thread_d.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< -sgemv_thread_n.$(SUFFIX) sgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h +sgemv_thread_n.$(SUFFIX) sgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) -sgemv_thread_t.$(SUFFIX) sgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h +sgemv_thread_t.$(SUFFIX) sgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) -dgemv_thread_n.$(SUFFIX) dgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h +dgemv_thread_n.$(SUFFIX) dgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) -dgemv_thread_t.$(SUFFIX) dgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h +dgemv_thread_t.$(SUFFIX) dgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) -qgemv_thread_n.$(SUFFIX) qgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h +qgemv_thread_n.$(SUFFIX) qgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) -qgemv_thread_t.$(SUFFIX) qgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h +qgemv_thread_t.$(SUFFIX) qgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) -cgemv_thread_n.$(SUFFIX) cgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h +cgemv_thread_n.$(SUFFIX) cgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) -cgemv_thread_t.$(SUFFIX) cgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h +cgemv_thread_t.$(SUFFIX) cgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) -cgemv_thread_r.$(SUFFIX) cgemv_thread_r.$(PSUFFIX) : gemv_thread.c ../../common.h +cgemv_thread_r.$(SUFFIX) cgemv_thread_r.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DCONJ -UXCONJ $< -o $(@F) -cgemv_thread_c.$(SUFFIX) cgemv_thread_c.$(PSUFFIX) : gemv_thread.c ../../common.h +cgemv_thread_c.$(SUFFIX) cgemv_thread_c.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DCONJ -UXCONJ $< -o $(@F) -cgemv_thread_o.$(SUFFIX) cgemv_thread_o.$(PSUFFIX) : gemv_thread.c ../../common.h +cgemv_thread_o.$(SUFFIX) cgemv_thread_o.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UCONJ -DXCONJ $< -o $(@F) -cgemv_thread_u.$(SUFFIX) cgemv_thread_u.$(PSUFFIX) : gemv_thread.c ../../common.h +cgemv_thread_u.$(SUFFIX) cgemv_thread_u.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UCONJ -DXCONJ $< -o $(@F) -cgemv_thread_s.$(SUFFIX) cgemv_thread_s.$(PSUFFIX) : gemv_thread.c ../../common.h +cgemv_thread_s.$(SUFFIX) cgemv_thread_s.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DCONJ -DXCONJ $< -o $(@F) -cgemv_thread_d.$(SUFFIX) cgemv_thread_d.$(PSUFFIX) : gemv_thread.c ../../common.h +cgemv_thread_d.$(SUFFIX) cgemv_thread_d.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DCONJ -DXCONJ $< -o $(@F) -zgemv_thread_n.$(SUFFIX) zgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h +zgemv_thread_n.$(SUFFIX) zgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) -zgemv_thread_t.$(SUFFIX) zgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h +zgemv_thread_t.$(SUFFIX) zgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) -zgemv_thread_r.$(SUFFIX) zgemv_thread_r.$(PSUFFIX) : gemv_thread.c ../../common.h +zgemv_thread_r.$(SUFFIX) zgemv_thread_r.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DCONJ -UXCONJ $< -o $(@F) -zgemv_thread_c.$(SUFFIX) zgemv_thread_c.$(PSUFFIX) : gemv_thread.c ../../common.h +zgemv_thread_c.$(SUFFIX) zgemv_thread_c.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DCONJ -UXCONJ $< -o $(@F) -zgemv_thread_o.$(SUFFIX) zgemv_thread_o.$(PSUFFIX) : gemv_thread.c ../../common.h +zgemv_thread_o.$(SUFFIX) zgemv_thread_o.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UCONJ -DXCONJ $< -o $(@F) -zgemv_thread_u.$(SUFFIX) zgemv_thread_u.$(PSUFFIX) : gemv_thread.c ../../common.h +zgemv_thread_u.$(SUFFIX) zgemv_thread_u.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UCONJ -DXCONJ $< -o $(@F) -zgemv_thread_s.$(SUFFIX) zgemv_thread_s.$(PSUFFIX) : gemv_thread.c ../../common.h +zgemv_thread_s.$(SUFFIX) zgemv_thread_s.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DCONJ -DXCONJ $< -o $(@F) -zgemv_thread_d.$(SUFFIX) zgemv_thread_d.$(PSUFFIX) : gemv_thread.c ../../common.h +zgemv_thread_d.$(SUFFIX) zgemv_thread_d.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DCONJ -DXCONJ $< -o $(@F) -xgemv_thread_n.$(SUFFIX) xgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h +xgemv_thread_n.$(SUFFIX) xgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) -xgemv_thread_t.$(SUFFIX) xgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h +xgemv_thread_t.$(SUFFIX) xgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) -xgemv_thread_r.$(SUFFIX) xgemv_thread_r.$(PSUFFIX) : gemv_thread.c ../../common.h +xgemv_thread_r.$(SUFFIX) xgemv_thread_r.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DCONJ -UXCONJ $< -o $(@F) -xgemv_thread_c.$(SUFFIX) xgemv_thread_c.$(PSUFFIX) : gemv_thread.c ../../common.h +xgemv_thread_c.$(SUFFIX) xgemv_thread_c.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DCONJ -UXCONJ $< -o $(@F) -xgemv_thread_o.$(SUFFIX) xgemv_thread_o.$(PSUFFIX) : gemv_thread.c ../../common.h +xgemv_thread_o.$(SUFFIX) xgemv_thread_o.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UCONJ -DXCONJ $< -o $(@F) -xgemv_thread_u.$(SUFFIX) xgemv_thread_u.$(PSUFFIX) : gemv_thread.c ../../common.h +xgemv_thread_u.$(SUFFIX) xgemv_thread_u.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UCONJ -DXCONJ $< -o $(@F) -xgemv_thread_s.$(SUFFIX) xgemv_thread_s.$(PSUFFIX) : gemv_thread.c ../../common.h +xgemv_thread_s.$(SUFFIX) xgemv_thread_s.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DCONJ -DXCONJ $< -o $(@F) -xgemv_thread_d.$(SUFFIX) xgemv_thread_d.$(PSUFFIX) : gemv_thread.c ../../common.h +xgemv_thread_d.$(SUFFIX) xgemv_thread_d.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DCONJ -DXCONJ $< -o $(@F) -sger_thread.$(SUFFIX) sger_thread.$(PSUFFIX) : ger_thread.c ../../common.h +sger_thread.$(SUFFIX) sger_thread.$(PSUFFIX) : ger_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UCONJ -UXCONJ $< -o $(@F) -dger_thread.$(SUFFIX) dger_thread.$(PSUFFIX) : ger_thread.c ../../common.h +dger_thread.$(SUFFIX) dger_thread.$(PSUFFIX) : ger_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UCONJ -UXCONJ $< -o $(@F) -qger_thread.$(SUFFIX) qger_thread.$(PSUFFIX) : ger_thread.c ../../common.h +qger_thread.$(SUFFIX) qger_thread.$(PSUFFIX) : ger_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UCONJ -UXCONJ $< -o $(@F) -cger_thread_U.$(SUFFIX) cger_thread_U.$(PSUFFIX) : ger_thread.c ../../common.h +cger_thread_U.$(SUFFIX) cger_thread_U.$(PSUFFIX) : ger_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UCONJ -UXCONJ $< -o $(@F) -cger_thread_C.$(SUFFIX) cger_thread_C.$(PSUFFIX) : ger_thread.c ../../common.h +cger_thread_C.$(SUFFIX) cger_thread_C.$(PSUFFIX) : ger_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DCONJ -UXCONJ $< -o $(@F) -cger_thread_V.$(SUFFIX) cger_thread_V.$(PSUFFIX) : ger_thread.c ../../common.h +cger_thread_V.$(SUFFIX) cger_thread_V.$(PSUFFIX) : ger_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UCONJ -DXCONJ $< -o $(@F) -cger_thread_D.$(SUFFIX) cger_thread_D.$(PSUFFIX) : ger_thread.c ../../common.h +cger_thread_D.$(SUFFIX) cger_thread_D.$(PSUFFIX) : ger_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DCONJ -DXCONJ $< -o $(@F) -zger_thread_U.$(SUFFIX) zger_thread_U.$(PSUFFIX) : ger_thread.c ../../common.h +zger_thread_U.$(SUFFIX) zger_thread_U.$(PSUFFIX) : ger_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UCONJ -UXCONJ $< -o $(@F) -zger_thread_C.$(SUFFIX) zger_thread_C.$(PSUFFIX) : ger_thread.c ../../common.h +zger_thread_C.$(SUFFIX) zger_thread_C.$(PSUFFIX) : ger_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DCONJ -UXCONJ $< -o $(@F) -zger_thread_V.$(SUFFIX) zger_thread_V.$(PSUFFIX) : ger_thread.c ../../common.h +zger_thread_V.$(SUFFIX) zger_thread_V.$(PSUFFIX) : ger_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UCONJ -DXCONJ $< -o $(@F) -zger_thread_D.$(SUFFIX) zger_thread_D.$(PSUFFIX) : ger_thread.c ../../common.h +zger_thread_D.$(SUFFIX) zger_thread_D.$(PSUFFIX) : ger_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DCONJ -DXCONJ $< -o $(@F) -xger_thread_U.$(SUFFIX) xger_thread_U.$(PSUFFIX) : ger_thread.c ../../common.h +xger_thread_U.$(SUFFIX) xger_thread_U.$(PSUFFIX) : ger_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UCONJ -UXCONJ $< -o $(@F) -xger_thread_C.$(SUFFIX) xger_thread_C.$(PSUFFIX) : ger_thread.c ../../common.h +xger_thread_C.$(SUFFIX) xger_thread_C.$(PSUFFIX) : ger_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DCONJ -UXCONJ $< -o $(@F) -xger_thread_V.$(SUFFIX) xger_thread_V.$(PSUFFIX) : ger_thread.c ../../common.h +xger_thread_V.$(SUFFIX) xger_thread_V.$(PSUFFIX) : ger_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UCONJ -DXCONJ $< -o $(@F) -xger_thread_D.$(SUFFIX) xger_thread_D.$(PSUFFIX) : ger_thread.c ../../common.h +xger_thread_D.$(SUFFIX) xger_thread_D.$(PSUFFIX) : ger_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DCONJ -DXCONJ $< -o $(@F) -ssymv_thread_U.$(SUFFIX) ssymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h +ssymv_thread_U.$(SUFFIX) ssymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) -ssymv_thread_L.$(SUFFIX) ssymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h +ssymv_thread_L.$(SUFFIX) ssymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) -dsymv_thread_U.$(SUFFIX) dsymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h +dsymv_thread_U.$(SUFFIX) dsymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) -dsymv_thread_L.$(SUFFIX) dsymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h +dsymv_thread_L.$(SUFFIX) dsymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) -qsymv_thread_U.$(SUFFIX) qsymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h +qsymv_thread_U.$(SUFFIX) qsymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) -qsymv_thread_L.$(SUFFIX) qsymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h +qsymv_thread_L.$(SUFFIX) qsymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) -csymv_thread_U.$(SUFFIX) csymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h +csymv_thread_U.$(SUFFIX) csymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) -csymv_thread_L.$(SUFFIX) csymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h +csymv_thread_L.$(SUFFIX) csymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) -zsymv_thread_U.$(SUFFIX) zsymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h +zsymv_thread_U.$(SUFFIX) zsymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) -zsymv_thread_L.$(SUFFIX) zsymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h +zsymv_thread_L.$(SUFFIX) zsymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) -xsymv_thread_U.$(SUFFIX) xsymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h +xsymv_thread_U.$(SUFFIX) xsymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) -xsymv_thread_L.$(SUFFIX) xsymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h +xsymv_thread_L.$(SUFFIX) xsymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) -chemv_thread_U.$(SUFFIX) chemv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h +chemv_thread_U.$(SUFFIX) chemv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $(@F) -chemv_thread_L.$(SUFFIX) chemv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h +chemv_thread_L.$(SUFFIX) chemv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMV $< -o $(@F) -chemv_thread_V.$(SUFFIX) chemv_thread_V.$(PSUFFIX) : symv_thread.c ../../param.h +chemv_thread_V.$(SUFFIX) chemv_thread_V.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) -chemv_thread_M.$(SUFFIX) chemv_thread_M.$(PSUFFIX) : symv_thread.c ../../param.h +chemv_thread_M.$(SUFFIX) chemv_thread_M.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) -zhemv_thread_U.$(SUFFIX) zhemv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h +zhemv_thread_U.$(SUFFIX) zhemv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMV $< -o $(@F) -zhemv_thread_L.$(SUFFIX) zhemv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h +zhemv_thread_L.$(SUFFIX) zhemv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMV $< -o $(@F) -zhemv_thread_V.$(SUFFIX) zhemv_thread_V.$(PSUFFIX) : symv_thread.c ../../param.h +zhemv_thread_V.$(SUFFIX) zhemv_thread_V.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) -zhemv_thread_M.$(SUFFIX) zhemv_thread_M.$(PSUFFIX) : symv_thread.c ../../param.h +zhemv_thread_M.$(SUFFIX) zhemv_thread_M.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) -xhemv_thread_U.$(SUFFIX) xhemv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h +xhemv_thread_U.$(SUFFIX) xhemv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMV $< -o $(@F) -xhemv_thread_L.$(SUFFIX) xhemv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h +xhemv_thread_L.$(SUFFIX) xhemv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV $< -o $(@F) -xhemv_thread_V.$(SUFFIX) xhemv_thread_V.$(PSUFFIX) : symv_thread.c ../../param.h +xhemv_thread_V.$(SUFFIX) xhemv_thread_V.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) -xhemv_thread_M.$(SUFFIX) xhemv_thread_M.$(PSUFFIX) : symv_thread.c ../../param.h +xhemv_thread_M.$(SUFFIX) xhemv_thread_M.$(PSUFFIX) : symv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) -ssyr_thread_U.$(SUFFIX) ssyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h +ssyr_thread_U.$(SUFFIX) ssyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) -ssyr_thread_L.$(SUFFIX) ssyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h +ssyr_thread_L.$(SUFFIX) ssyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) -dsyr_thread_U.$(SUFFIX) dsyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h +dsyr_thread_U.$(SUFFIX) dsyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) -dsyr_thread_L.$(SUFFIX) dsyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h +dsyr_thread_L.$(SUFFIX) dsyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) -qsyr_thread_U.$(SUFFIX) qsyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h +qsyr_thread_U.$(SUFFIX) qsyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) -qsyr_thread_L.$(SUFFIX) qsyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h +qsyr_thread_L.$(SUFFIX) qsyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) -csyr_thread_U.$(SUFFIX) csyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h +csyr_thread_U.$(SUFFIX) csyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) -csyr_thread_L.$(SUFFIX) csyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h +csyr_thread_L.$(SUFFIX) csyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) -zsyr_thread_U.$(SUFFIX) zsyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h +zsyr_thread_U.$(SUFFIX) zsyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) -zsyr_thread_L.$(SUFFIX) zsyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h +zsyr_thread_L.$(SUFFIX) zsyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) -xsyr_thread_U.$(SUFFIX) xsyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h +xsyr_thread_U.$(SUFFIX) xsyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) -xsyr_thread_L.$(SUFFIX) xsyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h +xsyr_thread_L.$(SUFFIX) xsyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) -cher_thread_U.$(SUFFIX) cher_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h +cher_thread_U.$(SUFFIX) cher_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHER $< -o $(@F) -cher_thread_L.$(SUFFIX) cher_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h +cher_thread_L.$(SUFFIX) cher_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHER $< -o $(@F) -cher_thread_V.$(SUFFIX) cher_thread_V.$(PSUFFIX) : syr_thread.c ../../param.h +cher_thread_V.$(SUFFIX) cher_thread_V.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHERREV $< -o $(@F) -cher_thread_M.$(SUFFIX) cher_thread_M.$(PSUFFIX) : syr_thread.c ../../param.h +cher_thread_M.$(SUFFIX) cher_thread_M.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHERREV $< -o $(@F) -zher_thread_U.$(SUFFIX) zher_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h +zher_thread_U.$(SUFFIX) zher_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHER $< -o $(@F) -zher_thread_L.$(SUFFIX) zher_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h +zher_thread_L.$(SUFFIX) zher_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHER $< -o $(@F) -zher_thread_V.$(SUFFIX) zher_thread_V.$(PSUFFIX) : syr_thread.c ../../param.h +zher_thread_V.$(SUFFIX) zher_thread_V.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHERREV $< -o $(@F) -zher_thread_M.$(SUFFIX) zher_thread_M.$(PSUFFIX) : syr_thread.c ../../param.h +zher_thread_M.$(SUFFIX) zher_thread_M.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHERREV $< -o $(@F) -xher_thread_U.$(SUFFIX) xher_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h +xher_thread_U.$(SUFFIX) xher_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHER $< -o $(@F) -xher_thread_L.$(SUFFIX) xher_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h +xher_thread_L.$(SUFFIX) xher_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHER $< -o $(@F) -xher_thread_V.$(SUFFIX) xher_thread_V.$(PSUFFIX) : syr_thread.c ../../param.h +xher_thread_V.$(SUFFIX) xher_thread_V.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHERREV $< -o $(@F) -xher_thread_M.$(SUFFIX) xher_thread_M.$(PSUFFIX) : syr_thread.c ../../param.h +xher_thread_M.$(SUFFIX) xher_thread_M.$(PSUFFIX) : syr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHERREV $< -o $(@F) -ssyr2_thread_U.$(SUFFIX) ssyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h +ssyr2_thread_U.$(SUFFIX) ssyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) -ssyr2_thread_L.$(SUFFIX) ssyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h +ssyr2_thread_L.$(SUFFIX) ssyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) -dsyr2_thread_U.$(SUFFIX) dsyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h +dsyr2_thread_U.$(SUFFIX) dsyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) -dsyr2_thread_L.$(SUFFIX) dsyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h +dsyr2_thread_L.$(SUFFIX) dsyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) -qsyr2_thread_U.$(SUFFIX) qsyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h +qsyr2_thread_U.$(SUFFIX) qsyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) -qsyr2_thread_L.$(SUFFIX) qsyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h +qsyr2_thread_L.$(SUFFIX) qsyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) -csyr2_thread_U.$(SUFFIX) csyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h +csyr2_thread_U.$(SUFFIX) csyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) -csyr2_thread_L.$(SUFFIX) csyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h +csyr2_thread_L.$(SUFFIX) csyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) -zsyr2_thread_U.$(SUFFIX) zsyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h +zsyr2_thread_U.$(SUFFIX) zsyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) -zsyr2_thread_L.$(SUFFIX) zsyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h +zsyr2_thread_L.$(SUFFIX) zsyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) -xsyr2_thread_U.$(SUFFIX) xsyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h +xsyr2_thread_U.$(SUFFIX) xsyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) -xsyr2_thread_L.$(SUFFIX) xsyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h +xsyr2_thread_L.$(SUFFIX) xsyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) -cher2_thread_U.$(SUFFIX) cher2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h +cher2_thread_U.$(SUFFIX) cher2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHER $< -o $(@F) -cher2_thread_L.$(SUFFIX) cher2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h +cher2_thread_L.$(SUFFIX) cher2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHER $< -o $(@F) -cher2_thread_V.$(SUFFIX) cher2_thread_V.$(PSUFFIX) : syr2_thread.c ../../param.h +cher2_thread_V.$(SUFFIX) cher2_thread_V.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHERREV $< -o $(@F) -cher2_thread_M.$(SUFFIX) cher2_thread_M.$(PSUFFIX) : syr2_thread.c ../../param.h +cher2_thread_M.$(SUFFIX) cher2_thread_M.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHERREV $< -o $(@F) -zher2_thread_U.$(SUFFIX) zher2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h +zher2_thread_U.$(SUFFIX) zher2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHER $< -o $(@F) -zher2_thread_L.$(SUFFIX) zher2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h +zher2_thread_L.$(SUFFIX) zher2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHER $< -o $(@F) -zher2_thread_V.$(SUFFIX) zher2_thread_V.$(PSUFFIX) : syr2_thread.c ../../param.h +zher2_thread_V.$(SUFFIX) zher2_thread_V.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHERREV $< -o $(@F) -zher2_thread_M.$(SUFFIX) zher2_thread_M.$(PSUFFIX) : syr2_thread.c ../../param.h +zher2_thread_M.$(SUFFIX) zher2_thread_M.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHERREV $< -o $(@F) -xher2_thread_U.$(SUFFIX) xher2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h +xher2_thread_U.$(SUFFIX) xher2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHER $< -o $(@F) -xher2_thread_L.$(SUFFIX) xher2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h +xher2_thread_L.$(SUFFIX) xher2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHER $< -o $(@F) -xher2_thread_V.$(SUFFIX) xher2_thread_V.$(PSUFFIX) : syr2_thread.c ../../param.h +xher2_thread_V.$(SUFFIX) xher2_thread_V.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHERREV $< -o $(@F) -xher2_thread_M.$(SUFFIX) xher2_thread_M.$(PSUFFIX) : syr2_thread.c ../../param.h +xher2_thread_M.$(SUFFIX) xher2_thread_M.$(PSUFFIX) : syr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHERREV $< -o $(@F) -chbmv_U.$(SUFFIX) chbmv_U.$(PSUFFIX) : zhbmv_k.c ../../param.h +chbmv_U.$(SUFFIX) chbmv_U.$(PSUFFIX) : zhbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) -chbmv_L.$(SUFFIX) chbmv_L.$(PSUFFIX) : zhbmv_k.c ../../param.h +chbmv_L.$(SUFFIX) chbmv_L.$(PSUFFIX) : zhbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) -chbmv_V.$(SUFFIX) chbmv_V.$(PSUFFIX) : zhbmv_k.c ../../param.h +chbmv_V.$(SUFFIX) chbmv_V.$(PSUFFIX) : zhbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) -chbmv_M.$(SUFFIX) chbmv_M.$(PSUFFIX) : zhbmv_k.c ../../param.h +chbmv_M.$(SUFFIX) chbmv_M.$(PSUFFIX) : zhbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) -zhbmv_U.$(SUFFIX) zhbmv_U.$(PSUFFIX) : zhbmv_k.c ../../param.h +zhbmv_U.$(SUFFIX) zhbmv_U.$(PSUFFIX) : zhbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) -zhbmv_L.$(SUFFIX) zhbmv_L.$(PSUFFIX) : zhbmv_k.c ../../param.h +zhbmv_L.$(SUFFIX) zhbmv_L.$(PSUFFIX) : zhbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) -zhbmv_V.$(SUFFIX) zhbmv_V.$(PSUFFIX) : zhbmv_k.c ../../param.h +zhbmv_V.$(SUFFIX) zhbmv_V.$(PSUFFIX) : zhbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) -zhbmv_M.$(SUFFIX) zhbmv_M.$(PSUFFIX) : zhbmv_k.c ../../param.h +zhbmv_M.$(SUFFIX) zhbmv_M.$(PSUFFIX) : zhbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) -xhbmv_U.$(SUFFIX) xhbmv_U.$(PSUFFIX) : zhbmv_k.c ../../param.h +xhbmv_U.$(SUFFIX) xhbmv_U.$(PSUFFIX) : zhbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) -xhbmv_L.$(SUFFIX) xhbmv_L.$(PSUFFIX) : zhbmv_k.c ../../param.h +xhbmv_L.$(SUFFIX) xhbmv_L.$(PSUFFIX) : zhbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) -xhbmv_V.$(SUFFIX) xhbmv_V.$(PSUFFIX) : zhbmv_k.c ../../param.h +xhbmv_V.$(SUFFIX) xhbmv_V.$(PSUFFIX) : zhbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) -xhbmv_M.$(SUFFIX) xhbmv_M.$(PSUFFIX) : zhbmv_k.c ../../param.h +xhbmv_M.$(SUFFIX) xhbmv_M.$(PSUFFIX) : zhbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) -chbmv_thread_U.$(SUFFIX) chbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h +chbmv_thread_U.$(SUFFIX) chbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $(@F) -chbmv_thread_L.$(SUFFIX) chbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h +chbmv_thread_L.$(SUFFIX) chbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMV $< -o $(@F) -chbmv_thread_V.$(SUFFIX) chbmv_thread_V.$(PSUFFIX) : sbmv_thread.c ../../param.h +chbmv_thread_V.$(SUFFIX) chbmv_thread_V.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) -chbmv_thread_M.$(SUFFIX) chbmv_thread_M.$(PSUFFIX) : sbmv_thread.c ../../param.h +chbmv_thread_M.$(SUFFIX) chbmv_thread_M.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) -zhbmv_thread_U.$(SUFFIX) zhbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h +zhbmv_thread_U.$(SUFFIX) zhbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMV $< -o $(@F) -zhbmv_thread_L.$(SUFFIX) zhbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h +zhbmv_thread_L.$(SUFFIX) zhbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMV $< -o $(@F) -zhbmv_thread_V.$(SUFFIX) zhbmv_thread_V.$(PSUFFIX) : sbmv_thread.c ../../param.h +zhbmv_thread_V.$(SUFFIX) zhbmv_thread_V.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) -zhbmv_thread_M.$(SUFFIX) zhbmv_thread_M.$(PSUFFIX) : sbmv_thread.c ../../param.h +zhbmv_thread_M.$(SUFFIX) zhbmv_thread_M.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) -xhbmv_thread_U.$(SUFFIX) xhbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h +xhbmv_thread_U.$(SUFFIX) xhbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMV $< -o $(@F) -xhbmv_thread_L.$(SUFFIX) xhbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h +xhbmv_thread_L.$(SUFFIX) xhbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV $< -o $(@F) -xhbmv_thread_V.$(SUFFIX) xhbmv_thread_V.$(PSUFFIX) : sbmv_thread.c ../../param.h +xhbmv_thread_V.$(SUFFIX) xhbmv_thread_V.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) -xhbmv_thread_M.$(SUFFIX) xhbmv_thread_M.$(PSUFFIX) : sbmv_thread.c ../../param.h +xhbmv_thread_M.$(SUFFIX) xhbmv_thread_M.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) -cher_U.$(SUFFIX) cher_U.$(PSUFFIX) : zher_k.c ../../common.h +cher_U.$(SUFFIX) cher_U.$(PSUFFIX) : zher_k.c ../../common.h $(CC) -c $(CFLAGS) -UDOUBLE -ULOWER $< -o $(@F) -cher_L.$(SUFFIX) cher_L.$(PSUFFIX) : zher_k.c ../../common.h +cher_L.$(SUFFIX) cher_L.$(PSUFFIX) : zher_k.c ../../common.h $(CC) -c $(CFLAGS) -UDOUBLE -DLOWER $< -o $(@F) -cher_V.$(SUFFIX) cher_V.$(PSUFFIX) : zher_k.c ../../common.h +cher_V.$(SUFFIX) cher_V.$(PSUFFIX) : zher_k.c ../../common.h $(CC) -c $(CFLAGS) -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) -cher_M.$(SUFFIX) cher_M.$(PSUFFIX) : zher_k.c ../../common.h +cher_M.$(SUFFIX) cher_M.$(PSUFFIX) : zher_k.c ../../common.h $(CC) -c $(CFLAGS) -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) -zher_U.$(SUFFIX) zher_U.$(PSUFFIX) : zher_k.c ../../common.h +zher_U.$(SUFFIX) zher_U.$(PSUFFIX) : zher_k.c ../../common.h $(CC) -c $(CFLAGS) -DDOUBLE -ULOWER $< -o $(@F) -zher_L.$(SUFFIX) zher_L.$(PSUFFIX) : zher_k.c ../../common.h +zher_L.$(SUFFIX) zher_L.$(PSUFFIX) : zher_k.c ../../common.h $(CC) -c $(CFLAGS) -DDOUBLE -DLOWER $< -o $(@F) -zher_V.$(SUFFIX) zher_V.$(PSUFFIX) : zher_k.c ../../common.h +zher_V.$(SUFFIX) zher_V.$(PSUFFIX) : zher_k.c ../../common.h $(CC) -c $(CFLAGS) -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) -zher_M.$(SUFFIX) zher_M.$(PSUFFIX) : zher_k.c ../../common.h +zher_M.$(SUFFIX) zher_M.$(PSUFFIX) : zher_k.c ../../common.h $(CC) -c $(CFLAGS) -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) -xher_U.$(SUFFIX) xher_U.$(PSUFFIX) : zher_k.c ../../common.h +xher_U.$(SUFFIX) xher_U.$(PSUFFIX) : zher_k.c ../../common.h $(CC) -c $(CFLAGS) -DXDOUBLE -ULOWER $< -o $(@F) -xher_L.$(SUFFIX) xher_L.$(PSUFFIX) : zher_k.c ../../common.h +xher_L.$(SUFFIX) xher_L.$(PSUFFIX) : zher_k.c ../../common.h $(CC) -c $(CFLAGS) -DXDOUBLE -DLOWER $< -o $(@F) -xher_V.$(SUFFIX) xher_V.$(PSUFFIX) : zher_k.c ../../common.h +xher_V.$(SUFFIX) xher_V.$(PSUFFIX) : zher_k.c ../../common.h $(CC) -c $(CFLAGS) -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) -xher_M.$(SUFFIX) xher_M.$(PSUFFIX) : zher_k.c ../../common.h +xher_M.$(SUFFIX) xher_M.$(PSUFFIX) : zher_k.c ../../common.h $(CC) -c $(CFLAGS) -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) -cher2_U.$(SUFFIX) cher2_U.$(PSUFFIX) : zher2_k.c ../../param.h +cher2_U.$(SUFFIX) cher2_U.$(PSUFFIX) : zher2_k.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) -cher2_L.$(SUFFIX) cher2_L.$(PSUFFIX) : zher2_k.c ../../param.h +cher2_L.$(SUFFIX) cher2_L.$(PSUFFIX) : zher2_k.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) -cher2_V.$(SUFFIX) cher2_V.$(PSUFFIX) : zher2_k.c ../../param.h +cher2_V.$(SUFFIX) cher2_V.$(PSUFFIX) : zher2_k.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) -cher2_M.$(SUFFIX) cher2_M.$(PSUFFIX) : zher2_k.c ../../param.h +cher2_M.$(SUFFIX) cher2_M.$(PSUFFIX) : zher2_k.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DHEMVREV $< -o $(@F) -zher2_U.$(SUFFIX) zher2_U.$(PSUFFIX) : zher2_k.c ../../param.h +zher2_U.$(SUFFIX) zher2_U.$(PSUFFIX) : zher2_k.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) -zher2_L.$(SUFFIX) zher2_L.$(PSUFFIX) : zher2_k.c ../../param.h +zher2_L.$(SUFFIX) zher2_L.$(PSUFFIX) : zher2_k.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) -zher2_V.$(SUFFIX) zher2_V.$(PSUFFIX) : zher2_k.c ../../param.h +zher2_V.$(SUFFIX) zher2_V.$(PSUFFIX) : zher2_k.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) -zher2_M.$(SUFFIX) zher2_M.$(PSUFFIX) : zher2_k.c ../../param.h +zher2_M.$(SUFFIX) zher2_M.$(PSUFFIX) : zher2_k.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DHEMVREV $< -o $(@F) -xher2_U.$(SUFFIX) xher2_U.$(PSUFFIX) : zher2_k.c ../../param.h +xher2_U.$(SUFFIX) xher2_U.$(PSUFFIX) : zher2_k.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) -xher2_L.$(SUFFIX) xher2_L.$(PSUFFIX) : zher2_k.c ../../param.h +xher2_L.$(SUFFIX) xher2_L.$(PSUFFIX) : zher2_k.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) -xher2_V.$(SUFFIX) xher2_V.$(PSUFFIX) : zher2_k.c ../../param.h +xher2_V.$(SUFFIX) xher2_V.$(PSUFFIX) : zher2_k.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER $< -DHEMVREV -o $(@F) -xher2_M.$(SUFFIX) xher2_M.$(PSUFFIX) : zher2_k.c ../../param.h +xher2_M.$(SUFFIX) xher2_M.$(PSUFFIX) : zher2_k.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -DHEMVREV -o $(@F) -chpmv_U.$(SUFFIX) chpmv_U.$(PSUFFIX) : zhpmv_k.c ../../param.h +chpmv_U.$(SUFFIX) chpmv_U.$(PSUFFIX) : zhpmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) -chpmv_L.$(SUFFIX) chpmv_L.$(PSUFFIX) : zhpmv_k.c ../../param.h +chpmv_L.$(SUFFIX) chpmv_L.$(PSUFFIX) : zhpmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) -chpmv_V.$(SUFFIX) chpmv_V.$(PSUFFIX) : zhpmv_k.c ../../param.h +chpmv_V.$(SUFFIX) chpmv_V.$(PSUFFIX) : zhpmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) -chpmv_M.$(SUFFIX) chpmv_M.$(PSUFFIX) : zhpmv_k.c ../../param.h +chpmv_M.$(SUFFIX) chpmv_M.$(PSUFFIX) : zhpmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) -zhpmv_U.$(SUFFIX) zhpmv_U.$(PSUFFIX) : zhpmv_k.c ../../param.h +zhpmv_U.$(SUFFIX) zhpmv_U.$(PSUFFIX) : zhpmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) -zhpmv_L.$(SUFFIX) zhpmv_L.$(PSUFFIX) : zhpmv_k.c ../../param.h +zhpmv_L.$(SUFFIX) zhpmv_L.$(PSUFFIX) : zhpmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) -zhpmv_V.$(SUFFIX) zhpmv_V.$(PSUFFIX) : zhpmv_k.c ../../param.h +zhpmv_V.$(SUFFIX) zhpmv_V.$(PSUFFIX) : zhpmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) -zhpmv_M.$(SUFFIX) zhpmv_M.$(PSUFFIX) : zhpmv_k.c ../../param.h +zhpmv_M.$(SUFFIX) zhpmv_M.$(PSUFFIX) : zhpmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) -xhpmv_U.$(SUFFIX) xhpmv_U.$(PSUFFIX) : zhpmv_k.c ../../param.h +xhpmv_U.$(SUFFIX) xhpmv_U.$(PSUFFIX) : zhpmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) -xhpmv_L.$(SUFFIX) xhpmv_L.$(PSUFFIX) : zhpmv_k.c ../../param.h +xhpmv_L.$(SUFFIX) xhpmv_L.$(PSUFFIX) : zhpmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) -xhpmv_V.$(SUFFIX) xhpmv_V.$(PSUFFIX) : zhpmv_k.c ../../param.h +xhpmv_V.$(SUFFIX) xhpmv_V.$(PSUFFIX) : zhpmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) -xhpmv_M.$(SUFFIX) xhpmv_M.$(PSUFFIX) : zhpmv_k.c ../../param.h +xhpmv_M.$(SUFFIX) xhpmv_M.$(PSUFFIX) : zhpmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) -chpmv_thread_U.$(SUFFIX) chpmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h +chpmv_thread_U.$(SUFFIX) chpmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $(@F) -chpmv_thread_L.$(SUFFIX) chpmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h +chpmv_thread_L.$(SUFFIX) chpmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMV $< -o $(@F) -chpmv_thread_V.$(SUFFIX) chpmv_thread_V.$(PSUFFIX) : spmv_thread.c ../../param.h +chpmv_thread_V.$(SUFFIX) chpmv_thread_V.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) -chpmv_thread_M.$(SUFFIX) chpmv_thread_M.$(PSUFFIX) : spmv_thread.c ../../param.h +chpmv_thread_M.$(SUFFIX) chpmv_thread_M.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) -zhpmv_thread_U.$(SUFFIX) zhpmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h +zhpmv_thread_U.$(SUFFIX) zhpmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMV $< -o $(@F) -zhpmv_thread_L.$(SUFFIX) zhpmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h +zhpmv_thread_L.$(SUFFIX) zhpmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMV $< -o $(@F) -zhpmv_thread_V.$(SUFFIX) zhpmv_thread_V.$(PSUFFIX) : spmv_thread.c ../../param.h +zhpmv_thread_V.$(SUFFIX) zhpmv_thread_V.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) -zhpmv_thread_M.$(SUFFIX) zhpmv_thread_M.$(PSUFFIX) : spmv_thread.c ../../param.h +zhpmv_thread_M.$(SUFFIX) zhpmv_thread_M.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) -xhpmv_thread_U.$(SUFFIX) xhpmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h +xhpmv_thread_U.$(SUFFIX) xhpmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMV $< -o $(@F) -xhpmv_thread_L.$(SUFFIX) xhpmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h +xhpmv_thread_L.$(SUFFIX) xhpmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV $< -o $(@F) -xhpmv_thread_V.$(SUFFIX) xhpmv_thread_V.$(PSUFFIX) : spmv_thread.c ../../param.h +xhpmv_thread_V.$(SUFFIX) xhpmv_thread_V.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) -xhpmv_thread_M.$(SUFFIX) xhpmv_thread_M.$(PSUFFIX) : spmv_thread.c ../../param.h +xhpmv_thread_M.$(SUFFIX) xhpmv_thread_M.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) -chpr_U.$(SUFFIX) chpr_U.$(PSUFFIX) : zhpr_k.c ../../common.h +chpr_U.$(SUFFIX) chpr_U.$(PSUFFIX) : zhpr_k.c ../../common.h $(CC) -c $(CFLAGS) -UDOUBLE -ULOWER $< -o $(@F) -chpr_L.$(SUFFIX) chpr_L.$(PSUFFIX) : zhpr_k.c ../../common.h +chpr_L.$(SUFFIX) chpr_L.$(PSUFFIX) : zhpr_k.c ../../common.h $(CC) -c $(CFLAGS) -UDOUBLE -DLOWER $< -o $(@F) -chpr_V.$(SUFFIX) chpr_V.$(PSUFFIX) : zhpr_k.c ../../common.h +chpr_V.$(SUFFIX) chpr_V.$(PSUFFIX) : zhpr_k.c ../../common.h $(CC) -c $(CFLAGS) -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) -chpr_M.$(SUFFIX) chpr_M.$(PSUFFIX) : zhpr_k.c ../../common.h +chpr_M.$(SUFFIX) chpr_M.$(PSUFFIX) : zhpr_k.c ../../common.h $(CC) -c $(CFLAGS) -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) -zhpr_U.$(SUFFIX) zhpr_U.$(PSUFFIX) : zhpr_k.c ../../common.h +zhpr_U.$(SUFFIX) zhpr_U.$(PSUFFIX) : zhpr_k.c ../../common.h $(CC) -c $(CFLAGS) -DDOUBLE -ULOWER $< -o $(@F) -zhpr_L.$(SUFFIX) zhpr_L.$(PSUFFIX) : zhpr_k.c ../../common.h +zhpr_L.$(SUFFIX) zhpr_L.$(PSUFFIX) : zhpr_k.c ../../common.h $(CC) -c $(CFLAGS) -DDOUBLE -DLOWER $< -o $(@F) -zhpr_V.$(SUFFIX) zhpr_V.$(PSUFFIX) : zhpr_k.c ../../common.h +zhpr_V.$(SUFFIX) zhpr_V.$(PSUFFIX) : zhpr_k.c ../../common.h $(CC) -c $(CFLAGS) -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) -zhpr_M.$(SUFFIX) zhpr_M.$(PSUFFIX) : zhpr_k.c ../../common.h +zhpr_M.$(SUFFIX) zhpr_M.$(PSUFFIX) : zhpr_k.c ../../common.h $(CC) -c $(CFLAGS) -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) -xhpr_U.$(SUFFIX) xhpr_U.$(PSUFFIX) : zhpr_k.c ../../common.h +xhpr_U.$(SUFFIX) xhpr_U.$(PSUFFIX) : zhpr_k.c ../../common.h $(CC) -c $(CFLAGS) -DXDOUBLE -ULOWER $< -o $(@F) -xhpr_L.$(SUFFIX) xhpr_L.$(PSUFFIX) : zhpr_k.c ../../common.h +xhpr_L.$(SUFFIX) xhpr_L.$(PSUFFIX) : zhpr_k.c ../../common.h $(CC) -c $(CFLAGS) -DXDOUBLE -DLOWER $< -o $(@F) -xhpr_V.$(SUFFIX) xhpr_V.$(PSUFFIX) : zhpr_k.c ../../common.h +xhpr_V.$(SUFFIX) xhpr_V.$(PSUFFIX) : zhpr_k.c ../../common.h $(CC) -c $(CFLAGS) -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) -xhpr_M.$(SUFFIX) xhpr_M.$(PSUFFIX) : zhpr_k.c ../../common.h +xhpr_M.$(SUFFIX) xhpr_M.$(PSUFFIX) : zhpr_k.c ../../common.h $(CC) -c $(CFLAGS) -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) -chpr_thread_U.$(SUFFIX) chpr_thread_U.$(PSUFFIX) : spr_thread.c ../../common.h +chpr_thread_U.$(SUFFIX) chpr_thread_U.$(PSUFFIX) : spr_thread.c ../../common.h $(CC) -c $(CFLAGS) -UDOUBLE -ULOWER -DHEMV $< -o $(@F) -chpr_thread_L.$(SUFFIX) chpr_thread_L.$(PSUFFIX) : spr_thread.c ../../common.h +chpr_thread_L.$(SUFFIX) chpr_thread_L.$(PSUFFIX) : spr_thread.c ../../common.h $(CC) -c $(CFLAGS) -UDOUBLE -DLOWER -DHEMV $< -o $(@F) -chpr_thread_V.$(SUFFIX) chpr_thread_V.$(PSUFFIX) : spr_thread.c ../../common.h +chpr_thread_V.$(SUFFIX) chpr_thread_V.$(PSUFFIX) : spr_thread.c ../../common.h $(CC) -c $(CFLAGS) -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) -chpr_thread_M.$(SUFFIX) chpr_thread_M.$(PSUFFIX) : spr_thread.c ../../common.h +chpr_thread_M.$(SUFFIX) chpr_thread_M.$(PSUFFIX) : spr_thread.c ../../common.h $(CC) -c $(CFLAGS) -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) -zhpr_thread_U.$(SUFFIX) zhpr_thread_U.$(PSUFFIX) : spr_thread.c ../../common.h +zhpr_thread_U.$(SUFFIX) zhpr_thread_U.$(PSUFFIX) : spr_thread.c ../../common.h $(CC) -c $(CFLAGS) -DDOUBLE -ULOWER -DHEMV $< -o $(@F) -zhpr_thread_L.$(SUFFIX) zhpr_thread_L.$(PSUFFIX) : spr_thread.c ../../common.h +zhpr_thread_L.$(SUFFIX) zhpr_thread_L.$(PSUFFIX) : spr_thread.c ../../common.h $(CC) -c $(CFLAGS) -DDOUBLE -DLOWER -DHEMV $< -o $(@F) -zhpr_thread_V.$(SUFFIX) zhpr_thread_V.$(PSUFFIX) : spr_thread.c ../../common.h +zhpr_thread_V.$(SUFFIX) zhpr_thread_V.$(PSUFFIX) : spr_thread.c ../../common.h $(CC) -c $(CFLAGS) -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) -zhpr_thread_M.$(SUFFIX) zhpr_thread_M.$(PSUFFIX) : spr_thread.c ../../common.h +zhpr_thread_M.$(SUFFIX) zhpr_thread_M.$(PSUFFIX) : spr_thread.c ../../common.h $(CC) -c $(CFLAGS) -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) -xhpr_thread_U.$(SUFFIX) xhpr_thread_U.$(PSUFFIX) : spr_thread.c ../../common.h +xhpr_thread_U.$(SUFFIX) xhpr_thread_U.$(PSUFFIX) : spr_thread.c ../../common.h $(CC) -c $(CFLAGS) -DXDOUBLE -ULOWER -DHEMV $< -o $(@F) -xhpr_thread_L.$(SUFFIX) xhpr_thread_L.$(PSUFFIX) : spr_thread.c ../../common.h +xhpr_thread_L.$(SUFFIX) xhpr_thread_L.$(PSUFFIX) : spr_thread.c ../../common.h $(CC) -c $(CFLAGS) -DXDOUBLE -DLOWER -DHEMV $< -o $(@F) -xhpr_thread_V.$(SUFFIX) xhpr_thread_V.$(PSUFFIX) : spr_thread.c ../../common.h +xhpr_thread_V.$(SUFFIX) xhpr_thread_V.$(PSUFFIX) : spr_thread.c ../../common.h $(CC) -c $(CFLAGS) -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) -xhpr_thread_M.$(SUFFIX) xhpr_thread_M.$(PSUFFIX) : spr_thread.c ../../common.h +xhpr_thread_M.$(SUFFIX) xhpr_thread_M.$(PSUFFIX) : spr_thread.c ../../common.h $(CC) -c $(CFLAGS) -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) -chpr2_U.$(SUFFIX) chpr2_U.$(PSUFFIX) : zhpr2_k.c ../../param.h +chpr2_U.$(SUFFIX) chpr2_U.$(PSUFFIX) : zhpr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) -chpr2_L.$(SUFFIX) chpr2_L.$(PSUFFIX) : zhpr2_k.c ../../param.h +chpr2_L.$(SUFFIX) chpr2_L.$(PSUFFIX) : zhpr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) -chpr2_V.$(SUFFIX) chpr2_V.$(PSUFFIX) : zhpr2_k.c ../../param.h +chpr2_V.$(SUFFIX) chpr2_V.$(PSUFFIX) : zhpr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) -chpr2_M.$(SUFFIX) chpr2_M.$(PSUFFIX) : zhpr2_k.c ../../param.h +chpr2_M.$(SUFFIX) chpr2_M.$(PSUFFIX) : zhpr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DHEMVREV $< -o $(@F) -zhpr2_U.$(SUFFIX) zhpr2_U.$(PSUFFIX) : zhpr2_k.c ../../param.h +zhpr2_U.$(SUFFIX) zhpr2_U.$(PSUFFIX) : zhpr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) -zhpr2_L.$(SUFFIX) zhpr2_L.$(PSUFFIX) : zhpr2_k.c ../../param.h +zhpr2_L.$(SUFFIX) zhpr2_L.$(PSUFFIX) : zhpr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) -zhpr2_V.$(SUFFIX) zhpr2_V.$(PSUFFIX) : zhpr2_k.c ../../param.h +zhpr2_V.$(SUFFIX) zhpr2_V.$(PSUFFIX) : zhpr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) -zhpr2_M.$(SUFFIX) zhpr2_M.$(PSUFFIX) : zhpr2_k.c ../../param.h +zhpr2_M.$(SUFFIX) zhpr2_M.$(PSUFFIX) : zhpr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DHEMVREV $< -o $(@F) -xhpr2_U.$(SUFFIX) xhpr2_U.$(PSUFFIX) : zhpr2_k.c ../../param.h +xhpr2_U.$(SUFFIX) xhpr2_U.$(PSUFFIX) : zhpr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) -xhpr2_L.$(SUFFIX) xhpr2_L.$(PSUFFIX) : zhpr2_k.c ../../param.h +xhpr2_L.$(SUFFIX) xhpr2_L.$(PSUFFIX) : zhpr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) -xhpr2_V.$(SUFFIX) xhpr2_V.$(PSUFFIX) : zhpr2_k.c ../../param.h +xhpr2_V.$(SUFFIX) xhpr2_V.$(PSUFFIX) : zhpr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) -xhpr2_M.$(SUFFIX) xhpr2_M.$(PSUFFIX) : zhpr2_k.c ../../param.h +xhpr2_M.$(SUFFIX) xhpr2_M.$(PSUFFIX) : zhpr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -DHEMVREV -o $(@F) -chpr2_thread_U.$(SUFFIX) chpr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h +chpr2_thread_U.$(SUFFIX) chpr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DHEMV $< -o $(@F) -chpr2_thread_L.$(SUFFIX) chpr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h +chpr2_thread_L.$(SUFFIX) chpr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DHEMV $< -o $(@F) -chpr2_thread_V.$(SUFFIX) chpr2_thread_V.$(PSUFFIX) : spr2_thread.c ../../param.h +chpr2_thread_V.$(SUFFIX) chpr2_thread_V.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) -chpr2_thread_M.$(SUFFIX) chpr2_thread_M.$(PSUFFIX) : spr2_thread.c ../../param.h +chpr2_thread_M.$(SUFFIX) chpr2_thread_M.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DHEMVREV $< -o $(@F) -zhpr2_thread_U.$(SUFFIX) zhpr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h +zhpr2_thread_U.$(SUFFIX) zhpr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DHEMV $< -o $(@F) -zhpr2_thread_L.$(SUFFIX) zhpr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h +zhpr2_thread_L.$(SUFFIX) zhpr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DHEMV $< -o $(@F) -zhpr2_thread_V.$(SUFFIX) zhpr2_thread_V.$(PSUFFIX) : spr2_thread.c ../../param.h +zhpr2_thread_V.$(SUFFIX) zhpr2_thread_V.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) -zhpr2_thread_M.$(SUFFIX) zhpr2_thread_M.$(PSUFFIX) : spr2_thread.c ../../param.h +zhpr2_thread_M.$(SUFFIX) zhpr2_thread_M.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DHEMVREV $< -o $(@F) -xhpr2_thread_U.$(SUFFIX) xhpr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h +xhpr2_thread_U.$(SUFFIX) xhpr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DHEMV $< -o $(@F) -xhpr2_thread_L.$(SUFFIX) xhpr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h +xhpr2_thread_L.$(SUFFIX) xhpr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DHEMV $< -o $(@F) -xhpr2_thread_V.$(SUFFIX) xhpr2_thread_V.$(PSUFFIX) : spr2_thread.c ../../param.h +xhpr2_thread_V.$(SUFFIX) xhpr2_thread_V.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) -xhpr2_thread_M.$(SUFFIX) xhpr2_thread_M.$(PSUFFIX) : spr2_thread.c ../../param.h +xhpr2_thread_M.$(SUFFIX) xhpr2_thread_M.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -DHEMVREV -o $(@F) -ssbmv_U.$(SUFFIX) ssbmv_U.$(PSUFFIX) : sbmv_k.c ../../param.h +ssbmv_U.$(SUFFIX) ssbmv_U.$(PSUFFIX) : sbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) -ssbmv_L.$(SUFFIX) ssbmv_L.$(PSUFFIX) : sbmv_k.c ../../param.h +ssbmv_L.$(SUFFIX) ssbmv_L.$(PSUFFIX) : sbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) -dsbmv_U.$(SUFFIX) dsbmv_U.$(PSUFFIX) : sbmv_k.c ../../param.h +dsbmv_U.$(SUFFIX) dsbmv_U.$(PSUFFIX) : sbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) -dsbmv_L.$(SUFFIX) dsbmv_L.$(PSUFFIX) : sbmv_k.c ../../param.h +dsbmv_L.$(SUFFIX) dsbmv_L.$(PSUFFIX) : sbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) -qsbmv_U.$(SUFFIX) qsbmv_U.$(PSUFFIX) : sbmv_k.c ../../param.h +qsbmv_U.$(SUFFIX) qsbmv_U.$(PSUFFIX) : sbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) -qsbmv_L.$(SUFFIX) qsbmv_L.$(PSUFFIX) : sbmv_k.c ../../param.h +qsbmv_L.$(SUFFIX) qsbmv_L.$(PSUFFIX) : sbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) -csbmv_U.$(SUFFIX) csbmv_U.$(PSUFFIX) : zsbmv_k.c ../../param.h +csbmv_U.$(SUFFIX) csbmv_U.$(PSUFFIX) : zsbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) -csbmv_L.$(SUFFIX) csbmv_L.$(PSUFFIX) : zsbmv_k.c ../../param.h +csbmv_L.$(SUFFIX) csbmv_L.$(PSUFFIX) : zsbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) -zsbmv_U.$(SUFFIX) zsbmv_U.$(PSUFFIX) : zsbmv_k.c ../../param.h +zsbmv_U.$(SUFFIX) zsbmv_U.$(PSUFFIX) : zsbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) -zsbmv_L.$(SUFFIX) zsbmv_L.$(PSUFFIX) : zsbmv_k.c ../../param.h +zsbmv_L.$(SUFFIX) zsbmv_L.$(PSUFFIX) : zsbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) -xsbmv_U.$(SUFFIX) xsbmv_U.$(PSUFFIX) : zsbmv_k.c ../../param.h +xsbmv_U.$(SUFFIX) xsbmv_U.$(PSUFFIX) : zsbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) -xsbmv_L.$(SUFFIX) xsbmv_L.$(PSUFFIX) : zsbmv_k.c ../../param.h +xsbmv_L.$(SUFFIX) xsbmv_L.$(PSUFFIX) : zsbmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) -ssbmv_thread_U.$(SUFFIX) ssbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h +ssbmv_thread_U.$(SUFFIX) ssbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) -ssbmv_thread_L.$(SUFFIX) ssbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h +ssbmv_thread_L.$(SUFFIX) ssbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) -dsbmv_thread_U.$(SUFFIX) dsbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h +dsbmv_thread_U.$(SUFFIX) dsbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) -dsbmv_thread_L.$(SUFFIX) dsbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h +dsbmv_thread_L.$(SUFFIX) dsbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) -qsbmv_thread_U.$(SUFFIX) qsbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h +qsbmv_thread_U.$(SUFFIX) qsbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) -qsbmv_thread_L.$(SUFFIX) qsbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h +qsbmv_thread_L.$(SUFFIX) qsbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) -csbmv_thread_U.$(SUFFIX) csbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h +csbmv_thread_U.$(SUFFIX) csbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) -csbmv_thread_L.$(SUFFIX) csbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h +csbmv_thread_L.$(SUFFIX) csbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) -zsbmv_thread_U.$(SUFFIX) zsbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h +zsbmv_thread_U.$(SUFFIX) zsbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) -zsbmv_thread_L.$(SUFFIX) zsbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h +zsbmv_thread_L.$(SUFFIX) zsbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) -xsbmv_thread_U.$(SUFFIX) xsbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h +xsbmv_thread_U.$(SUFFIX) xsbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) -xsbmv_thread_L.$(SUFFIX) xsbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h +xsbmv_thread_L.$(SUFFIX) xsbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) -sspmv_U.$(SUFFIX) sspmv_U.$(PSUFFIX) : spmv_k.c ../../param.h +sspmv_U.$(SUFFIX) sspmv_U.$(PSUFFIX) : spmv_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) -sspmv_L.$(SUFFIX) sspmv_L.$(PSUFFIX) : spmv_k.c ../../param.h +sspmv_L.$(SUFFIX) sspmv_L.$(PSUFFIX) : spmv_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) -dspmv_U.$(SUFFIX) dspmv_U.$(PSUFFIX) : spmv_k.c ../../param.h +dspmv_U.$(SUFFIX) dspmv_U.$(PSUFFIX) : spmv_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) -dspmv_L.$(SUFFIX) dspmv_L.$(PSUFFIX) : spmv_k.c ../../param.h +dspmv_L.$(SUFFIX) dspmv_L.$(PSUFFIX) : spmv_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) -qspmv_U.$(SUFFIX) qspmv_U.$(PSUFFIX) : spmv_k.c ../../param.h +qspmv_U.$(SUFFIX) qspmv_U.$(PSUFFIX) : spmv_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) -qspmv_L.$(SUFFIX) qspmv_L.$(PSUFFIX) : spmv_k.c ../../param.h +qspmv_L.$(SUFFIX) qspmv_L.$(PSUFFIX) : spmv_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) -cspmv_U.$(SUFFIX) cspmv_U.$(PSUFFIX) : zspmv_k.c ../../param.h +cspmv_U.$(SUFFIX) cspmv_U.$(PSUFFIX) : zspmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) -cspmv_L.$(SUFFIX) cspmv_L.$(PSUFFIX) : zspmv_k.c ../../param.h +cspmv_L.$(SUFFIX) cspmv_L.$(PSUFFIX) : zspmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) -zspmv_U.$(SUFFIX) zspmv_U.$(PSUFFIX) : zspmv_k.c ../../param.h +zspmv_U.$(SUFFIX) zspmv_U.$(PSUFFIX) : zspmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) -zspmv_L.$(SUFFIX) zspmv_L.$(PSUFFIX) : zspmv_k.c ../../param.h +zspmv_L.$(SUFFIX) zspmv_L.$(PSUFFIX) : zspmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) -xspmv_U.$(SUFFIX) xspmv_U.$(PSUFFIX) : zspmv_k.c ../../param.h +xspmv_U.$(SUFFIX) xspmv_U.$(PSUFFIX) : zspmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) -xspmv_L.$(SUFFIX) xspmv_L.$(PSUFFIX) : zspmv_k.c ../../param.h +xspmv_L.$(SUFFIX) xspmv_L.$(PSUFFIX) : zspmv_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) -sspmv_thread_U.$(SUFFIX) sspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h +sspmv_thread_U.$(SUFFIX) sspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) -sspmv_thread_L.$(SUFFIX) sspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h +sspmv_thread_L.$(SUFFIX) sspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) -dspmv_thread_U.$(SUFFIX) dspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h +dspmv_thread_U.$(SUFFIX) dspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) -dspmv_thread_L.$(SUFFIX) dspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h +dspmv_thread_L.$(SUFFIX) dspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) -qspmv_thread_U.$(SUFFIX) qspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h +qspmv_thread_U.$(SUFFIX) qspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) -qspmv_thread_L.$(SUFFIX) qspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h +qspmv_thread_L.$(SUFFIX) qspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) -cspmv_thread_U.$(SUFFIX) cspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h +cspmv_thread_U.$(SUFFIX) cspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) -cspmv_thread_L.$(SUFFIX) cspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h +cspmv_thread_L.$(SUFFIX) cspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) -zspmv_thread_U.$(SUFFIX) zspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h +zspmv_thread_U.$(SUFFIX) zspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) -zspmv_thread_L.$(SUFFIX) zspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h +zspmv_thread_L.$(SUFFIX) zspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) -xspmv_thread_U.$(SUFFIX) xspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h +xspmv_thread_U.$(SUFFIX) xspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) -xspmv_thread_L.$(SUFFIX) xspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h +xspmv_thread_L.$(SUFFIX) xspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) -sspr_U.$(SUFFIX) sspr_U.$(PSUFFIX) : spr_k.c ../../param.h +sspr_U.$(SUFFIX) sspr_U.$(PSUFFIX) : spr_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) -sspr_L.$(SUFFIX) sspr_L.$(PSUFFIX) : spr_k.c ../../param.h +sspr_L.$(SUFFIX) sspr_L.$(PSUFFIX) : spr_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) -dspr_U.$(SUFFIX) dspr_U.$(PSUFFIX) : spr_k.c ../../param.h +dspr_U.$(SUFFIX) dspr_U.$(PSUFFIX) : spr_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) -dspr_L.$(SUFFIX) dspr_L.$(PSUFFIX) : spr_k.c ../../param.h +dspr_L.$(SUFFIX) dspr_L.$(PSUFFIX) : spr_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) -qspr_U.$(SUFFIX) qspr_U.$(PSUFFIX) : spr_k.c ../../param.h +qspr_U.$(SUFFIX) qspr_U.$(PSUFFIX) : spr_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) -qspr_L.$(SUFFIX) qspr_L.$(PSUFFIX) : spr_k.c ../../param.h +qspr_L.$(SUFFIX) qspr_L.$(PSUFFIX) : spr_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) -cspr_U.$(SUFFIX) cspr_U.$(PSUFFIX) : zspr_k.c ../../param.h +cspr_U.$(SUFFIX) cspr_U.$(PSUFFIX) : zspr_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) -cspr_L.$(SUFFIX) cspr_L.$(PSUFFIX) : zspr_k.c ../../param.h +cspr_L.$(SUFFIX) cspr_L.$(PSUFFIX) : zspr_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) -zspr_U.$(SUFFIX) zspr_U.$(PSUFFIX) : zspr_k.c ../../param.h +zspr_U.$(SUFFIX) zspr_U.$(PSUFFIX) : zspr_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) -zspr_L.$(SUFFIX) zspr_L.$(PSUFFIX) : zspr_k.c ../../param.h +zspr_L.$(SUFFIX) zspr_L.$(PSUFFIX) : zspr_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) -xspr_U.$(SUFFIX) xspr_U.$(PSUFFIX) : zspr_k.c ../../param.h +xspr_U.$(SUFFIX) xspr_U.$(PSUFFIX) : zspr_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) -xspr_L.$(SUFFIX) xspr_L.$(PSUFFIX) : zspr_k.c ../../param.h +xspr_L.$(SUFFIX) xspr_L.$(PSUFFIX) : zspr_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) -sspr_thread_U.$(SUFFIX) sspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h +sspr_thread_U.$(SUFFIX) sspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) -sspr_thread_L.$(SUFFIX) sspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h +sspr_thread_L.$(SUFFIX) sspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) -dspr_thread_U.$(SUFFIX) dspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h +dspr_thread_U.$(SUFFIX) dspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) -dspr_thread_L.$(SUFFIX) dspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h +dspr_thread_L.$(SUFFIX) dspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) -qspr_thread_U.$(SUFFIX) qspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h +qspr_thread_U.$(SUFFIX) qspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) -qspr_thread_L.$(SUFFIX) qspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h +qspr_thread_L.$(SUFFIX) qspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) -cspr_thread_U.$(SUFFIX) cspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h +cspr_thread_U.$(SUFFIX) cspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) -cspr_thread_L.$(SUFFIX) cspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h +cspr_thread_L.$(SUFFIX) cspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) -zspr_thread_U.$(SUFFIX) zspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h +zspr_thread_U.$(SUFFIX) zspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) -zspr_thread_L.$(SUFFIX) zspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h +zspr_thread_L.$(SUFFIX) zspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) -xspr_thread_U.$(SUFFIX) xspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h +xspr_thread_U.$(SUFFIX) xspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) -xspr_thread_L.$(SUFFIX) xspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h +xspr_thread_L.$(SUFFIX) xspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) -sspr2_U.$(SUFFIX) sspr2_U.$(PSUFFIX) : spr2_k.c ../../param.h +sspr2_U.$(SUFFIX) sspr2_U.$(PSUFFIX) : spr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) -sspr2_L.$(SUFFIX) sspr2_L.$(PSUFFIX) : spr2_k.c ../../param.h +sspr2_L.$(SUFFIX) sspr2_L.$(PSUFFIX) : spr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) -dspr2_U.$(SUFFIX) dspr2_U.$(PSUFFIX) : spr2_k.c ../../param.h +dspr2_U.$(SUFFIX) dspr2_U.$(PSUFFIX) : spr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) -dspr2_L.$(SUFFIX) dspr2_L.$(PSUFFIX) : spr2_k.c ../../param.h +dspr2_L.$(SUFFIX) dspr2_L.$(PSUFFIX) : spr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) -qspr2_U.$(SUFFIX) qspr2_U.$(PSUFFIX) : spr2_k.c ../../param.h +qspr2_U.$(SUFFIX) qspr2_U.$(PSUFFIX) : spr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) -qspr2_L.$(SUFFIX) qspr2_L.$(PSUFFIX) : spr2_k.c ../../param.h +qspr2_L.$(SUFFIX) qspr2_L.$(PSUFFIX) : spr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) -cspr2_U.$(SUFFIX) cspr2_U.$(PSUFFIX) : zspr2_k.c ../../param.h +cspr2_U.$(SUFFIX) cspr2_U.$(PSUFFIX) : zspr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) -cspr2_L.$(SUFFIX) cspr2_L.$(PSUFFIX) : zspr2_k.c ../../param.h +cspr2_L.$(SUFFIX) cspr2_L.$(PSUFFIX) : zspr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) -zspr2_U.$(SUFFIX) zspr2_U.$(PSUFFIX) : zspr2_k.c ../../param.h +zspr2_U.$(SUFFIX) zspr2_U.$(PSUFFIX) : zspr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) -zspr2_L.$(SUFFIX) zspr2_L.$(PSUFFIX) : zspr2_k.c ../../param.h +zspr2_L.$(SUFFIX) zspr2_L.$(PSUFFIX) : zspr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) -xspr2_U.$(SUFFIX) xspr2_U.$(PSUFFIX) : zspr2_k.c ../../param.h +xspr2_U.$(SUFFIX) xspr2_U.$(PSUFFIX) : zspr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) -xspr2_L.$(SUFFIX) xspr2_L.$(PSUFFIX) : zspr2_k.c ../../param.h +xspr2_L.$(SUFFIX) xspr2_L.$(PSUFFIX) : zspr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) -sspr2_thread_U.$(SUFFIX) sspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h +sspr2_thread_U.$(SUFFIX) sspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) -sspr2_thread_L.$(SUFFIX) sspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h +sspr2_thread_L.$(SUFFIX) sspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) -dspr2_thread_U.$(SUFFIX) dspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h +dspr2_thread_U.$(SUFFIX) dspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) -dspr2_thread_L.$(SUFFIX) dspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h +dspr2_thread_L.$(SUFFIX) dspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) -qspr2_thread_U.$(SUFFIX) qspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h +qspr2_thread_U.$(SUFFIX) qspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) -qspr2_thread_L.$(SUFFIX) qspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h +qspr2_thread_L.$(SUFFIX) qspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) -cspr2_thread_U.$(SUFFIX) cspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h +cspr2_thread_U.$(SUFFIX) cspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) -cspr2_thread_L.$(SUFFIX) cspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h +cspr2_thread_L.$(SUFFIX) cspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) -zspr2_thread_U.$(SUFFIX) zspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h +zspr2_thread_U.$(SUFFIX) zspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) -zspr2_thread_L.$(SUFFIX) zspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h +zspr2_thread_L.$(SUFFIX) zspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) -xspr2_thread_U.$(SUFFIX) xspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h +xspr2_thread_U.$(SUFFIX) xspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) -xspr2_thread_L.$(SUFFIX) xspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h +xspr2_thread_L.$(SUFFIX) xspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) -ssyr_U.$(SUFFIX) ssyr_U.$(PSUFFIX) : syr_k.c ../../param.h +ssyr_U.$(SUFFIX) ssyr_U.$(PSUFFIX) : syr_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) -ssyr_L.$(SUFFIX) ssyr_L.$(PSUFFIX) : syr_k.c ../../param.h +ssyr_L.$(SUFFIX) ssyr_L.$(PSUFFIX) : syr_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) -dsyr_U.$(SUFFIX) dsyr_U.$(PSUFFIX) : syr_k.c ../../param.h +dsyr_U.$(SUFFIX) dsyr_U.$(PSUFFIX) : syr_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) -dsyr_L.$(SUFFIX) dsyr_L.$(PSUFFIX) : syr_k.c ../../param.h +dsyr_L.$(SUFFIX) dsyr_L.$(PSUFFIX) : syr_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) -qsyr_U.$(SUFFIX) qsyr_U.$(PSUFFIX) : syr_k.c ../../param.h +qsyr_U.$(SUFFIX) qsyr_U.$(PSUFFIX) : syr_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) -qsyr_L.$(SUFFIX) qsyr_L.$(PSUFFIX) : syr_k.c ../../param.h +qsyr_L.$(SUFFIX) qsyr_L.$(PSUFFIX) : syr_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) -csyr_U.$(SUFFIX) csyr_U.$(PSUFFIX) : zsyr_k.c ../../param.h +csyr_U.$(SUFFIX) csyr_U.$(PSUFFIX) : zsyr_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) -csyr_L.$(SUFFIX) csyr_L.$(PSUFFIX) : zsyr_k.c ../../param.h +csyr_L.$(SUFFIX) csyr_L.$(PSUFFIX) : zsyr_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) -zsyr_U.$(SUFFIX) zsyr_U.$(PSUFFIX) : zsyr_k.c ../../param.h +zsyr_U.$(SUFFIX) zsyr_U.$(PSUFFIX) : zsyr_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) -zsyr_L.$(SUFFIX) zsyr_L.$(PSUFFIX) : zsyr_k.c ../../param.h +zsyr_L.$(SUFFIX) zsyr_L.$(PSUFFIX) : zsyr_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) -xsyr_U.$(SUFFIX) xsyr_U.$(PSUFFIX) : zsyr_k.c ../../param.h +xsyr_U.$(SUFFIX) xsyr_U.$(PSUFFIX) : zsyr_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) -xsyr_L.$(SUFFIX) xsyr_L.$(PSUFFIX) : zsyr_k.c ../../param.h +xsyr_L.$(SUFFIX) xsyr_L.$(PSUFFIX) : zsyr_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) -ssyr2_U.$(SUFFIX) ssyr2_U.$(PSUFFIX) : syr2_k.c ../../param.h +ssyr2_U.$(SUFFIX) ssyr2_U.$(PSUFFIX) : syr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) -ssyr2_L.$(SUFFIX) ssyr2_L.$(PSUFFIX) : syr2_k.c ../../param.h +ssyr2_L.$(SUFFIX) ssyr2_L.$(PSUFFIX) : syr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) -dsyr2_U.$(SUFFIX) dsyr2_U.$(PSUFFIX) : syr2_k.c ../../param.h +dsyr2_U.$(SUFFIX) dsyr2_U.$(PSUFFIX) : syr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) -dsyr2_L.$(SUFFIX) dsyr2_L.$(PSUFFIX) : syr2_k.c ../../param.h +dsyr2_L.$(SUFFIX) dsyr2_L.$(PSUFFIX) : syr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) -qsyr2_U.$(SUFFIX) qsyr2_U.$(PSUFFIX) : syr2_k.c ../../param.h +qsyr2_U.$(SUFFIX) qsyr2_U.$(PSUFFIX) : syr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) -qsyr2_L.$(SUFFIX) qsyr2_L.$(PSUFFIX) : syr2_k.c ../../param.h +qsyr2_L.$(SUFFIX) qsyr2_L.$(PSUFFIX) : syr2_k.c ../../param.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) -csyr2_U.$(SUFFIX) csyr2_U.$(PSUFFIX) : zsyr2_k.c ../../param.h +csyr2_U.$(SUFFIX) csyr2_U.$(PSUFFIX) : zsyr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) -csyr2_L.$(SUFFIX) csyr2_L.$(PSUFFIX) : zsyr2_k.c ../../param.h +csyr2_L.$(SUFFIX) csyr2_L.$(PSUFFIX) : zsyr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) -zsyr2_U.$(SUFFIX) zsyr2_U.$(PSUFFIX) : zsyr2_k.c ../../param.h +zsyr2_U.$(SUFFIX) zsyr2_U.$(PSUFFIX) : zsyr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) -zsyr2_L.$(SUFFIX) zsyr2_L.$(PSUFFIX) : zsyr2_k.c ../../param.h +zsyr2_L.$(SUFFIX) zsyr2_L.$(PSUFFIX) : zsyr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) -xsyr2_U.$(SUFFIX) xsyr2_U.$(PSUFFIX) : zsyr2_k.c ../../param.h +xsyr2_U.$(SUFFIX) xsyr2_U.$(PSUFFIX) : zsyr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) -xsyr2_L.$(SUFFIX) xsyr2_L.$(PSUFFIX) : zsyr2_k.c ../../param.h +xsyr2_L.$(SUFFIX) xsyr2_L.$(PSUFFIX) : zsyr2_k.c ../../param.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) -stbmv_NUU.$(SUFFIX) stbmv_NUU.$(PSUFFIX) : tbmv_U.c ../../common.h +stbmv_NUU.$(SUFFIX) stbmv_NUU.$(PSUFFIX) : tbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) -stbmv_NUN.$(SUFFIX) stbmv_NUN.$(PSUFFIX) : tbmv_U.c ../../common.h +stbmv_NUN.$(SUFFIX) stbmv_NUN.$(PSUFFIX) : tbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) -stbmv_TLU.$(SUFFIX) stbmv_TLU.$(PSUFFIX) : tbmv_U.c ../../common.h +stbmv_TLU.$(SUFFIX) stbmv_TLU.$(PSUFFIX) : tbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) -stbmv_TLN.$(SUFFIX) stbmv_TLN.$(PSUFFIX) : tbmv_U.c ../../common.h +stbmv_TLN.$(SUFFIX) stbmv_TLN.$(PSUFFIX) : tbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) -stbmv_NLU.$(SUFFIX) stbmv_NLU.$(PSUFFIX) : tbmv_L.c ../../common.h +stbmv_NLU.$(SUFFIX) stbmv_NLU.$(PSUFFIX) : tbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) -stbmv_NLN.$(SUFFIX) stbmv_NLN.$(PSUFFIX) : tbmv_L.c ../../common.h +stbmv_NLN.$(SUFFIX) stbmv_NLN.$(PSUFFIX) : tbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) -stbmv_TUU.$(SUFFIX) stbmv_TUU.$(PSUFFIX) : tbmv_L.c ../../common.h +stbmv_TUU.$(SUFFIX) stbmv_TUU.$(PSUFFIX) : tbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) -stbmv_TUN.$(SUFFIX) stbmv_TUN.$(PSUFFIX) : tbmv_L.c ../../common.h +stbmv_TUN.$(SUFFIX) stbmv_TUN.$(PSUFFIX) : tbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) -dtbmv_NUU.$(SUFFIX) dtbmv_NUU.$(PSUFFIX) : tbmv_U.c ../../common.h +dtbmv_NUU.$(SUFFIX) dtbmv_NUU.$(PSUFFIX) : tbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) -dtbmv_NUN.$(SUFFIX) dtbmv_NUN.$(PSUFFIX) : tbmv_U.c ../../common.h +dtbmv_NUN.$(SUFFIX) dtbmv_NUN.$(PSUFFIX) : tbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) -dtbmv_TLU.$(SUFFIX) dtbmv_TLU.$(PSUFFIX) : tbmv_U.c ../../common.h +dtbmv_TLU.$(SUFFIX) dtbmv_TLU.$(PSUFFIX) : tbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) -dtbmv_TLN.$(SUFFIX) dtbmv_TLN.$(PSUFFIX) : tbmv_U.c ../../common.h +dtbmv_TLN.$(SUFFIX) dtbmv_TLN.$(PSUFFIX) : tbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) -dtbmv_NLU.$(SUFFIX) dtbmv_NLU.$(PSUFFIX) : tbmv_L.c ../../common.h +dtbmv_NLU.$(SUFFIX) dtbmv_NLU.$(PSUFFIX) : tbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) -dtbmv_NLN.$(SUFFIX) dtbmv_NLN.$(PSUFFIX) : tbmv_L.c ../../common.h +dtbmv_NLN.$(SUFFIX) dtbmv_NLN.$(PSUFFIX) : tbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) -dtbmv_TUU.$(SUFFIX) dtbmv_TUU.$(PSUFFIX) : tbmv_L.c ../../common.h +dtbmv_TUU.$(SUFFIX) dtbmv_TUU.$(PSUFFIX) : tbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) -dtbmv_TUN.$(SUFFIX) dtbmv_TUN.$(PSUFFIX) : tbmv_L.c ../../common.h +dtbmv_TUN.$(SUFFIX) dtbmv_TUN.$(PSUFFIX) : tbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) -qtbmv_NUU.$(SUFFIX) qtbmv_NUU.$(PSUFFIX) : tbmv_U.c ../../common.h +qtbmv_NUU.$(SUFFIX) qtbmv_NUU.$(PSUFFIX) : tbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) -qtbmv_NUN.$(SUFFIX) qtbmv_NUN.$(PSUFFIX) : tbmv_U.c ../../common.h +qtbmv_NUN.$(SUFFIX) qtbmv_NUN.$(PSUFFIX) : tbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) -qtbmv_TLU.$(SUFFIX) qtbmv_TLU.$(PSUFFIX) : tbmv_U.c ../../common.h +qtbmv_TLU.$(SUFFIX) qtbmv_TLU.$(PSUFFIX) : tbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) -qtbmv_TLN.$(SUFFIX) qtbmv_TLN.$(PSUFFIX) : tbmv_U.c ../../common.h +qtbmv_TLN.$(SUFFIX) qtbmv_TLN.$(PSUFFIX) : tbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) -qtbmv_NLU.$(SUFFIX) qtbmv_NLU.$(PSUFFIX) : tbmv_L.c ../../common.h +qtbmv_NLU.$(SUFFIX) qtbmv_NLU.$(PSUFFIX) : tbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) -qtbmv_NLN.$(SUFFIX) qtbmv_NLN.$(PSUFFIX) : tbmv_L.c ../../common.h +qtbmv_NLN.$(SUFFIX) qtbmv_NLN.$(PSUFFIX) : tbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) -qtbmv_TUU.$(SUFFIX) qtbmv_TUU.$(PSUFFIX) : tbmv_L.c ../../common.h +qtbmv_TUU.$(SUFFIX) qtbmv_TUU.$(PSUFFIX) : tbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) -qtbmv_TUN.$(SUFFIX) qtbmv_TUN.$(PSUFFIX) : tbmv_L.c ../../common.h +qtbmv_TUN.$(SUFFIX) qtbmv_TUN.$(PSUFFIX) : tbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) -ctbmv_NUU.$(SUFFIX) ctbmv_NUU.$(PSUFFIX) : ztbmv_U.c ../../common.h +ctbmv_NUU.$(SUFFIX) ctbmv_NUU.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) -ctbmv_NUN.$(SUFFIX) ctbmv_NUN.$(PSUFFIX) : ztbmv_U.c ../../common.h +ctbmv_NUN.$(SUFFIX) ctbmv_NUN.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) -ctbmv_TLU.$(SUFFIX) ctbmv_TLU.$(PSUFFIX) : ztbmv_U.c ../../common.h +ctbmv_TLU.$(SUFFIX) ctbmv_TLU.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) -ctbmv_TLN.$(SUFFIX) ctbmv_TLN.$(PSUFFIX) : ztbmv_U.c ../../common.h +ctbmv_TLN.$(SUFFIX) ctbmv_TLN.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) -ctbmv_RLU.$(SUFFIX) ctbmv_RLU.$(PSUFFIX) : ztbmv_L.c ../../common.h +ctbmv_RLU.$(SUFFIX) ctbmv_RLU.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) -ctbmv_RLN.$(SUFFIX) ctbmv_RLN.$(PSUFFIX) : ztbmv_L.c ../../common.h +ctbmv_RLN.$(SUFFIX) ctbmv_RLN.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) -ctbmv_CLU.$(SUFFIX) ctbmv_CLU.$(PSUFFIX) : ztbmv_U.c ../../common.h +ctbmv_CLU.$(SUFFIX) ctbmv_CLU.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) -ctbmv_CLN.$(SUFFIX) ctbmv_CLN.$(PSUFFIX) : ztbmv_U.c ../../common.h +ctbmv_CLN.$(SUFFIX) ctbmv_CLN.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) -ctbmv_NLU.$(SUFFIX) ctbmv_NLU.$(PSUFFIX) : ztbmv_L.c ../../common.h +ctbmv_NLU.$(SUFFIX) ctbmv_NLU.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) -ctbmv_NLN.$(SUFFIX) ctbmv_NLN.$(PSUFFIX) : ztbmv_L.c ../../common.h +ctbmv_NLN.$(SUFFIX) ctbmv_NLN.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) -ctbmv_TUU.$(SUFFIX) ctbmv_TUU.$(PSUFFIX) : ztbmv_L.c ../../common.h +ctbmv_TUU.$(SUFFIX) ctbmv_TUU.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) -ctbmv_TUN.$(SUFFIX) ctbmv_TUN.$(PSUFFIX) : ztbmv_L.c ../../common.h +ctbmv_TUN.$(SUFFIX) ctbmv_TUN.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) -ctbmv_RUU.$(SUFFIX) ctbmv_RUU.$(PSUFFIX) : ztbmv_U.c ../../common.h +ctbmv_RUU.$(SUFFIX) ctbmv_RUU.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) -ctbmv_RUN.$(SUFFIX) ctbmv_RUN.$(PSUFFIX) : ztbmv_U.c ../../common.h +ctbmv_RUN.$(SUFFIX) ctbmv_RUN.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) -ctbmv_CUU.$(SUFFIX) ctbmv_CUU.$(PSUFFIX) : ztbmv_L.c ../../common.h +ctbmv_CUU.$(SUFFIX) ctbmv_CUU.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) -ctbmv_CUN.$(SUFFIX) ctbmv_CUN.$(PSUFFIX) : ztbmv_L.c ../../common.h +ctbmv_CUN.$(SUFFIX) ctbmv_CUN.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) -ztbmv_NUU.$(SUFFIX) ztbmv_NUU.$(PSUFFIX) : ztbmv_U.c ../../common.h +ztbmv_NUU.$(SUFFIX) ztbmv_NUU.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) -ztbmv_NUN.$(SUFFIX) ztbmv_NUN.$(PSUFFIX) : ztbmv_U.c ../../common.h +ztbmv_NUN.$(SUFFIX) ztbmv_NUN.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) -ztbmv_TLU.$(SUFFIX) ztbmv_TLU.$(PSUFFIX) : ztbmv_U.c ../../common.h +ztbmv_TLU.$(SUFFIX) ztbmv_TLU.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) -ztbmv_TLN.$(SUFFIX) ztbmv_TLN.$(PSUFFIX) : ztbmv_U.c ../../common.h +ztbmv_TLN.$(SUFFIX) ztbmv_TLN.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) -ztbmv_RLU.$(SUFFIX) ztbmv_RLU.$(PSUFFIX) : ztbmv_L.c ../../common.h +ztbmv_RLU.$(SUFFIX) ztbmv_RLU.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) -ztbmv_RLN.$(SUFFIX) ztbmv_RLN.$(PSUFFIX) : ztbmv_L.c ../../common.h +ztbmv_RLN.$(SUFFIX) ztbmv_RLN.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) -ztbmv_CLU.$(SUFFIX) ztbmv_CLU.$(PSUFFIX) : ztbmv_U.c ../../common.h +ztbmv_CLU.$(SUFFIX) ztbmv_CLU.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) -ztbmv_CLN.$(SUFFIX) ztbmv_CLN.$(PSUFFIX) : ztbmv_U.c ../../common.h +ztbmv_CLN.$(SUFFIX) ztbmv_CLN.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) -ztbmv_NLU.$(SUFFIX) ztbmv_NLU.$(PSUFFIX) : ztbmv_L.c ../../common.h +ztbmv_NLU.$(SUFFIX) ztbmv_NLU.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) -ztbmv_NLN.$(SUFFIX) ztbmv_NLN.$(PSUFFIX) : ztbmv_L.c ../../common.h +ztbmv_NLN.$(SUFFIX) ztbmv_NLN.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) -ztbmv_TUU.$(SUFFIX) ztbmv_TUU.$(PSUFFIX) : ztbmv_L.c ../../common.h +ztbmv_TUU.$(SUFFIX) ztbmv_TUU.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) -ztbmv_TUN.$(SUFFIX) ztbmv_TUN.$(PSUFFIX) : ztbmv_L.c ../../common.h +ztbmv_TUN.$(SUFFIX) ztbmv_TUN.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) -ztbmv_RUU.$(SUFFIX) ztbmv_RUU.$(PSUFFIX) : ztbmv_U.c ../../common.h +ztbmv_RUU.$(SUFFIX) ztbmv_RUU.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) -ztbmv_RUN.$(SUFFIX) ztbmv_RUN.$(PSUFFIX) : ztbmv_U.c ../../common.h +ztbmv_RUN.$(SUFFIX) ztbmv_RUN.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) -ztbmv_CUU.$(SUFFIX) ztbmv_CUU.$(PSUFFIX) : ztbmv_L.c ../../common.h +ztbmv_CUU.$(SUFFIX) ztbmv_CUU.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) -ztbmv_CUN.$(SUFFIX) ztbmv_CUN.$(PSUFFIX) : ztbmv_L.c ../../common.h +ztbmv_CUN.$(SUFFIX) ztbmv_CUN.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) -xtbmv_NUU.$(SUFFIX) xtbmv_NUU.$(PSUFFIX) : ztbmv_U.c ../../common.h +xtbmv_NUU.$(SUFFIX) xtbmv_NUU.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) -xtbmv_NUN.$(SUFFIX) xtbmv_NUN.$(PSUFFIX) : ztbmv_U.c ../../common.h +xtbmv_NUN.$(SUFFIX) xtbmv_NUN.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) -xtbmv_TLU.$(SUFFIX) xtbmv_TLU.$(PSUFFIX) : ztbmv_U.c ../../common.h +xtbmv_TLU.$(SUFFIX) xtbmv_TLU.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) -xtbmv_TLN.$(SUFFIX) xtbmv_TLN.$(PSUFFIX) : ztbmv_U.c ../../common.h +xtbmv_TLN.$(SUFFIX) xtbmv_TLN.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) -xtbmv_RLU.$(SUFFIX) xtbmv_RLU.$(PSUFFIX) : ztbmv_L.c ../../common.h +xtbmv_RLU.$(SUFFIX) xtbmv_RLU.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) -xtbmv_RLN.$(SUFFIX) xtbmv_RLN.$(PSUFFIX) : ztbmv_L.c ../../common.h +xtbmv_RLN.$(SUFFIX) xtbmv_RLN.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) -xtbmv_CLU.$(SUFFIX) xtbmv_CLU.$(PSUFFIX) : ztbmv_U.c ../../common.h +xtbmv_CLU.$(SUFFIX) xtbmv_CLU.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) -xtbmv_CLN.$(SUFFIX) xtbmv_CLN.$(PSUFFIX) : ztbmv_U.c ../../common.h +xtbmv_CLN.$(SUFFIX) xtbmv_CLN.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) -xtbmv_NLU.$(SUFFIX) xtbmv_NLU.$(PSUFFIX) : ztbmv_L.c ../../common.h +xtbmv_NLU.$(SUFFIX) xtbmv_NLU.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) -xtbmv_NLN.$(SUFFIX) xtbmv_NLN.$(PSUFFIX) : ztbmv_L.c ../../common.h +xtbmv_NLN.$(SUFFIX) xtbmv_NLN.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) -xtbmv_TUU.$(SUFFIX) xtbmv_TUU.$(PSUFFIX) : ztbmv_L.c ../../common.h +xtbmv_TUU.$(SUFFIX) xtbmv_TUU.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) -xtbmv_TUN.$(SUFFIX) xtbmv_TUN.$(PSUFFIX) : ztbmv_L.c ../../common.h +xtbmv_TUN.$(SUFFIX) xtbmv_TUN.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) -xtbmv_RUU.$(SUFFIX) xtbmv_RUU.$(PSUFFIX) : ztbmv_U.c ../../common.h +xtbmv_RUU.$(SUFFIX) xtbmv_RUU.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) -xtbmv_RUN.$(SUFFIX) xtbmv_RUN.$(PSUFFIX) : ztbmv_U.c ../../common.h +xtbmv_RUN.$(SUFFIX) xtbmv_RUN.$(PSUFFIX) : ztbmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) -xtbmv_CUU.$(SUFFIX) xtbmv_CUU.$(PSUFFIX) : ztbmv_L.c ../../common.h +xtbmv_CUU.$(SUFFIX) xtbmv_CUU.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) -xtbmv_CUN.$(SUFFIX) xtbmv_CUN.$(PSUFFIX) : ztbmv_L.c ../../common.h +xtbmv_CUN.$(SUFFIX) xtbmv_CUN.$(PSUFFIX) : ztbmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) -stbmv_thread_NUU.$(SUFFIX) stbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h +stbmv_thread_NUU.$(SUFFIX) stbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) -stbmv_thread_NUN.$(SUFFIX) stbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h +stbmv_thread_NUN.$(SUFFIX) stbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) -stbmv_thread_TLU.$(SUFFIX) stbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h +stbmv_thread_TLU.$(SUFFIX) stbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) -stbmv_thread_TLN.$(SUFFIX) stbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h +stbmv_thread_TLN.$(SUFFIX) stbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) -stbmv_thread_NLU.$(SUFFIX) stbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h +stbmv_thread_NLU.$(SUFFIX) stbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) -stbmv_thread_NLN.$(SUFFIX) stbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h +stbmv_thread_NLN.$(SUFFIX) stbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) -stbmv_thread_TUU.$(SUFFIX) stbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h +stbmv_thread_TUU.$(SUFFIX) stbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) -stbmv_thread_TUN.$(SUFFIX) stbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h +stbmv_thread_TUN.$(SUFFIX) stbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) -dtbmv_thread_NUU.$(SUFFIX) dtbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h +dtbmv_thread_NUU.$(SUFFIX) dtbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) -dtbmv_thread_NUN.$(SUFFIX) dtbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h +dtbmv_thread_NUN.$(SUFFIX) dtbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) -dtbmv_thread_TLU.$(SUFFIX) dtbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h +dtbmv_thread_TLU.$(SUFFIX) dtbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) -dtbmv_thread_TLN.$(SUFFIX) dtbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h +dtbmv_thread_TLN.$(SUFFIX) dtbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) -dtbmv_thread_NLU.$(SUFFIX) dtbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h +dtbmv_thread_NLU.$(SUFFIX) dtbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) -dtbmv_thread_NLN.$(SUFFIX) dtbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h +dtbmv_thread_NLN.$(SUFFIX) dtbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) -dtbmv_thread_TUU.$(SUFFIX) dtbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h +dtbmv_thread_TUU.$(SUFFIX) dtbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) -dtbmv_thread_TUN.$(SUFFIX) dtbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h +dtbmv_thread_TUN.$(SUFFIX) dtbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) -qtbmv_thread_NUU.$(SUFFIX) qtbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h +qtbmv_thread_NUU.$(SUFFIX) qtbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) -qtbmv_thread_NUN.$(SUFFIX) qtbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h +qtbmv_thread_NUN.$(SUFFIX) qtbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) -qtbmv_thread_TLU.$(SUFFIX) qtbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h +qtbmv_thread_TLU.$(SUFFIX) qtbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) -qtbmv_thread_TLN.$(SUFFIX) qtbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h +qtbmv_thread_TLN.$(SUFFIX) qtbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) -qtbmv_thread_NLU.$(SUFFIX) qtbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h +qtbmv_thread_NLU.$(SUFFIX) qtbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) -qtbmv_thread_NLN.$(SUFFIX) qtbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h +qtbmv_thread_NLN.$(SUFFIX) qtbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) -qtbmv_thread_TUU.$(SUFFIX) qtbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h +qtbmv_thread_TUU.$(SUFFIX) qtbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) -qtbmv_thread_TUN.$(SUFFIX) qtbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h +qtbmv_thread_TUN.$(SUFFIX) qtbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) -ctbmv_thread_NUU.$(SUFFIX) ctbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h +ctbmv_thread_NUU.$(SUFFIX) ctbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) -ctbmv_thread_NUN.$(SUFFIX) ctbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h +ctbmv_thread_NUN.$(SUFFIX) ctbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) -ctbmv_thread_TLU.$(SUFFIX) ctbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h +ctbmv_thread_TLU.$(SUFFIX) ctbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) -ctbmv_thread_TLN.$(SUFFIX) ctbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h +ctbmv_thread_TLN.$(SUFFIX) ctbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) -ctbmv_thread_RLU.$(SUFFIX) ctbmv_thread_RLU.$(PSUFFIX) : tbmv_thread.c ../../common.h +ctbmv_thread_RLU.$(SUFFIX) ctbmv_thread_RLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) -ctbmv_thread_RLN.$(SUFFIX) ctbmv_thread_RLN.$(PSUFFIX) : tbmv_thread.c ../../common.h +ctbmv_thread_RLN.$(SUFFIX) ctbmv_thread_RLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) -ctbmv_thread_CLU.$(SUFFIX) ctbmv_thread_CLU.$(PSUFFIX) : tbmv_thread.c ../../common.h +ctbmv_thread_CLU.$(SUFFIX) ctbmv_thread_CLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) -ctbmv_thread_CLN.$(SUFFIX) ctbmv_thread_CLN.$(PSUFFIX) : tbmv_thread.c ../../common.h +ctbmv_thread_CLN.$(SUFFIX) ctbmv_thread_CLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) -ctbmv_thread_NLU.$(SUFFIX) ctbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h +ctbmv_thread_NLU.$(SUFFIX) ctbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) -ctbmv_thread_NLN.$(SUFFIX) ctbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h +ctbmv_thread_NLN.$(SUFFIX) ctbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) -ctbmv_thread_TUU.$(SUFFIX) ctbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h +ctbmv_thread_TUU.$(SUFFIX) ctbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) -ctbmv_thread_TUN.$(SUFFIX) ctbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h +ctbmv_thread_TUN.$(SUFFIX) ctbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) -ctbmv_thread_RUU.$(SUFFIX) ctbmv_thread_RUU.$(PSUFFIX) : tbmv_thread.c ../../common.h +ctbmv_thread_RUU.$(SUFFIX) ctbmv_thread_RUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) -ctbmv_thread_RUN.$(SUFFIX) ctbmv_thread_RUN.$(PSUFFIX) : tbmv_thread.c ../../common.h +ctbmv_thread_RUN.$(SUFFIX) ctbmv_thread_RUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) -ctbmv_thread_CUU.$(SUFFIX) ctbmv_thread_CUU.$(PSUFFIX) : tbmv_thread.c ../../common.h +ctbmv_thread_CUU.$(SUFFIX) ctbmv_thread_CUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) -ctbmv_thread_CUN.$(SUFFIX) ctbmv_thread_CUN.$(PSUFFIX) : tbmv_thread.c ../../common.h +ctbmv_thread_CUN.$(SUFFIX) ctbmv_thread_CUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) -ztbmv_thread_NUU.$(SUFFIX) ztbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h +ztbmv_thread_NUU.$(SUFFIX) ztbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) -ztbmv_thread_NUN.$(SUFFIX) ztbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h +ztbmv_thread_NUN.$(SUFFIX) ztbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) -ztbmv_thread_TLU.$(SUFFIX) ztbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h +ztbmv_thread_TLU.$(SUFFIX) ztbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) -ztbmv_thread_TLN.$(SUFFIX) ztbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h +ztbmv_thread_TLN.$(SUFFIX) ztbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) -ztbmv_thread_RLU.$(SUFFIX) ztbmv_thread_RLU.$(PSUFFIX) : tbmv_thread.c ../../common.h +ztbmv_thread_RLU.$(SUFFIX) ztbmv_thread_RLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) -ztbmv_thread_RLN.$(SUFFIX) ztbmv_thread_RLN.$(PSUFFIX) : tbmv_thread.c ../../common.h +ztbmv_thread_RLN.$(SUFFIX) ztbmv_thread_RLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) -ztbmv_thread_CLU.$(SUFFIX) ztbmv_thread_CLU.$(PSUFFIX) : tbmv_thread.c ../../common.h +ztbmv_thread_CLU.$(SUFFIX) ztbmv_thread_CLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) -ztbmv_thread_CLN.$(SUFFIX) ztbmv_thread_CLN.$(PSUFFIX) : tbmv_thread.c ../../common.h +ztbmv_thread_CLN.$(SUFFIX) ztbmv_thread_CLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) -ztbmv_thread_NLU.$(SUFFIX) ztbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h +ztbmv_thread_NLU.$(SUFFIX) ztbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) -ztbmv_thread_NLN.$(SUFFIX) ztbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h +ztbmv_thread_NLN.$(SUFFIX) ztbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) -ztbmv_thread_TUU.$(SUFFIX) ztbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h +ztbmv_thread_TUU.$(SUFFIX) ztbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) -ztbmv_thread_TUN.$(SUFFIX) ztbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h +ztbmv_thread_TUN.$(SUFFIX) ztbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) -ztbmv_thread_RUU.$(SUFFIX) ztbmv_thread_RUU.$(PSUFFIX) : tbmv_thread.c ../../common.h +ztbmv_thread_RUU.$(SUFFIX) ztbmv_thread_RUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) -ztbmv_thread_RUN.$(SUFFIX) ztbmv_thread_RUN.$(PSUFFIX) : tbmv_thread.c ../../common.h +ztbmv_thread_RUN.$(SUFFIX) ztbmv_thread_RUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) -ztbmv_thread_CUU.$(SUFFIX) ztbmv_thread_CUU.$(PSUFFIX) : tbmv_thread.c ../../common.h +ztbmv_thread_CUU.$(SUFFIX) ztbmv_thread_CUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) -ztbmv_thread_CUN.$(SUFFIX) ztbmv_thread_CUN.$(PSUFFIX) : tbmv_thread.c ../../common.h +ztbmv_thread_CUN.$(SUFFIX) ztbmv_thread_CUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) -xtbmv_thread_NUU.$(SUFFIX) xtbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h +xtbmv_thread_NUU.$(SUFFIX) xtbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) -xtbmv_thread_NUN.$(SUFFIX) xtbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h +xtbmv_thread_NUN.$(SUFFIX) xtbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) -xtbmv_thread_TLU.$(SUFFIX) xtbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h +xtbmv_thread_TLU.$(SUFFIX) xtbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) -xtbmv_thread_TLN.$(SUFFIX) xtbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h +xtbmv_thread_TLN.$(SUFFIX) xtbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) -xtbmv_thread_RLU.$(SUFFIX) xtbmv_thread_RLU.$(PSUFFIX) : tbmv_thread.c ../../common.h +xtbmv_thread_RLU.$(SUFFIX) xtbmv_thread_RLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) -xtbmv_thread_RLN.$(SUFFIX) xtbmv_thread_RLN.$(PSUFFIX) : tbmv_thread.c ../../common.h +xtbmv_thread_RLN.$(SUFFIX) xtbmv_thread_RLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) -xtbmv_thread_CLU.$(SUFFIX) xtbmv_thread_CLU.$(PSUFFIX) : tbmv_thread.c ../../common.h +xtbmv_thread_CLU.$(SUFFIX) xtbmv_thread_CLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) -xtbmv_thread_CLN.$(SUFFIX) xtbmv_thread_CLN.$(PSUFFIX) : tbmv_thread.c ../../common.h +xtbmv_thread_CLN.$(SUFFIX) xtbmv_thread_CLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) -xtbmv_thread_NLU.$(SUFFIX) xtbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h +xtbmv_thread_NLU.$(SUFFIX) xtbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) -xtbmv_thread_NLN.$(SUFFIX) xtbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h +xtbmv_thread_NLN.$(SUFFIX) xtbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) -xtbmv_thread_TUU.$(SUFFIX) xtbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h +xtbmv_thread_TUU.$(SUFFIX) xtbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) -xtbmv_thread_TUN.$(SUFFIX) xtbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h +xtbmv_thread_TUN.$(SUFFIX) xtbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) -xtbmv_thread_RUU.$(SUFFIX) xtbmv_thread_RUU.$(PSUFFIX) : tbmv_thread.c ../../common.h +xtbmv_thread_RUU.$(SUFFIX) xtbmv_thread_RUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) -xtbmv_thread_RUN.$(SUFFIX) xtbmv_thread_RUN.$(PSUFFIX) : tbmv_thread.c ../../common.h +xtbmv_thread_RUN.$(SUFFIX) xtbmv_thread_RUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) -xtbmv_thread_CUU.$(SUFFIX) xtbmv_thread_CUU.$(PSUFFIX) : tbmv_thread.c ../../common.h +xtbmv_thread_CUU.$(SUFFIX) xtbmv_thread_CUU.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) -xtbmv_thread_CUN.$(SUFFIX) xtbmv_thread_CUN.$(PSUFFIX) : tbmv_thread.c ../../common.h +xtbmv_thread_CUN.$(SUFFIX) xtbmv_thread_CUN.$(PSUFFIX) : tbmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) -stbsv_NUU.$(SUFFIX) stbsv_NUU.$(PSUFFIX) : tbsv_U.c ../../common.h +stbsv_NUU.$(SUFFIX) stbsv_NUU.$(PSUFFIX) : tbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) -stbsv_NUN.$(SUFFIX) stbsv_NUN.$(PSUFFIX) : tbsv_U.c ../../common.h +stbsv_NUN.$(SUFFIX) stbsv_NUN.$(PSUFFIX) : tbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) -stbsv_TLU.$(SUFFIX) stbsv_TLU.$(PSUFFIX) : tbsv_U.c ../../common.h +stbsv_TLU.$(SUFFIX) stbsv_TLU.$(PSUFFIX) : tbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) -stbsv_TLN.$(SUFFIX) stbsv_TLN.$(PSUFFIX) : tbsv_U.c ../../common.h +stbsv_TLN.$(SUFFIX) stbsv_TLN.$(PSUFFIX) : tbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) -stbsv_NLU.$(SUFFIX) stbsv_NLU.$(PSUFFIX) : tbsv_L.c ../../common.h +stbsv_NLU.$(SUFFIX) stbsv_NLU.$(PSUFFIX) : tbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) -stbsv_NLN.$(SUFFIX) stbsv_NLN.$(PSUFFIX) : tbsv_L.c ../../common.h +stbsv_NLN.$(SUFFIX) stbsv_NLN.$(PSUFFIX) : tbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) -stbsv_TUU.$(SUFFIX) stbsv_TUU.$(PSUFFIX) : tbsv_L.c ../../common.h +stbsv_TUU.$(SUFFIX) stbsv_TUU.$(PSUFFIX) : tbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) -stbsv_TUN.$(SUFFIX) stbsv_TUN.$(PSUFFIX) : tbsv_L.c ../../common.h +stbsv_TUN.$(SUFFIX) stbsv_TUN.$(PSUFFIX) : tbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) -dtbsv_NUU.$(SUFFIX) dtbsv_NUU.$(PSUFFIX) : tbsv_U.c ../../common.h +dtbsv_NUU.$(SUFFIX) dtbsv_NUU.$(PSUFFIX) : tbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) -dtbsv_NUN.$(SUFFIX) dtbsv_NUN.$(PSUFFIX) : tbsv_U.c ../../common.h +dtbsv_NUN.$(SUFFIX) dtbsv_NUN.$(PSUFFIX) : tbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) -dtbsv_TLU.$(SUFFIX) dtbsv_TLU.$(PSUFFIX) : tbsv_U.c ../../common.h +dtbsv_TLU.$(SUFFIX) dtbsv_TLU.$(PSUFFIX) : tbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) -dtbsv_TLN.$(SUFFIX) dtbsv_TLN.$(PSUFFIX) : tbsv_U.c ../../common.h +dtbsv_TLN.$(SUFFIX) dtbsv_TLN.$(PSUFFIX) : tbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) -dtbsv_NLU.$(SUFFIX) dtbsv_NLU.$(PSUFFIX) : tbsv_L.c ../../common.h +dtbsv_NLU.$(SUFFIX) dtbsv_NLU.$(PSUFFIX) : tbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) -dtbsv_NLN.$(SUFFIX) dtbsv_NLN.$(PSUFFIX) : tbsv_L.c ../../common.h +dtbsv_NLN.$(SUFFIX) dtbsv_NLN.$(PSUFFIX) : tbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) -dtbsv_TUU.$(SUFFIX) dtbsv_TUU.$(PSUFFIX) : tbsv_L.c ../../common.h +dtbsv_TUU.$(SUFFIX) dtbsv_TUU.$(PSUFFIX) : tbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) -dtbsv_TUN.$(SUFFIX) dtbsv_TUN.$(PSUFFIX) : tbsv_L.c ../../common.h +dtbsv_TUN.$(SUFFIX) dtbsv_TUN.$(PSUFFIX) : tbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) -qtbsv_NUU.$(SUFFIX) qtbsv_NUU.$(PSUFFIX) : tbsv_U.c ../../common.h +qtbsv_NUU.$(SUFFIX) qtbsv_NUU.$(PSUFFIX) : tbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) -qtbsv_NUN.$(SUFFIX) qtbsv_NUN.$(PSUFFIX) : tbsv_U.c ../../common.h +qtbsv_NUN.$(SUFFIX) qtbsv_NUN.$(PSUFFIX) : tbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) -qtbsv_TLU.$(SUFFIX) qtbsv_TLU.$(PSUFFIX) : tbsv_U.c ../../common.h +qtbsv_TLU.$(SUFFIX) qtbsv_TLU.$(PSUFFIX) : tbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) -qtbsv_TLN.$(SUFFIX) qtbsv_TLN.$(PSUFFIX) : tbsv_U.c ../../common.h +qtbsv_TLN.$(SUFFIX) qtbsv_TLN.$(PSUFFIX) : tbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) -qtbsv_NLU.$(SUFFIX) qtbsv_NLU.$(PSUFFIX) : tbsv_L.c ../../common.h +qtbsv_NLU.$(SUFFIX) qtbsv_NLU.$(PSUFFIX) : tbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) -qtbsv_NLN.$(SUFFIX) qtbsv_NLN.$(PSUFFIX) : tbsv_L.c ../../common.h +qtbsv_NLN.$(SUFFIX) qtbsv_NLN.$(PSUFFIX) : tbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) -qtbsv_TUU.$(SUFFIX) qtbsv_TUU.$(PSUFFIX) : tbsv_L.c ../../common.h +qtbsv_TUU.$(SUFFIX) qtbsv_TUU.$(PSUFFIX) : tbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) -qtbsv_TUN.$(SUFFIX) qtbsv_TUN.$(PSUFFIX) : tbsv_L.c ../../common.h +qtbsv_TUN.$(SUFFIX) qtbsv_TUN.$(PSUFFIX) : tbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) -ctbsv_NUU.$(SUFFIX) ctbsv_NUU.$(PSUFFIX) : ztbsv_U.c ../../common.h +ctbsv_NUU.$(SUFFIX) ctbsv_NUU.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) -ctbsv_NUN.$(SUFFIX) ctbsv_NUN.$(PSUFFIX) : ztbsv_U.c ../../common.h +ctbsv_NUN.$(SUFFIX) ctbsv_NUN.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) -ctbsv_TLU.$(SUFFIX) ctbsv_TLU.$(PSUFFIX) : ztbsv_U.c ../../common.h +ctbsv_TLU.$(SUFFIX) ctbsv_TLU.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) -ctbsv_TLN.$(SUFFIX) ctbsv_TLN.$(PSUFFIX) : ztbsv_U.c ../../common.h +ctbsv_TLN.$(SUFFIX) ctbsv_TLN.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) -ctbsv_RLU.$(SUFFIX) ctbsv_RLU.$(PSUFFIX) : ztbsv_L.c ../../common.h +ctbsv_RLU.$(SUFFIX) ctbsv_RLU.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) -ctbsv_RLN.$(SUFFIX) ctbsv_RLN.$(PSUFFIX) : ztbsv_L.c ../../common.h +ctbsv_RLN.$(SUFFIX) ctbsv_RLN.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) -ctbsv_CLU.$(SUFFIX) ctbsv_CLU.$(PSUFFIX) : ztbsv_U.c ../../common.h +ctbsv_CLU.$(SUFFIX) ctbsv_CLU.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) -ctbsv_CLN.$(SUFFIX) ctbsv_CLN.$(PSUFFIX) : ztbsv_U.c ../../common.h +ctbsv_CLN.$(SUFFIX) ctbsv_CLN.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) -ctbsv_NLU.$(SUFFIX) ctbsv_NLU.$(PSUFFIX) : ztbsv_L.c ../../common.h +ctbsv_NLU.$(SUFFIX) ctbsv_NLU.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) -ctbsv_NLN.$(SUFFIX) ctbsv_NLN.$(PSUFFIX) : ztbsv_L.c ../../common.h +ctbsv_NLN.$(SUFFIX) ctbsv_NLN.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) -ctbsv_TUU.$(SUFFIX) ctbsv_TUU.$(PSUFFIX) : ztbsv_L.c ../../common.h +ctbsv_TUU.$(SUFFIX) ctbsv_TUU.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) -ctbsv_TUN.$(SUFFIX) ctbsv_TUN.$(PSUFFIX) : ztbsv_L.c ../../common.h +ctbsv_TUN.$(SUFFIX) ctbsv_TUN.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) -ctbsv_RUU.$(SUFFIX) ctbsv_RUU.$(PSUFFIX) : ztbsv_U.c ../../common.h +ctbsv_RUU.$(SUFFIX) ctbsv_RUU.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) -ctbsv_RUN.$(SUFFIX) ctbsv_RUN.$(PSUFFIX) : ztbsv_U.c ../../common.h +ctbsv_RUN.$(SUFFIX) ctbsv_RUN.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) -ctbsv_CUU.$(SUFFIX) ctbsv_CUU.$(PSUFFIX) : ztbsv_L.c ../../common.h +ctbsv_CUU.$(SUFFIX) ctbsv_CUU.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) -ctbsv_CUN.$(SUFFIX) ctbsv_CUN.$(PSUFFIX) : ztbsv_L.c ../../common.h +ctbsv_CUN.$(SUFFIX) ctbsv_CUN.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) -ztbsv_NUU.$(SUFFIX) ztbsv_NUU.$(PSUFFIX) : ztbsv_U.c ../../common.h +ztbsv_NUU.$(SUFFIX) ztbsv_NUU.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) -ztbsv_NUN.$(SUFFIX) ztbsv_NUN.$(PSUFFIX) : ztbsv_U.c ../../common.h +ztbsv_NUN.$(SUFFIX) ztbsv_NUN.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) -ztbsv_TLU.$(SUFFIX) ztbsv_TLU.$(PSUFFIX) : ztbsv_U.c ../../common.h +ztbsv_TLU.$(SUFFIX) ztbsv_TLU.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) -ztbsv_TLN.$(SUFFIX) ztbsv_TLN.$(PSUFFIX) : ztbsv_U.c ../../common.h +ztbsv_TLN.$(SUFFIX) ztbsv_TLN.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) -ztbsv_RLU.$(SUFFIX) ztbsv_RLU.$(PSUFFIX) : ztbsv_L.c ../../common.h +ztbsv_RLU.$(SUFFIX) ztbsv_RLU.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) -ztbsv_RLN.$(SUFFIX) ztbsv_RLN.$(PSUFFIX) : ztbsv_L.c ../../common.h +ztbsv_RLN.$(SUFFIX) ztbsv_RLN.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) -ztbsv_CLU.$(SUFFIX) ztbsv_CLU.$(PSUFFIX) : ztbsv_U.c ../../common.h +ztbsv_CLU.$(SUFFIX) ztbsv_CLU.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) -ztbsv_CLN.$(SUFFIX) ztbsv_CLN.$(PSUFFIX) : ztbsv_U.c ../../common.h +ztbsv_CLN.$(SUFFIX) ztbsv_CLN.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) -ztbsv_NLU.$(SUFFIX) ztbsv_NLU.$(PSUFFIX) : ztbsv_L.c ../../common.h +ztbsv_NLU.$(SUFFIX) ztbsv_NLU.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) -ztbsv_NLN.$(SUFFIX) ztbsv_NLN.$(PSUFFIX) : ztbsv_L.c ../../common.h +ztbsv_NLN.$(SUFFIX) ztbsv_NLN.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) -ztbsv_TUU.$(SUFFIX) ztbsv_TUU.$(PSUFFIX) : ztbsv_L.c ../../common.h +ztbsv_TUU.$(SUFFIX) ztbsv_TUU.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) -ztbsv_TUN.$(SUFFIX) ztbsv_TUN.$(PSUFFIX) : ztbsv_L.c ../../common.h +ztbsv_TUN.$(SUFFIX) ztbsv_TUN.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) -ztbsv_RUU.$(SUFFIX) ztbsv_RUU.$(PSUFFIX) : ztbsv_U.c ../../common.h +ztbsv_RUU.$(SUFFIX) ztbsv_RUU.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) -ztbsv_RUN.$(SUFFIX) ztbsv_RUN.$(PSUFFIX) : ztbsv_U.c ../../common.h +ztbsv_RUN.$(SUFFIX) ztbsv_RUN.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) -ztbsv_CUU.$(SUFFIX) ztbsv_CUU.$(PSUFFIX) : ztbsv_L.c ../../common.h +ztbsv_CUU.$(SUFFIX) ztbsv_CUU.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) -ztbsv_CUN.$(SUFFIX) ztbsv_CUN.$(PSUFFIX) : ztbsv_L.c ../../common.h +ztbsv_CUN.$(SUFFIX) ztbsv_CUN.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) -xtbsv_NUU.$(SUFFIX) xtbsv_NUU.$(PSUFFIX) : ztbsv_U.c ../../common.h +xtbsv_NUU.$(SUFFIX) xtbsv_NUU.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) -xtbsv_NUN.$(SUFFIX) xtbsv_NUN.$(PSUFFIX) : ztbsv_U.c ../../common.h +xtbsv_NUN.$(SUFFIX) xtbsv_NUN.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) -xtbsv_TLU.$(SUFFIX) xtbsv_TLU.$(PSUFFIX) : ztbsv_U.c ../../common.h +xtbsv_TLU.$(SUFFIX) xtbsv_TLU.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) -xtbsv_TLN.$(SUFFIX) xtbsv_TLN.$(PSUFFIX) : ztbsv_U.c ../../common.h +xtbsv_TLN.$(SUFFIX) xtbsv_TLN.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) -xtbsv_RLU.$(SUFFIX) xtbsv_RLU.$(PSUFFIX) : ztbsv_L.c ../../common.h +xtbsv_RLU.$(SUFFIX) xtbsv_RLU.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) -xtbsv_RLN.$(SUFFIX) xtbsv_RLN.$(PSUFFIX) : ztbsv_L.c ../../common.h +xtbsv_RLN.$(SUFFIX) xtbsv_RLN.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) -xtbsv_CLU.$(SUFFIX) xtbsv_CLU.$(PSUFFIX) : ztbsv_U.c ../../common.h +xtbsv_CLU.$(SUFFIX) xtbsv_CLU.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) -xtbsv_CLN.$(SUFFIX) xtbsv_CLN.$(PSUFFIX) : ztbsv_U.c ../../common.h +xtbsv_CLN.$(SUFFIX) xtbsv_CLN.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) -xtbsv_NLU.$(SUFFIX) xtbsv_NLU.$(PSUFFIX) : ztbsv_L.c ../../common.h +xtbsv_NLU.$(SUFFIX) xtbsv_NLU.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) -xtbsv_NLN.$(SUFFIX) xtbsv_NLN.$(PSUFFIX) : ztbsv_L.c ../../common.h +xtbsv_NLN.$(SUFFIX) xtbsv_NLN.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) -xtbsv_TUU.$(SUFFIX) xtbsv_TUU.$(PSUFFIX) : ztbsv_L.c ../../common.h +xtbsv_TUU.$(SUFFIX) xtbsv_TUU.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) -xtbsv_TUN.$(SUFFIX) xtbsv_TUN.$(PSUFFIX) : ztbsv_L.c ../../common.h +xtbsv_TUN.$(SUFFIX) xtbsv_TUN.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) -xtbsv_RUU.$(SUFFIX) xtbsv_RUU.$(PSUFFIX) : ztbsv_U.c ../../common.h +xtbsv_RUU.$(SUFFIX) xtbsv_RUU.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) -xtbsv_RUN.$(SUFFIX) xtbsv_RUN.$(PSUFFIX) : ztbsv_U.c ../../common.h +xtbsv_RUN.$(SUFFIX) xtbsv_RUN.$(PSUFFIX) : ztbsv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) -xtbsv_CUU.$(SUFFIX) xtbsv_CUU.$(PSUFFIX) : ztbsv_L.c ../../common.h +xtbsv_CUU.$(SUFFIX) xtbsv_CUU.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) -xtbsv_CUN.$(SUFFIX) xtbsv_CUN.$(PSUFFIX) : ztbsv_L.c ../../common.h +xtbsv_CUN.$(SUFFIX) xtbsv_CUN.$(PSUFFIX) : ztbsv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) -stpmv_NUU.$(SUFFIX) stpmv_NUU.$(PSUFFIX) : tpmv_U.c ../../common.h +stpmv_NUU.$(SUFFIX) stpmv_NUU.$(PSUFFIX) : tpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) -stpmv_NUN.$(SUFFIX) stpmv_NUN.$(PSUFFIX) : tpmv_U.c ../../common.h +stpmv_NUN.$(SUFFIX) stpmv_NUN.$(PSUFFIX) : tpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) -stpmv_TLU.$(SUFFIX) stpmv_TLU.$(PSUFFIX) : tpmv_U.c ../../common.h +stpmv_TLU.$(SUFFIX) stpmv_TLU.$(PSUFFIX) : tpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) -stpmv_TLN.$(SUFFIX) stpmv_TLN.$(PSUFFIX) : tpmv_U.c ../../common.h +stpmv_TLN.$(SUFFIX) stpmv_TLN.$(PSUFFIX) : tpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) -stpmv_NLU.$(SUFFIX) stpmv_NLU.$(PSUFFIX) : tpmv_L.c ../../common.h +stpmv_NLU.$(SUFFIX) stpmv_NLU.$(PSUFFIX) : tpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) -stpmv_NLN.$(SUFFIX) stpmv_NLN.$(PSUFFIX) : tpmv_L.c ../../common.h +stpmv_NLN.$(SUFFIX) stpmv_NLN.$(PSUFFIX) : tpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) -stpmv_TUU.$(SUFFIX) stpmv_TUU.$(PSUFFIX) : tpmv_L.c ../../common.h +stpmv_TUU.$(SUFFIX) stpmv_TUU.$(PSUFFIX) : tpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) -stpmv_TUN.$(SUFFIX) stpmv_TUN.$(PSUFFIX) : tpmv_L.c ../../common.h +stpmv_TUN.$(SUFFIX) stpmv_TUN.$(PSUFFIX) : tpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) -dtpmv_NUU.$(SUFFIX) dtpmv_NUU.$(PSUFFIX) : tpmv_U.c ../../common.h +dtpmv_NUU.$(SUFFIX) dtpmv_NUU.$(PSUFFIX) : tpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) -dtpmv_NUN.$(SUFFIX) dtpmv_NUN.$(PSUFFIX) : tpmv_U.c ../../common.h +dtpmv_NUN.$(SUFFIX) dtpmv_NUN.$(PSUFFIX) : tpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) -dtpmv_TLU.$(SUFFIX) dtpmv_TLU.$(PSUFFIX) : tpmv_U.c ../../common.h +dtpmv_TLU.$(SUFFIX) dtpmv_TLU.$(PSUFFIX) : tpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) -dtpmv_TLN.$(SUFFIX) dtpmv_TLN.$(PSUFFIX) : tpmv_U.c ../../common.h +dtpmv_TLN.$(SUFFIX) dtpmv_TLN.$(PSUFFIX) : tpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) -dtpmv_NLU.$(SUFFIX) dtpmv_NLU.$(PSUFFIX) : tpmv_L.c ../../common.h +dtpmv_NLU.$(SUFFIX) dtpmv_NLU.$(PSUFFIX) : tpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) -dtpmv_NLN.$(SUFFIX) dtpmv_NLN.$(PSUFFIX) : tpmv_L.c ../../common.h +dtpmv_NLN.$(SUFFIX) dtpmv_NLN.$(PSUFFIX) : tpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) -dtpmv_TUU.$(SUFFIX) dtpmv_TUU.$(PSUFFIX) : tpmv_L.c ../../common.h +dtpmv_TUU.$(SUFFIX) dtpmv_TUU.$(PSUFFIX) : tpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) -dtpmv_TUN.$(SUFFIX) dtpmv_TUN.$(PSUFFIX) : tpmv_L.c ../../common.h +dtpmv_TUN.$(SUFFIX) dtpmv_TUN.$(PSUFFIX) : tpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) -qtpmv_NUU.$(SUFFIX) qtpmv_NUU.$(PSUFFIX) : tpmv_U.c ../../common.h +qtpmv_NUU.$(SUFFIX) qtpmv_NUU.$(PSUFFIX) : tpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) -qtpmv_NUN.$(SUFFIX) qtpmv_NUN.$(PSUFFIX) : tpmv_U.c ../../common.h +qtpmv_NUN.$(SUFFIX) qtpmv_NUN.$(PSUFFIX) : tpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) -qtpmv_TLU.$(SUFFIX) qtpmv_TLU.$(PSUFFIX) : tpmv_U.c ../../common.h +qtpmv_TLU.$(SUFFIX) qtpmv_TLU.$(PSUFFIX) : tpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) -qtpmv_TLN.$(SUFFIX) qtpmv_TLN.$(PSUFFIX) : tpmv_U.c ../../common.h +qtpmv_TLN.$(SUFFIX) qtpmv_TLN.$(PSUFFIX) : tpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) -qtpmv_NLU.$(SUFFIX) qtpmv_NLU.$(PSUFFIX) : tpmv_L.c ../../common.h +qtpmv_NLU.$(SUFFIX) qtpmv_NLU.$(PSUFFIX) : tpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) -qtpmv_NLN.$(SUFFIX) qtpmv_NLN.$(PSUFFIX) : tpmv_L.c ../../common.h +qtpmv_NLN.$(SUFFIX) qtpmv_NLN.$(PSUFFIX) : tpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) -qtpmv_TUU.$(SUFFIX) qtpmv_TUU.$(PSUFFIX) : tpmv_L.c ../../common.h +qtpmv_TUU.$(SUFFIX) qtpmv_TUU.$(PSUFFIX) : tpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) -qtpmv_TUN.$(SUFFIX) qtpmv_TUN.$(PSUFFIX) : tpmv_L.c ../../common.h +qtpmv_TUN.$(SUFFIX) qtpmv_TUN.$(PSUFFIX) : tpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) -ctpmv_NUU.$(SUFFIX) ctpmv_NUU.$(PSUFFIX) : ztpmv_U.c ../../common.h +ctpmv_NUU.$(SUFFIX) ctpmv_NUU.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) -ctpmv_NUN.$(SUFFIX) ctpmv_NUN.$(PSUFFIX) : ztpmv_U.c ../../common.h +ctpmv_NUN.$(SUFFIX) ctpmv_NUN.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) -ctpmv_TLU.$(SUFFIX) ctpmv_TLU.$(PSUFFIX) : ztpmv_U.c ../../common.h +ctpmv_TLU.$(SUFFIX) ctpmv_TLU.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) -ctpmv_TLN.$(SUFFIX) ctpmv_TLN.$(PSUFFIX) : ztpmv_U.c ../../common.h +ctpmv_TLN.$(SUFFIX) ctpmv_TLN.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) -ctpmv_RLU.$(SUFFIX) ctpmv_RLU.$(PSUFFIX) : ztpmv_L.c ../../common.h +ctpmv_RLU.$(SUFFIX) ctpmv_RLU.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) -ctpmv_RLN.$(SUFFIX) ctpmv_RLN.$(PSUFFIX) : ztpmv_L.c ../../common.h +ctpmv_RLN.$(SUFFIX) ctpmv_RLN.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) -ctpmv_CLU.$(SUFFIX) ctpmv_CLU.$(PSUFFIX) : ztpmv_U.c ../../common.h +ctpmv_CLU.$(SUFFIX) ctpmv_CLU.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) -ctpmv_CLN.$(SUFFIX) ctpmv_CLN.$(PSUFFIX) : ztpmv_U.c ../../common.h +ctpmv_CLN.$(SUFFIX) ctpmv_CLN.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) -ctpmv_NLU.$(SUFFIX) ctpmv_NLU.$(PSUFFIX) : ztpmv_L.c ../../common.h +ctpmv_NLU.$(SUFFIX) ctpmv_NLU.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) -ctpmv_NLN.$(SUFFIX) ctpmv_NLN.$(PSUFFIX) : ztpmv_L.c ../../common.h +ctpmv_NLN.$(SUFFIX) ctpmv_NLN.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) -ctpmv_TUU.$(SUFFIX) ctpmv_TUU.$(PSUFFIX) : ztpmv_L.c ../../common.h +ctpmv_TUU.$(SUFFIX) ctpmv_TUU.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) -ctpmv_TUN.$(SUFFIX) ctpmv_TUN.$(PSUFFIX) : ztpmv_L.c ../../common.h +ctpmv_TUN.$(SUFFIX) ctpmv_TUN.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) -ctpmv_RUU.$(SUFFIX) ctpmv_RUU.$(PSUFFIX) : ztpmv_U.c ../../common.h +ctpmv_RUU.$(SUFFIX) ctpmv_RUU.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) -ctpmv_RUN.$(SUFFIX) ctpmv_RUN.$(PSUFFIX) : ztpmv_U.c ../../common.h +ctpmv_RUN.$(SUFFIX) ctpmv_RUN.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) -ctpmv_CUU.$(SUFFIX) ctpmv_CUU.$(PSUFFIX) : ztpmv_L.c ../../common.h +ctpmv_CUU.$(SUFFIX) ctpmv_CUU.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) -ctpmv_CUN.$(SUFFIX) ctpmv_CUN.$(PSUFFIX) : ztpmv_L.c ../../common.h +ctpmv_CUN.$(SUFFIX) ctpmv_CUN.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) -ztpmv_NUU.$(SUFFIX) ztpmv_NUU.$(PSUFFIX) : ztpmv_U.c ../../common.h +ztpmv_NUU.$(SUFFIX) ztpmv_NUU.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) -ztpmv_NUN.$(SUFFIX) ztpmv_NUN.$(PSUFFIX) : ztpmv_U.c ../../common.h +ztpmv_NUN.$(SUFFIX) ztpmv_NUN.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) -ztpmv_TLU.$(SUFFIX) ztpmv_TLU.$(PSUFFIX) : ztpmv_U.c ../../common.h +ztpmv_TLU.$(SUFFIX) ztpmv_TLU.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) -ztpmv_TLN.$(SUFFIX) ztpmv_TLN.$(PSUFFIX) : ztpmv_U.c ../../common.h +ztpmv_TLN.$(SUFFIX) ztpmv_TLN.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) -ztpmv_RLU.$(SUFFIX) ztpmv_RLU.$(PSUFFIX) : ztpmv_L.c ../../common.h +ztpmv_RLU.$(SUFFIX) ztpmv_RLU.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) -ztpmv_RLN.$(SUFFIX) ztpmv_RLN.$(PSUFFIX) : ztpmv_L.c ../../common.h +ztpmv_RLN.$(SUFFIX) ztpmv_RLN.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) -ztpmv_CLU.$(SUFFIX) ztpmv_CLU.$(PSUFFIX) : ztpmv_U.c ../../common.h +ztpmv_CLU.$(SUFFIX) ztpmv_CLU.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) -ztpmv_CLN.$(SUFFIX) ztpmv_CLN.$(PSUFFIX) : ztpmv_U.c ../../common.h +ztpmv_CLN.$(SUFFIX) ztpmv_CLN.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) -ztpmv_NLU.$(SUFFIX) ztpmv_NLU.$(PSUFFIX) : ztpmv_L.c ../../common.h +ztpmv_NLU.$(SUFFIX) ztpmv_NLU.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) -ztpmv_NLN.$(SUFFIX) ztpmv_NLN.$(PSUFFIX) : ztpmv_L.c ../../common.h +ztpmv_NLN.$(SUFFIX) ztpmv_NLN.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) -ztpmv_TUU.$(SUFFIX) ztpmv_TUU.$(PSUFFIX) : ztpmv_L.c ../../common.h +ztpmv_TUU.$(SUFFIX) ztpmv_TUU.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) -ztpmv_TUN.$(SUFFIX) ztpmv_TUN.$(PSUFFIX) : ztpmv_L.c ../../common.h +ztpmv_TUN.$(SUFFIX) ztpmv_TUN.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) -ztpmv_RUU.$(SUFFIX) ztpmv_RUU.$(PSUFFIX) : ztpmv_U.c ../../common.h +ztpmv_RUU.$(SUFFIX) ztpmv_RUU.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) -ztpmv_RUN.$(SUFFIX) ztpmv_RUN.$(PSUFFIX) : ztpmv_U.c ../../common.h +ztpmv_RUN.$(SUFFIX) ztpmv_RUN.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) -ztpmv_CUU.$(SUFFIX) ztpmv_CUU.$(PSUFFIX) : ztpmv_L.c ../../common.h +ztpmv_CUU.$(SUFFIX) ztpmv_CUU.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) -ztpmv_CUN.$(SUFFIX) ztpmv_CUN.$(PSUFFIX) : ztpmv_L.c ../../common.h +ztpmv_CUN.$(SUFFIX) ztpmv_CUN.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) -xtpmv_NUU.$(SUFFIX) xtpmv_NUU.$(PSUFFIX) : ztpmv_U.c ../../common.h +xtpmv_NUU.$(SUFFIX) xtpmv_NUU.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) -xtpmv_NUN.$(SUFFIX) xtpmv_NUN.$(PSUFFIX) : ztpmv_U.c ../../common.h +xtpmv_NUN.$(SUFFIX) xtpmv_NUN.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) -xtpmv_TLU.$(SUFFIX) xtpmv_TLU.$(PSUFFIX) : ztpmv_U.c ../../common.h +xtpmv_TLU.$(SUFFIX) xtpmv_TLU.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) -xtpmv_TLN.$(SUFFIX) xtpmv_TLN.$(PSUFFIX) : ztpmv_U.c ../../common.h +xtpmv_TLN.$(SUFFIX) xtpmv_TLN.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) -xtpmv_RLU.$(SUFFIX) xtpmv_RLU.$(PSUFFIX) : ztpmv_L.c ../../common.h +xtpmv_RLU.$(SUFFIX) xtpmv_RLU.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) -xtpmv_RLN.$(SUFFIX) xtpmv_RLN.$(PSUFFIX) : ztpmv_L.c ../../common.h +xtpmv_RLN.$(SUFFIX) xtpmv_RLN.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) -xtpmv_CLU.$(SUFFIX) xtpmv_CLU.$(PSUFFIX) : ztpmv_U.c ../../common.h +xtpmv_CLU.$(SUFFIX) xtpmv_CLU.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) -xtpmv_CLN.$(SUFFIX) xtpmv_CLN.$(PSUFFIX) : ztpmv_U.c ../../common.h +xtpmv_CLN.$(SUFFIX) xtpmv_CLN.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) -xtpmv_NLU.$(SUFFIX) xtpmv_NLU.$(PSUFFIX) : ztpmv_L.c ../../common.h +xtpmv_NLU.$(SUFFIX) xtpmv_NLU.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) -xtpmv_NLN.$(SUFFIX) xtpmv_NLN.$(PSUFFIX) : ztpmv_L.c ../../common.h +xtpmv_NLN.$(SUFFIX) xtpmv_NLN.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) -xtpmv_TUU.$(SUFFIX) xtpmv_TUU.$(PSUFFIX) : ztpmv_L.c ../../common.h +xtpmv_TUU.$(SUFFIX) xtpmv_TUU.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) -xtpmv_TUN.$(SUFFIX) xtpmv_TUN.$(PSUFFIX) : ztpmv_L.c ../../common.h +xtpmv_TUN.$(SUFFIX) xtpmv_TUN.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) -xtpmv_RUU.$(SUFFIX) xtpmv_RUU.$(PSUFFIX) : ztpmv_U.c ../../common.h +xtpmv_RUU.$(SUFFIX) xtpmv_RUU.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) -xtpmv_RUN.$(SUFFIX) xtpmv_RUN.$(PSUFFIX) : ztpmv_U.c ../../common.h +xtpmv_RUN.$(SUFFIX) xtpmv_RUN.$(PSUFFIX) : ztpmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) -xtpmv_CUU.$(SUFFIX) xtpmv_CUU.$(PSUFFIX) : ztpmv_L.c ../../common.h +xtpmv_CUU.$(SUFFIX) xtpmv_CUU.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) -xtpmv_CUN.$(SUFFIX) xtpmv_CUN.$(PSUFFIX) : ztpmv_L.c ../../common.h +xtpmv_CUN.$(SUFFIX) xtpmv_CUN.$(PSUFFIX) : ztpmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) -stpmv_thread_NUU.$(SUFFIX) stpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h +stpmv_thread_NUU.$(SUFFIX) stpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) -stpmv_thread_NUN.$(SUFFIX) stpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h +stpmv_thread_NUN.$(SUFFIX) stpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) -stpmv_thread_TLU.$(SUFFIX) stpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h +stpmv_thread_TLU.$(SUFFIX) stpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) -stpmv_thread_TLN.$(SUFFIX) stpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h +stpmv_thread_TLN.$(SUFFIX) stpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) -stpmv_thread_NLU.$(SUFFIX) stpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h +stpmv_thread_NLU.$(SUFFIX) stpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) -stpmv_thread_NLN.$(SUFFIX) stpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h +stpmv_thread_NLN.$(SUFFIX) stpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) -stpmv_thread_TUU.$(SUFFIX) stpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h +stpmv_thread_TUU.$(SUFFIX) stpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) -stpmv_thread_TUN.$(SUFFIX) stpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h +stpmv_thread_TUN.$(SUFFIX) stpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) -dtpmv_thread_NUU.$(SUFFIX) dtpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h +dtpmv_thread_NUU.$(SUFFIX) dtpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) -dtpmv_thread_NUN.$(SUFFIX) dtpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h +dtpmv_thread_NUN.$(SUFFIX) dtpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) -dtpmv_thread_TLU.$(SUFFIX) dtpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h +dtpmv_thread_TLU.$(SUFFIX) dtpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) -dtpmv_thread_TLN.$(SUFFIX) dtpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h +dtpmv_thread_TLN.$(SUFFIX) dtpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) -dtpmv_thread_NLU.$(SUFFIX) dtpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h +dtpmv_thread_NLU.$(SUFFIX) dtpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) -dtpmv_thread_NLN.$(SUFFIX) dtpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h +dtpmv_thread_NLN.$(SUFFIX) dtpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) -dtpmv_thread_TUU.$(SUFFIX) dtpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h +dtpmv_thread_TUU.$(SUFFIX) dtpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) -dtpmv_thread_TUN.$(SUFFIX) dtpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h +dtpmv_thread_TUN.$(SUFFIX) dtpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) -qtpmv_thread_NUU.$(SUFFIX) qtpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h +qtpmv_thread_NUU.$(SUFFIX) qtpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) -qtpmv_thread_NUN.$(SUFFIX) qtpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h +qtpmv_thread_NUN.$(SUFFIX) qtpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) -qtpmv_thread_TLU.$(SUFFIX) qtpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h +qtpmv_thread_TLU.$(SUFFIX) qtpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) -qtpmv_thread_TLN.$(SUFFIX) qtpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h +qtpmv_thread_TLN.$(SUFFIX) qtpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) -qtpmv_thread_NLU.$(SUFFIX) qtpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h +qtpmv_thread_NLU.$(SUFFIX) qtpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) -qtpmv_thread_NLN.$(SUFFIX) qtpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h +qtpmv_thread_NLN.$(SUFFIX) qtpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) -qtpmv_thread_TUU.$(SUFFIX) qtpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h +qtpmv_thread_TUU.$(SUFFIX) qtpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) -qtpmv_thread_TUN.$(SUFFIX) qtpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h +qtpmv_thread_TUN.$(SUFFIX) qtpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) -ctpmv_thread_NUU.$(SUFFIX) ctpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h +ctpmv_thread_NUU.$(SUFFIX) ctpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) -ctpmv_thread_NUN.$(SUFFIX) ctpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h +ctpmv_thread_NUN.$(SUFFIX) ctpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) -ctpmv_thread_TLU.$(SUFFIX) ctpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h +ctpmv_thread_TLU.$(SUFFIX) ctpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) -ctpmv_thread_TLN.$(SUFFIX) ctpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h +ctpmv_thread_TLN.$(SUFFIX) ctpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) -ctpmv_thread_RLU.$(SUFFIX) ctpmv_thread_RLU.$(PSUFFIX) : tpmv_thread.c ../../common.h +ctpmv_thread_RLU.$(SUFFIX) ctpmv_thread_RLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) -ctpmv_thread_RLN.$(SUFFIX) ctpmv_thread_RLN.$(PSUFFIX) : tpmv_thread.c ../../common.h +ctpmv_thread_RLN.$(SUFFIX) ctpmv_thread_RLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) -ctpmv_thread_CLU.$(SUFFIX) ctpmv_thread_CLU.$(PSUFFIX) : tpmv_thread.c ../../common.h +ctpmv_thread_CLU.$(SUFFIX) ctpmv_thread_CLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) -ctpmv_thread_CLN.$(SUFFIX) ctpmv_thread_CLN.$(PSUFFIX) : tpmv_thread.c ../../common.h +ctpmv_thread_CLN.$(SUFFIX) ctpmv_thread_CLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) -ctpmv_thread_NLU.$(SUFFIX) ctpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h +ctpmv_thread_NLU.$(SUFFIX) ctpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) -ctpmv_thread_NLN.$(SUFFIX) ctpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h +ctpmv_thread_NLN.$(SUFFIX) ctpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) -ctpmv_thread_TUU.$(SUFFIX) ctpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h +ctpmv_thread_TUU.$(SUFFIX) ctpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) -ctpmv_thread_TUN.$(SUFFIX) ctpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h +ctpmv_thread_TUN.$(SUFFIX) ctpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) -ctpmv_thread_RUU.$(SUFFIX) ctpmv_thread_RUU.$(PSUFFIX) : tpmv_thread.c ../../common.h +ctpmv_thread_RUU.$(SUFFIX) ctpmv_thread_RUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) -ctpmv_thread_RUN.$(SUFFIX) ctpmv_thread_RUN.$(PSUFFIX) : tpmv_thread.c ../../common.h +ctpmv_thread_RUN.$(SUFFIX) ctpmv_thread_RUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) -ctpmv_thread_CUU.$(SUFFIX) ctpmv_thread_CUU.$(PSUFFIX) : tpmv_thread.c ../../common.h +ctpmv_thread_CUU.$(SUFFIX) ctpmv_thread_CUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) -ctpmv_thread_CUN.$(SUFFIX) ctpmv_thread_CUN.$(PSUFFIX) : tpmv_thread.c ../../common.h +ctpmv_thread_CUN.$(SUFFIX) ctpmv_thread_CUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) -ztpmv_thread_NUU.$(SUFFIX) ztpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h +ztpmv_thread_NUU.$(SUFFIX) ztpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) -ztpmv_thread_NUN.$(SUFFIX) ztpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h +ztpmv_thread_NUN.$(SUFFIX) ztpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) -ztpmv_thread_TLU.$(SUFFIX) ztpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h +ztpmv_thread_TLU.$(SUFFIX) ztpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) -ztpmv_thread_TLN.$(SUFFIX) ztpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h +ztpmv_thread_TLN.$(SUFFIX) ztpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) -ztpmv_thread_RLU.$(SUFFIX) ztpmv_thread_RLU.$(PSUFFIX) : tpmv_thread.c ../../common.h +ztpmv_thread_RLU.$(SUFFIX) ztpmv_thread_RLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) -ztpmv_thread_RLN.$(SUFFIX) ztpmv_thread_RLN.$(PSUFFIX) : tpmv_thread.c ../../common.h +ztpmv_thread_RLN.$(SUFFIX) ztpmv_thread_RLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) -ztpmv_thread_CLU.$(SUFFIX) ztpmv_thread_CLU.$(PSUFFIX) : tpmv_thread.c ../../common.h +ztpmv_thread_CLU.$(SUFFIX) ztpmv_thread_CLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) -ztpmv_thread_CLN.$(SUFFIX) ztpmv_thread_CLN.$(PSUFFIX) : tpmv_thread.c ../../common.h +ztpmv_thread_CLN.$(SUFFIX) ztpmv_thread_CLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) -ztpmv_thread_NLU.$(SUFFIX) ztpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h +ztpmv_thread_NLU.$(SUFFIX) ztpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) -ztpmv_thread_NLN.$(SUFFIX) ztpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h +ztpmv_thread_NLN.$(SUFFIX) ztpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) -ztpmv_thread_TUU.$(SUFFIX) ztpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h +ztpmv_thread_TUU.$(SUFFIX) ztpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) -ztpmv_thread_TUN.$(SUFFIX) ztpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h +ztpmv_thread_TUN.$(SUFFIX) ztpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) -ztpmv_thread_RUU.$(SUFFIX) ztpmv_thread_RUU.$(PSUFFIX) : tpmv_thread.c ../../common.h +ztpmv_thread_RUU.$(SUFFIX) ztpmv_thread_RUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) -ztpmv_thread_RUN.$(SUFFIX) ztpmv_thread_RUN.$(PSUFFIX) : tpmv_thread.c ../../common.h +ztpmv_thread_RUN.$(SUFFIX) ztpmv_thread_RUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) -ztpmv_thread_CUU.$(SUFFIX) ztpmv_thread_CUU.$(PSUFFIX) : tpmv_thread.c ../../common.h +ztpmv_thread_CUU.$(SUFFIX) ztpmv_thread_CUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) -ztpmv_thread_CUN.$(SUFFIX) ztpmv_thread_CUN.$(PSUFFIX) : tpmv_thread.c ../../common.h +ztpmv_thread_CUN.$(SUFFIX) ztpmv_thread_CUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) -xtpmv_thread_NUU.$(SUFFIX) xtpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h +xtpmv_thread_NUU.$(SUFFIX) xtpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) -xtpmv_thread_NUN.$(SUFFIX) xtpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h +xtpmv_thread_NUN.$(SUFFIX) xtpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) -xtpmv_thread_TLU.$(SUFFIX) xtpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h +xtpmv_thread_TLU.$(SUFFIX) xtpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) -xtpmv_thread_TLN.$(SUFFIX) xtpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h +xtpmv_thread_TLN.$(SUFFIX) xtpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) -xtpmv_thread_RLU.$(SUFFIX) xtpmv_thread_RLU.$(PSUFFIX) : tpmv_thread.c ../../common.h +xtpmv_thread_RLU.$(SUFFIX) xtpmv_thread_RLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) -xtpmv_thread_RLN.$(SUFFIX) xtpmv_thread_RLN.$(PSUFFIX) : tpmv_thread.c ../../common.h +xtpmv_thread_RLN.$(SUFFIX) xtpmv_thread_RLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) -xtpmv_thread_CLU.$(SUFFIX) xtpmv_thread_CLU.$(PSUFFIX) : tpmv_thread.c ../../common.h +xtpmv_thread_CLU.$(SUFFIX) xtpmv_thread_CLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) -xtpmv_thread_CLN.$(SUFFIX) xtpmv_thread_CLN.$(PSUFFIX) : tpmv_thread.c ../../common.h +xtpmv_thread_CLN.$(SUFFIX) xtpmv_thread_CLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) -xtpmv_thread_NLU.$(SUFFIX) xtpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h +xtpmv_thread_NLU.$(SUFFIX) xtpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) -xtpmv_thread_NLN.$(SUFFIX) xtpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h +xtpmv_thread_NLN.$(SUFFIX) xtpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) -xtpmv_thread_TUU.$(SUFFIX) xtpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h +xtpmv_thread_TUU.$(SUFFIX) xtpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) -xtpmv_thread_TUN.$(SUFFIX) xtpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h +xtpmv_thread_TUN.$(SUFFIX) xtpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) -xtpmv_thread_RUU.$(SUFFIX) xtpmv_thread_RUU.$(PSUFFIX) : tpmv_thread.c ../../common.h +xtpmv_thread_RUU.$(SUFFIX) xtpmv_thread_RUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) -xtpmv_thread_RUN.$(SUFFIX) xtpmv_thread_RUN.$(PSUFFIX) : tpmv_thread.c ../../common.h +xtpmv_thread_RUN.$(SUFFIX) xtpmv_thread_RUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) -xtpmv_thread_CUU.$(SUFFIX) xtpmv_thread_CUU.$(PSUFFIX) : tpmv_thread.c ../../common.h +xtpmv_thread_CUU.$(SUFFIX) xtpmv_thread_CUU.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) -xtpmv_thread_CUN.$(SUFFIX) xtpmv_thread_CUN.$(PSUFFIX) : tpmv_thread.c ../../common.h +xtpmv_thread_CUN.$(SUFFIX) xtpmv_thread_CUN.$(PSUFFIX) : tpmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) -stpsv_NUU.$(SUFFIX) stpsv_NUU.$(PSUFFIX) : tpsv_U.c ../../param.h +stpsv_NUU.$(SUFFIX) stpsv_NUU.$(PSUFFIX) : tpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) -stpsv_NUN.$(SUFFIX) stpsv_NUN.$(PSUFFIX) : tpsv_U.c ../../param.h +stpsv_NUN.$(SUFFIX) stpsv_NUN.$(PSUFFIX) : tpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) -stpsv_TLU.$(SUFFIX) stpsv_TLU.$(PSUFFIX) : tpsv_U.c ../../param.h +stpsv_TLU.$(SUFFIX) stpsv_TLU.$(PSUFFIX) : tpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) -stpsv_TLN.$(SUFFIX) stpsv_TLN.$(PSUFFIX) : tpsv_U.c ../../param.h +stpsv_TLN.$(SUFFIX) stpsv_TLN.$(PSUFFIX) : tpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) -stpsv_NLU.$(SUFFIX) stpsv_NLU.$(PSUFFIX) : tpsv_L.c ../../param.h +stpsv_NLU.$(SUFFIX) stpsv_NLU.$(PSUFFIX) : tpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) -stpsv_NLN.$(SUFFIX) stpsv_NLN.$(PSUFFIX) : tpsv_L.c ../../param.h +stpsv_NLN.$(SUFFIX) stpsv_NLN.$(PSUFFIX) : tpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) -stpsv_TUU.$(SUFFIX) stpsv_TUU.$(PSUFFIX) : tpsv_L.c ../../param.h +stpsv_TUU.$(SUFFIX) stpsv_TUU.$(PSUFFIX) : tpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) -stpsv_TUN.$(SUFFIX) stpsv_TUN.$(PSUFFIX) : tpsv_L.c ../../param.h +stpsv_TUN.$(SUFFIX) stpsv_TUN.$(PSUFFIX) : tpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) -dtpsv_NUU.$(SUFFIX) dtpsv_NUU.$(PSUFFIX) : tpsv_U.c ../../param.h +dtpsv_NUU.$(SUFFIX) dtpsv_NUU.$(PSUFFIX) : tpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) -dtpsv_NUN.$(SUFFIX) dtpsv_NUN.$(PSUFFIX) : tpsv_U.c ../../param.h +dtpsv_NUN.$(SUFFIX) dtpsv_NUN.$(PSUFFIX) : tpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) -dtpsv_TLU.$(SUFFIX) dtpsv_TLU.$(PSUFFIX) : tpsv_U.c ../../param.h +dtpsv_TLU.$(SUFFIX) dtpsv_TLU.$(PSUFFIX) : tpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) -dtpsv_TLN.$(SUFFIX) dtpsv_TLN.$(PSUFFIX) : tpsv_U.c ../../param.h +dtpsv_TLN.$(SUFFIX) dtpsv_TLN.$(PSUFFIX) : tpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) -dtpsv_NLU.$(SUFFIX) dtpsv_NLU.$(PSUFFIX) : tpsv_L.c ../../param.h +dtpsv_NLU.$(SUFFIX) dtpsv_NLU.$(PSUFFIX) : tpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) -dtpsv_NLN.$(SUFFIX) dtpsv_NLN.$(PSUFFIX) : tpsv_L.c ../../param.h +dtpsv_NLN.$(SUFFIX) dtpsv_NLN.$(PSUFFIX) : tpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) -dtpsv_TUU.$(SUFFIX) dtpsv_TUU.$(PSUFFIX) : tpsv_L.c ../../param.h +dtpsv_TUU.$(SUFFIX) dtpsv_TUU.$(PSUFFIX) : tpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) -dtpsv_TUN.$(SUFFIX) dtpsv_TUN.$(PSUFFIX) : tpsv_L.c ../../param.h +dtpsv_TUN.$(SUFFIX) dtpsv_TUN.$(PSUFFIX) : tpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) -qtpsv_NUU.$(SUFFIX) qtpsv_NUU.$(PSUFFIX) : tpsv_U.c ../../param.h +qtpsv_NUU.$(SUFFIX) qtpsv_NUU.$(PSUFFIX) : tpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) -qtpsv_NUN.$(SUFFIX) qtpsv_NUN.$(PSUFFIX) : tpsv_U.c ../../param.h +qtpsv_NUN.$(SUFFIX) qtpsv_NUN.$(PSUFFIX) : tpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) -qtpsv_TLU.$(SUFFIX) qtpsv_TLU.$(PSUFFIX) : tpsv_U.c ../../param.h +qtpsv_TLU.$(SUFFIX) qtpsv_TLU.$(PSUFFIX) : tpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) -qtpsv_TLN.$(SUFFIX) qtpsv_TLN.$(PSUFFIX) : tpsv_U.c ../../param.h +qtpsv_TLN.$(SUFFIX) qtpsv_TLN.$(PSUFFIX) : tpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) -qtpsv_NLU.$(SUFFIX) qtpsv_NLU.$(PSUFFIX) : tpsv_L.c ../../param.h +qtpsv_NLU.$(SUFFIX) qtpsv_NLU.$(PSUFFIX) : tpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) -qtpsv_NLN.$(SUFFIX) qtpsv_NLN.$(PSUFFIX) : tpsv_L.c ../../param.h +qtpsv_NLN.$(SUFFIX) qtpsv_NLN.$(PSUFFIX) : tpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) -qtpsv_TUU.$(SUFFIX) qtpsv_TUU.$(PSUFFIX) : tpsv_L.c ../../param.h +qtpsv_TUU.$(SUFFIX) qtpsv_TUU.$(PSUFFIX) : tpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) -qtpsv_TUN.$(SUFFIX) qtpsv_TUN.$(PSUFFIX) : tpsv_L.c ../../param.h +qtpsv_TUN.$(SUFFIX) qtpsv_TUN.$(PSUFFIX) : tpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) -ctpsv_NUU.$(SUFFIX) ctpsv_NUU.$(PSUFFIX) : ztpsv_U.c ../../param.h +ctpsv_NUU.$(SUFFIX) ctpsv_NUU.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) -ctpsv_NUN.$(SUFFIX) ctpsv_NUN.$(PSUFFIX) : ztpsv_U.c ../../param.h +ctpsv_NUN.$(SUFFIX) ctpsv_NUN.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) -ctpsv_TLU.$(SUFFIX) ctpsv_TLU.$(PSUFFIX) : ztpsv_U.c ../../param.h +ctpsv_TLU.$(SUFFIX) ctpsv_TLU.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) -ctpsv_TLN.$(SUFFIX) ctpsv_TLN.$(PSUFFIX) : ztpsv_U.c ../../param.h +ctpsv_TLN.$(SUFFIX) ctpsv_TLN.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) -ctpsv_RLU.$(SUFFIX) ctpsv_RLU.$(PSUFFIX) : ztpsv_L.c ../../param.h +ctpsv_RLU.$(SUFFIX) ctpsv_RLU.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) -ctpsv_RLN.$(SUFFIX) ctpsv_RLN.$(PSUFFIX) : ztpsv_L.c ../../param.h +ctpsv_RLN.$(SUFFIX) ctpsv_RLN.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) -ctpsv_CLU.$(SUFFIX) ctpsv_CLU.$(PSUFFIX) : ztpsv_U.c ../../param.h +ctpsv_CLU.$(SUFFIX) ctpsv_CLU.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) -ctpsv_CLN.$(SUFFIX) ctpsv_CLN.$(PSUFFIX) : ztpsv_U.c ../../param.h +ctpsv_CLN.$(SUFFIX) ctpsv_CLN.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) -ctpsv_NLU.$(SUFFIX) ctpsv_NLU.$(PSUFFIX) : ztpsv_L.c ../../param.h +ctpsv_NLU.$(SUFFIX) ctpsv_NLU.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) -ctpsv_NLN.$(SUFFIX) ctpsv_NLN.$(PSUFFIX) : ztpsv_L.c ../../param.h +ctpsv_NLN.$(SUFFIX) ctpsv_NLN.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) -ctpsv_TUU.$(SUFFIX) ctpsv_TUU.$(PSUFFIX) : ztpsv_L.c ../../param.h +ctpsv_TUU.$(SUFFIX) ctpsv_TUU.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) -ctpsv_TUN.$(SUFFIX) ctpsv_TUN.$(PSUFFIX) : ztpsv_L.c ../../param.h +ctpsv_TUN.$(SUFFIX) ctpsv_TUN.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) -ctpsv_RUU.$(SUFFIX) ctpsv_RUU.$(PSUFFIX) : ztpsv_U.c ../../param.h +ctpsv_RUU.$(SUFFIX) ctpsv_RUU.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) -ctpsv_RUN.$(SUFFIX) ctpsv_RUN.$(PSUFFIX) : ztpsv_U.c ../../param.h +ctpsv_RUN.$(SUFFIX) ctpsv_RUN.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) -ctpsv_CUU.$(SUFFIX) ctpsv_CUU.$(PSUFFIX) : ztpsv_L.c ../../param.h +ctpsv_CUU.$(SUFFIX) ctpsv_CUU.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) -ctpsv_CUN.$(SUFFIX) ctpsv_CUN.$(PSUFFIX) : ztpsv_L.c ../../param.h +ctpsv_CUN.$(SUFFIX) ctpsv_CUN.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) -ztpsv_NUU.$(SUFFIX) ztpsv_NUU.$(PSUFFIX) : ztpsv_U.c ../../param.h +ztpsv_NUU.$(SUFFIX) ztpsv_NUU.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) -ztpsv_NUN.$(SUFFIX) ztpsv_NUN.$(PSUFFIX) : ztpsv_U.c ../../param.h +ztpsv_NUN.$(SUFFIX) ztpsv_NUN.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) -ztpsv_TLU.$(SUFFIX) ztpsv_TLU.$(PSUFFIX) : ztpsv_U.c ../../param.h +ztpsv_TLU.$(SUFFIX) ztpsv_TLU.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) -ztpsv_TLN.$(SUFFIX) ztpsv_TLN.$(PSUFFIX) : ztpsv_U.c ../../param.h +ztpsv_TLN.$(SUFFIX) ztpsv_TLN.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) -ztpsv_RLU.$(SUFFIX) ztpsv_RLU.$(PSUFFIX) : ztpsv_L.c ../../param.h +ztpsv_RLU.$(SUFFIX) ztpsv_RLU.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) -ztpsv_RLN.$(SUFFIX) ztpsv_RLN.$(PSUFFIX) : ztpsv_L.c ../../param.h +ztpsv_RLN.$(SUFFIX) ztpsv_RLN.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) -ztpsv_CLU.$(SUFFIX) ztpsv_CLU.$(PSUFFIX) : ztpsv_U.c ../../param.h +ztpsv_CLU.$(SUFFIX) ztpsv_CLU.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) -ztpsv_CLN.$(SUFFIX) ztpsv_CLN.$(PSUFFIX) : ztpsv_U.c ../../param.h +ztpsv_CLN.$(SUFFIX) ztpsv_CLN.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) -ztpsv_NLU.$(SUFFIX) ztpsv_NLU.$(PSUFFIX) : ztpsv_L.c ../../param.h +ztpsv_NLU.$(SUFFIX) ztpsv_NLU.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) -ztpsv_NLN.$(SUFFIX) ztpsv_NLN.$(PSUFFIX) : ztpsv_L.c ../../param.h +ztpsv_NLN.$(SUFFIX) ztpsv_NLN.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) -ztpsv_TUU.$(SUFFIX) ztpsv_TUU.$(PSUFFIX) : ztpsv_L.c ../../param.h +ztpsv_TUU.$(SUFFIX) ztpsv_TUU.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) -ztpsv_TUN.$(SUFFIX) ztpsv_TUN.$(PSUFFIX) : ztpsv_L.c ../../param.h +ztpsv_TUN.$(SUFFIX) ztpsv_TUN.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) -ztpsv_RUU.$(SUFFIX) ztpsv_RUU.$(PSUFFIX) : ztpsv_U.c ../../param.h +ztpsv_RUU.$(SUFFIX) ztpsv_RUU.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) -ztpsv_RUN.$(SUFFIX) ztpsv_RUN.$(PSUFFIX) : ztpsv_U.c ../../param.h +ztpsv_RUN.$(SUFFIX) ztpsv_RUN.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) -ztpsv_CUU.$(SUFFIX) ztpsv_CUU.$(PSUFFIX) : ztpsv_L.c ../../param.h +ztpsv_CUU.$(SUFFIX) ztpsv_CUU.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) -ztpsv_CUN.$(SUFFIX) ztpsv_CUN.$(PSUFFIX) : ztpsv_L.c ../../param.h +ztpsv_CUN.$(SUFFIX) ztpsv_CUN.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) -xtpsv_NUU.$(SUFFIX) xtpsv_NUU.$(PSUFFIX) : ztpsv_U.c ../../param.h +xtpsv_NUU.$(SUFFIX) xtpsv_NUU.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) -xtpsv_NUN.$(SUFFIX) xtpsv_NUN.$(PSUFFIX) : ztpsv_U.c ../../param.h +xtpsv_NUN.$(SUFFIX) xtpsv_NUN.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) -xtpsv_TLU.$(SUFFIX) xtpsv_TLU.$(PSUFFIX) : ztpsv_U.c ../../param.h +xtpsv_TLU.$(SUFFIX) xtpsv_TLU.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) -xtpsv_TLN.$(SUFFIX) xtpsv_TLN.$(PSUFFIX) : ztpsv_U.c ../../param.h +xtpsv_TLN.$(SUFFIX) xtpsv_TLN.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) -xtpsv_RLU.$(SUFFIX) xtpsv_RLU.$(PSUFFIX) : ztpsv_L.c ../../param.h +xtpsv_RLU.$(SUFFIX) xtpsv_RLU.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) -xtpsv_RLN.$(SUFFIX) xtpsv_RLN.$(PSUFFIX) : ztpsv_L.c ../../param.h +xtpsv_RLN.$(SUFFIX) xtpsv_RLN.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) -xtpsv_CLU.$(SUFFIX) xtpsv_CLU.$(PSUFFIX) : ztpsv_U.c ../../param.h +xtpsv_CLU.$(SUFFIX) xtpsv_CLU.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) -xtpsv_CLN.$(SUFFIX) xtpsv_CLN.$(PSUFFIX) : ztpsv_U.c ../../param.h +xtpsv_CLN.$(SUFFIX) xtpsv_CLN.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) -xtpsv_NLU.$(SUFFIX) xtpsv_NLU.$(PSUFFIX) : ztpsv_L.c ../../param.h +xtpsv_NLU.$(SUFFIX) xtpsv_NLU.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) -xtpsv_NLN.$(SUFFIX) xtpsv_NLN.$(PSUFFIX) : ztpsv_L.c ../../param.h +xtpsv_NLN.$(SUFFIX) xtpsv_NLN.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) -xtpsv_TUU.$(SUFFIX) xtpsv_TUU.$(PSUFFIX) : ztpsv_L.c ../../param.h +xtpsv_TUU.$(SUFFIX) xtpsv_TUU.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) -xtpsv_TUN.$(SUFFIX) xtpsv_TUN.$(PSUFFIX) : ztpsv_L.c ../../param.h +xtpsv_TUN.$(SUFFIX) xtpsv_TUN.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) -xtpsv_RUU.$(SUFFIX) xtpsv_RUU.$(PSUFFIX) : ztpsv_U.c ../../param.h +xtpsv_RUU.$(SUFFIX) xtpsv_RUU.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) -xtpsv_RUN.$(SUFFIX) xtpsv_RUN.$(PSUFFIX) : ztpsv_U.c ../../param.h +xtpsv_RUN.$(SUFFIX) xtpsv_RUN.$(PSUFFIX) : ztpsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) -xtpsv_CUU.$(SUFFIX) xtpsv_CUU.$(PSUFFIX) : ztpsv_L.c ../../param.h +xtpsv_CUU.$(SUFFIX) xtpsv_CUU.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) -xtpsv_CUN.$(SUFFIX) xtpsv_CUN.$(PSUFFIX) : ztpsv_L.c ../../param.h +xtpsv_CUN.$(SUFFIX) xtpsv_CUN.$(PSUFFIX) : ztpsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) -strmv_NUU.$(SUFFIX) strmv_NUU.$(PSUFFIX) : trmv_U.c ../../common.h +strmv_NUU.$(SUFFIX) strmv_NUU.$(PSUFFIX) : trmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) -strmv_NUN.$(SUFFIX) strmv_NUN.$(PSUFFIX) : trmv_U.c ../../common.h +strmv_NUN.$(SUFFIX) strmv_NUN.$(PSUFFIX) : trmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) -strmv_TLU.$(SUFFIX) strmv_TLU.$(PSUFFIX) : trmv_U.c ../../common.h +strmv_TLU.$(SUFFIX) strmv_TLU.$(PSUFFIX) : trmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) -strmv_TLN.$(SUFFIX) strmv_TLN.$(PSUFFIX) : trmv_U.c ../../common.h +strmv_TLN.$(SUFFIX) strmv_TLN.$(PSUFFIX) : trmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) -strmv_NLU.$(SUFFIX) strmv_NLU.$(PSUFFIX) : trmv_L.c ../../common.h +strmv_NLU.$(SUFFIX) strmv_NLU.$(PSUFFIX) : trmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) -strmv_NLN.$(SUFFIX) strmv_NLN.$(PSUFFIX) : trmv_L.c ../../common.h +strmv_NLN.$(SUFFIX) strmv_NLN.$(PSUFFIX) : trmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) -strmv_TUU.$(SUFFIX) strmv_TUU.$(PSUFFIX) : trmv_L.c ../../common.h +strmv_TUU.$(SUFFIX) strmv_TUU.$(PSUFFIX) : trmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) -strmv_TUN.$(SUFFIX) strmv_TUN.$(PSUFFIX) : trmv_L.c ../../common.h +strmv_TUN.$(SUFFIX) strmv_TUN.$(PSUFFIX) : trmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) -dtrmv_NUU.$(SUFFIX) dtrmv_NUU.$(PSUFFIX) : trmv_U.c ../../common.h +dtrmv_NUU.$(SUFFIX) dtrmv_NUU.$(PSUFFIX) : trmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) -dtrmv_NUN.$(SUFFIX) dtrmv_NUN.$(PSUFFIX) : trmv_U.c ../../common.h +dtrmv_NUN.$(SUFFIX) dtrmv_NUN.$(PSUFFIX) : trmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) -dtrmv_TLU.$(SUFFIX) dtrmv_TLU.$(PSUFFIX) : trmv_U.c ../../common.h +dtrmv_TLU.$(SUFFIX) dtrmv_TLU.$(PSUFFIX) : trmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) -dtrmv_TLN.$(SUFFIX) dtrmv_TLN.$(PSUFFIX) : trmv_U.c ../../common.h +dtrmv_TLN.$(SUFFIX) dtrmv_TLN.$(PSUFFIX) : trmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) -dtrmv_NLU.$(SUFFIX) dtrmv_NLU.$(PSUFFIX) : trmv_L.c ../../common.h +dtrmv_NLU.$(SUFFIX) dtrmv_NLU.$(PSUFFIX) : trmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) -dtrmv_NLN.$(SUFFIX) dtrmv_NLN.$(PSUFFIX) : trmv_L.c ../../common.h +dtrmv_NLN.$(SUFFIX) dtrmv_NLN.$(PSUFFIX) : trmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) -dtrmv_TUU.$(SUFFIX) dtrmv_TUU.$(PSUFFIX) : trmv_L.c ../../common.h +dtrmv_TUU.$(SUFFIX) dtrmv_TUU.$(PSUFFIX) : trmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) -dtrmv_TUN.$(SUFFIX) dtrmv_TUN.$(PSUFFIX) : trmv_L.c ../../common.h +dtrmv_TUN.$(SUFFIX) dtrmv_TUN.$(PSUFFIX) : trmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) -qtrmv_NUU.$(SUFFIX) qtrmv_NUU.$(PSUFFIX) : trmv_U.c ../../common.h +qtrmv_NUU.$(SUFFIX) qtrmv_NUU.$(PSUFFIX) : trmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) -qtrmv_NUN.$(SUFFIX) qtrmv_NUN.$(PSUFFIX) : trmv_U.c ../../common.h +qtrmv_NUN.$(SUFFIX) qtrmv_NUN.$(PSUFFIX) : trmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) -qtrmv_TLU.$(SUFFIX) qtrmv_TLU.$(PSUFFIX) : trmv_U.c ../../common.h +qtrmv_TLU.$(SUFFIX) qtrmv_TLU.$(PSUFFIX) : trmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) -qtrmv_TLN.$(SUFFIX) qtrmv_TLN.$(PSUFFIX) : trmv_U.c ../../common.h +qtrmv_TLN.$(SUFFIX) qtrmv_TLN.$(PSUFFIX) : trmv_U.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) -qtrmv_NLU.$(SUFFIX) qtrmv_NLU.$(PSUFFIX) : trmv_L.c ../../common.h +qtrmv_NLU.$(SUFFIX) qtrmv_NLU.$(PSUFFIX) : trmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) -qtrmv_NLN.$(SUFFIX) qtrmv_NLN.$(PSUFFIX) : trmv_L.c ../../common.h +qtrmv_NLN.$(SUFFIX) qtrmv_NLN.$(PSUFFIX) : trmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) -qtrmv_TUU.$(SUFFIX) qtrmv_TUU.$(PSUFFIX) : trmv_L.c ../../common.h +qtrmv_TUU.$(SUFFIX) qtrmv_TUU.$(PSUFFIX) : trmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) -qtrmv_TUN.$(SUFFIX) qtrmv_TUN.$(PSUFFIX) : trmv_L.c ../../common.h +qtrmv_TUN.$(SUFFIX) qtrmv_TUN.$(PSUFFIX) : trmv_L.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) -ctrmv_NUU.$(SUFFIX) ctrmv_NUU.$(PSUFFIX) : ztrmv_U.c ../../common.h +ctrmv_NUU.$(SUFFIX) ctrmv_NUU.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) -ctrmv_NUN.$(SUFFIX) ctrmv_NUN.$(PSUFFIX) : ztrmv_U.c ../../common.h +ctrmv_NUN.$(SUFFIX) ctrmv_NUN.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) -ctrmv_TLU.$(SUFFIX) ctrmv_TLU.$(PSUFFIX) : ztrmv_U.c ../../common.h +ctrmv_TLU.$(SUFFIX) ctrmv_TLU.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) -ctrmv_TLN.$(SUFFIX) ctrmv_TLN.$(PSUFFIX) : ztrmv_U.c ../../common.h +ctrmv_TLN.$(SUFFIX) ctrmv_TLN.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) -ctrmv_RLU.$(SUFFIX) ctrmv_RLU.$(PSUFFIX) : ztrmv_L.c ../../common.h +ctrmv_RLU.$(SUFFIX) ctrmv_RLU.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) -ctrmv_RLN.$(SUFFIX) ctrmv_RLN.$(PSUFFIX) : ztrmv_L.c ../../common.h +ctrmv_RLN.$(SUFFIX) ctrmv_RLN.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) -ctrmv_CLU.$(SUFFIX) ctrmv_CLU.$(PSUFFIX) : ztrmv_U.c ../../common.h +ctrmv_CLU.$(SUFFIX) ctrmv_CLU.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) -ctrmv_CLN.$(SUFFIX) ctrmv_CLN.$(PSUFFIX) : ztrmv_U.c ../../common.h +ctrmv_CLN.$(SUFFIX) ctrmv_CLN.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) -ctrmv_NLU.$(SUFFIX) ctrmv_NLU.$(PSUFFIX) : ztrmv_L.c ../../common.h +ctrmv_NLU.$(SUFFIX) ctrmv_NLU.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) -ctrmv_NLN.$(SUFFIX) ctrmv_NLN.$(PSUFFIX) : ztrmv_L.c ../../common.h +ctrmv_NLN.$(SUFFIX) ctrmv_NLN.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) -ctrmv_TUU.$(SUFFIX) ctrmv_TUU.$(PSUFFIX) : ztrmv_L.c ../../common.h +ctrmv_TUU.$(SUFFIX) ctrmv_TUU.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) -ctrmv_TUN.$(SUFFIX) ctrmv_TUN.$(PSUFFIX) : ztrmv_L.c ../../common.h +ctrmv_TUN.$(SUFFIX) ctrmv_TUN.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) -ctrmv_RUU.$(SUFFIX) ctrmv_RUU.$(PSUFFIX) : ztrmv_U.c ../../common.h +ctrmv_RUU.$(SUFFIX) ctrmv_RUU.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) -ctrmv_RUN.$(SUFFIX) ctrmv_RUN.$(PSUFFIX) : ztrmv_U.c ../../common.h +ctrmv_RUN.$(SUFFIX) ctrmv_RUN.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) -ctrmv_CUU.$(SUFFIX) ctrmv_CUU.$(PSUFFIX) : ztrmv_L.c ../../common.h +ctrmv_CUU.$(SUFFIX) ctrmv_CUU.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) -ctrmv_CUN.$(SUFFIX) ctrmv_CUN.$(PSUFFIX) : ztrmv_L.c ../../common.h +ctrmv_CUN.$(SUFFIX) ctrmv_CUN.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) -ztrmv_NUU.$(SUFFIX) ztrmv_NUU.$(PSUFFIX) : ztrmv_U.c ../../common.h +ztrmv_NUU.$(SUFFIX) ztrmv_NUU.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) -ztrmv_NUN.$(SUFFIX) ztrmv_NUN.$(PSUFFIX) : ztrmv_U.c ../../common.h +ztrmv_NUN.$(SUFFIX) ztrmv_NUN.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) -ztrmv_TLU.$(SUFFIX) ztrmv_TLU.$(PSUFFIX) : ztrmv_U.c ../../common.h +ztrmv_TLU.$(SUFFIX) ztrmv_TLU.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) -ztrmv_TLN.$(SUFFIX) ztrmv_TLN.$(PSUFFIX) : ztrmv_U.c ../../common.h +ztrmv_TLN.$(SUFFIX) ztrmv_TLN.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) -ztrmv_RLU.$(SUFFIX) ztrmv_RLU.$(PSUFFIX) : ztrmv_L.c ../../common.h +ztrmv_RLU.$(SUFFIX) ztrmv_RLU.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) -ztrmv_RLN.$(SUFFIX) ztrmv_RLN.$(PSUFFIX) : ztrmv_L.c ../../common.h +ztrmv_RLN.$(SUFFIX) ztrmv_RLN.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) -ztrmv_CLU.$(SUFFIX) ztrmv_CLU.$(PSUFFIX) : ztrmv_U.c ../../common.h +ztrmv_CLU.$(SUFFIX) ztrmv_CLU.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) -ztrmv_CLN.$(SUFFIX) ztrmv_CLN.$(PSUFFIX) : ztrmv_U.c ../../common.h +ztrmv_CLN.$(SUFFIX) ztrmv_CLN.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) -ztrmv_NLU.$(SUFFIX) ztrmv_NLU.$(PSUFFIX) : ztrmv_L.c ../../common.h +ztrmv_NLU.$(SUFFIX) ztrmv_NLU.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) -ztrmv_NLN.$(SUFFIX) ztrmv_NLN.$(PSUFFIX) : ztrmv_L.c ../../common.h +ztrmv_NLN.$(SUFFIX) ztrmv_NLN.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) -ztrmv_TUU.$(SUFFIX) ztrmv_TUU.$(PSUFFIX) : ztrmv_L.c ../../common.h +ztrmv_TUU.$(SUFFIX) ztrmv_TUU.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) -ztrmv_TUN.$(SUFFIX) ztrmv_TUN.$(PSUFFIX) : ztrmv_L.c ../../common.h +ztrmv_TUN.$(SUFFIX) ztrmv_TUN.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) -ztrmv_RUU.$(SUFFIX) ztrmv_RUU.$(PSUFFIX) : ztrmv_U.c ../../common.h +ztrmv_RUU.$(SUFFIX) ztrmv_RUU.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) -ztrmv_RUN.$(SUFFIX) ztrmv_RUN.$(PSUFFIX) : ztrmv_U.c ../../common.h +ztrmv_RUN.$(SUFFIX) ztrmv_RUN.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) -ztrmv_CUU.$(SUFFIX) ztrmv_CUU.$(PSUFFIX) : ztrmv_L.c ../../common.h +ztrmv_CUU.$(SUFFIX) ztrmv_CUU.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) -ztrmv_CUN.$(SUFFIX) ztrmv_CUN.$(PSUFFIX) : ztrmv_L.c ../../common.h +ztrmv_CUN.$(SUFFIX) ztrmv_CUN.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) -xtrmv_NUU.$(SUFFIX) xtrmv_NUU.$(PSUFFIX) : ztrmv_U.c ../../common.h +xtrmv_NUU.$(SUFFIX) xtrmv_NUU.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) -xtrmv_NUN.$(SUFFIX) xtrmv_NUN.$(PSUFFIX) : ztrmv_U.c ../../common.h +xtrmv_NUN.$(SUFFIX) xtrmv_NUN.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) -xtrmv_TLU.$(SUFFIX) xtrmv_TLU.$(PSUFFIX) : ztrmv_U.c ../../common.h +xtrmv_TLU.$(SUFFIX) xtrmv_TLU.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) -xtrmv_TLN.$(SUFFIX) xtrmv_TLN.$(PSUFFIX) : ztrmv_U.c ../../common.h +xtrmv_TLN.$(SUFFIX) xtrmv_TLN.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) -xtrmv_RLU.$(SUFFIX) xtrmv_RLU.$(PSUFFIX) : ztrmv_L.c ../../common.h +xtrmv_RLU.$(SUFFIX) xtrmv_RLU.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) -xtrmv_RLN.$(SUFFIX) xtrmv_RLN.$(PSUFFIX) : ztrmv_L.c ../../common.h +xtrmv_RLN.$(SUFFIX) xtrmv_RLN.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) -xtrmv_CLU.$(SUFFIX) xtrmv_CLU.$(PSUFFIX) : ztrmv_U.c ../../common.h +xtrmv_CLU.$(SUFFIX) xtrmv_CLU.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) -xtrmv_CLN.$(SUFFIX) xtrmv_CLN.$(PSUFFIX) : ztrmv_U.c ../../common.h +xtrmv_CLN.$(SUFFIX) xtrmv_CLN.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) -xtrmv_NLU.$(SUFFIX) xtrmv_NLU.$(PSUFFIX) : ztrmv_L.c ../../common.h +xtrmv_NLU.$(SUFFIX) xtrmv_NLU.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) -xtrmv_NLN.$(SUFFIX) xtrmv_NLN.$(PSUFFIX) : ztrmv_L.c ../../common.h +xtrmv_NLN.$(SUFFIX) xtrmv_NLN.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) -xtrmv_TUU.$(SUFFIX) xtrmv_TUU.$(PSUFFIX) : ztrmv_L.c ../../common.h +xtrmv_TUU.$(SUFFIX) xtrmv_TUU.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) -xtrmv_TUN.$(SUFFIX) xtrmv_TUN.$(PSUFFIX) : ztrmv_L.c ../../common.h +xtrmv_TUN.$(SUFFIX) xtrmv_TUN.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) -xtrmv_RUU.$(SUFFIX) xtrmv_RUU.$(PSUFFIX) : ztrmv_U.c ../../common.h +xtrmv_RUU.$(SUFFIX) xtrmv_RUU.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) -xtrmv_RUN.$(SUFFIX) xtrmv_RUN.$(PSUFFIX) : ztrmv_U.c ../../common.h +xtrmv_RUN.$(SUFFIX) xtrmv_RUN.$(PSUFFIX) : ztrmv_U.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) -xtrmv_CUU.$(SUFFIX) xtrmv_CUU.$(PSUFFIX) : ztrmv_L.c ../../common.h +xtrmv_CUU.$(SUFFIX) xtrmv_CUU.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) -xtrmv_CUN.$(SUFFIX) xtrmv_CUN.$(PSUFFIX) : ztrmv_L.c ../../common.h +xtrmv_CUN.$(SUFFIX) xtrmv_CUN.$(PSUFFIX) : ztrmv_L.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) -strmv_thread_NUU.$(SUFFIX) strmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h +strmv_thread_NUU.$(SUFFIX) strmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) -strmv_thread_NUN.$(SUFFIX) strmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h +strmv_thread_NUN.$(SUFFIX) strmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) -strmv_thread_TLU.$(SUFFIX) strmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h +strmv_thread_TLU.$(SUFFIX) strmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) -strmv_thread_TLN.$(SUFFIX) strmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h +strmv_thread_TLN.$(SUFFIX) strmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) -strmv_thread_NLU.$(SUFFIX) strmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h +strmv_thread_NLU.$(SUFFIX) strmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) -strmv_thread_NLN.$(SUFFIX) strmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h +strmv_thread_NLN.$(SUFFIX) strmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) -strmv_thread_TUU.$(SUFFIX) strmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h +strmv_thread_TUU.$(SUFFIX) strmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) -strmv_thread_TUN.$(SUFFIX) strmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h +strmv_thread_TUN.$(SUFFIX) strmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) -dtrmv_thread_NUU.$(SUFFIX) dtrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h +dtrmv_thread_NUU.$(SUFFIX) dtrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) -dtrmv_thread_NUN.$(SUFFIX) dtrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h +dtrmv_thread_NUN.$(SUFFIX) dtrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) -dtrmv_thread_TLU.$(SUFFIX) dtrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h +dtrmv_thread_TLU.$(SUFFIX) dtrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) -dtrmv_thread_TLN.$(SUFFIX) dtrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h +dtrmv_thread_TLN.$(SUFFIX) dtrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) -dtrmv_thread_NLU.$(SUFFIX) dtrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h +dtrmv_thread_NLU.$(SUFFIX) dtrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) -dtrmv_thread_NLN.$(SUFFIX) dtrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h +dtrmv_thread_NLN.$(SUFFIX) dtrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) -dtrmv_thread_TUU.$(SUFFIX) dtrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h +dtrmv_thread_TUU.$(SUFFIX) dtrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) -dtrmv_thread_TUN.$(SUFFIX) dtrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h +dtrmv_thread_TUN.$(SUFFIX) dtrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) -qtrmv_thread_NUU.$(SUFFIX) qtrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h +qtrmv_thread_NUU.$(SUFFIX) qtrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) -qtrmv_thread_NUN.$(SUFFIX) qtrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h +qtrmv_thread_NUN.$(SUFFIX) qtrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) -qtrmv_thread_TLU.$(SUFFIX) qtrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h +qtrmv_thread_TLU.$(SUFFIX) qtrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) -qtrmv_thread_TLN.$(SUFFIX) qtrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h +qtrmv_thread_TLN.$(SUFFIX) qtrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) -qtrmv_thread_NLU.$(SUFFIX) qtrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h +qtrmv_thread_NLU.$(SUFFIX) qtrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) -qtrmv_thread_NLN.$(SUFFIX) qtrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h +qtrmv_thread_NLN.$(SUFFIX) qtrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) -qtrmv_thread_TUU.$(SUFFIX) qtrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h +qtrmv_thread_TUU.$(SUFFIX) qtrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) -qtrmv_thread_TUN.$(SUFFIX) qtrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h +qtrmv_thread_TUN.$(SUFFIX) qtrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) -ctrmv_thread_NUU.$(SUFFIX) ctrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h +ctrmv_thread_NUU.$(SUFFIX) ctrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) -ctrmv_thread_NUN.$(SUFFIX) ctrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h +ctrmv_thread_NUN.$(SUFFIX) ctrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) -ctrmv_thread_TLU.$(SUFFIX) ctrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h +ctrmv_thread_TLU.$(SUFFIX) ctrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) -ctrmv_thread_TLN.$(SUFFIX) ctrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h +ctrmv_thread_TLN.$(SUFFIX) ctrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) -ctrmv_thread_RLU.$(SUFFIX) ctrmv_thread_RLU.$(PSUFFIX) : trmv_thread.c ../../common.h +ctrmv_thread_RLU.$(SUFFIX) ctrmv_thread_RLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) -ctrmv_thread_RLN.$(SUFFIX) ctrmv_thread_RLN.$(PSUFFIX) : trmv_thread.c ../../common.h +ctrmv_thread_RLN.$(SUFFIX) ctrmv_thread_RLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) -ctrmv_thread_CLU.$(SUFFIX) ctrmv_thread_CLU.$(PSUFFIX) : trmv_thread.c ../../common.h +ctrmv_thread_CLU.$(SUFFIX) ctrmv_thread_CLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) -ctrmv_thread_CLN.$(SUFFIX) ctrmv_thread_CLN.$(PSUFFIX) : trmv_thread.c ../../common.h +ctrmv_thread_CLN.$(SUFFIX) ctrmv_thread_CLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) -ctrmv_thread_NLU.$(SUFFIX) ctrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h +ctrmv_thread_NLU.$(SUFFIX) ctrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) -ctrmv_thread_NLN.$(SUFFIX) ctrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h +ctrmv_thread_NLN.$(SUFFIX) ctrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) -ctrmv_thread_TUU.$(SUFFIX) ctrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h +ctrmv_thread_TUU.$(SUFFIX) ctrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) -ctrmv_thread_TUN.$(SUFFIX) ctrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h +ctrmv_thread_TUN.$(SUFFIX) ctrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) -ctrmv_thread_RUU.$(SUFFIX) ctrmv_thread_RUU.$(PSUFFIX) : trmv_thread.c ../../common.h +ctrmv_thread_RUU.$(SUFFIX) ctrmv_thread_RUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) -ctrmv_thread_RUN.$(SUFFIX) ctrmv_thread_RUN.$(PSUFFIX) : trmv_thread.c ../../common.h +ctrmv_thread_RUN.$(SUFFIX) ctrmv_thread_RUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) -ctrmv_thread_CUU.$(SUFFIX) ctrmv_thread_CUU.$(PSUFFIX) : trmv_thread.c ../../common.h +ctrmv_thread_CUU.$(SUFFIX) ctrmv_thread_CUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) -ctrmv_thread_CUN.$(SUFFIX) ctrmv_thread_CUN.$(PSUFFIX) : trmv_thread.c ../../common.h +ctrmv_thread_CUN.$(SUFFIX) ctrmv_thread_CUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) -ztrmv_thread_NUU.$(SUFFIX) ztrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h +ztrmv_thread_NUU.$(SUFFIX) ztrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) -ztrmv_thread_NUN.$(SUFFIX) ztrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h +ztrmv_thread_NUN.$(SUFFIX) ztrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) -ztrmv_thread_TLU.$(SUFFIX) ztrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h +ztrmv_thread_TLU.$(SUFFIX) ztrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) -ztrmv_thread_TLN.$(SUFFIX) ztrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h +ztrmv_thread_TLN.$(SUFFIX) ztrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) -ztrmv_thread_RLU.$(SUFFIX) ztrmv_thread_RLU.$(PSUFFIX) : trmv_thread.c ../../common.h +ztrmv_thread_RLU.$(SUFFIX) ztrmv_thread_RLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) -ztrmv_thread_RLN.$(SUFFIX) ztrmv_thread_RLN.$(PSUFFIX) : trmv_thread.c ../../common.h +ztrmv_thread_RLN.$(SUFFIX) ztrmv_thread_RLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) -ztrmv_thread_CLU.$(SUFFIX) ztrmv_thread_CLU.$(PSUFFIX) : trmv_thread.c ../../common.h +ztrmv_thread_CLU.$(SUFFIX) ztrmv_thread_CLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) -ztrmv_thread_CLN.$(SUFFIX) ztrmv_thread_CLN.$(PSUFFIX) : trmv_thread.c ../../common.h +ztrmv_thread_CLN.$(SUFFIX) ztrmv_thread_CLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) -ztrmv_thread_NLU.$(SUFFIX) ztrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h +ztrmv_thread_NLU.$(SUFFIX) ztrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) -ztrmv_thread_NLN.$(SUFFIX) ztrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h +ztrmv_thread_NLN.$(SUFFIX) ztrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) -ztrmv_thread_TUU.$(SUFFIX) ztrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h +ztrmv_thread_TUU.$(SUFFIX) ztrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) -ztrmv_thread_TUN.$(SUFFIX) ztrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h +ztrmv_thread_TUN.$(SUFFIX) ztrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) -ztrmv_thread_RUU.$(SUFFIX) ztrmv_thread_RUU.$(PSUFFIX) : trmv_thread.c ../../common.h +ztrmv_thread_RUU.$(SUFFIX) ztrmv_thread_RUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) -ztrmv_thread_RUN.$(SUFFIX) ztrmv_thread_RUN.$(PSUFFIX) : trmv_thread.c ../../common.h +ztrmv_thread_RUN.$(SUFFIX) ztrmv_thread_RUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) -ztrmv_thread_CUU.$(SUFFIX) ztrmv_thread_CUU.$(PSUFFIX) : trmv_thread.c ../../common.h +ztrmv_thread_CUU.$(SUFFIX) ztrmv_thread_CUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) -ztrmv_thread_CUN.$(SUFFIX) ztrmv_thread_CUN.$(PSUFFIX) : trmv_thread.c ../../common.h +ztrmv_thread_CUN.$(SUFFIX) ztrmv_thread_CUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) -xtrmv_thread_NUU.$(SUFFIX) xtrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h +xtrmv_thread_NUU.$(SUFFIX) xtrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) -xtrmv_thread_NUN.$(SUFFIX) xtrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h +xtrmv_thread_NUN.$(SUFFIX) xtrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) -xtrmv_thread_TLU.$(SUFFIX) xtrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h +xtrmv_thread_TLU.$(SUFFIX) xtrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) -xtrmv_thread_TLN.$(SUFFIX) xtrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h +xtrmv_thread_TLN.$(SUFFIX) xtrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) -xtrmv_thread_RLU.$(SUFFIX) xtrmv_thread_RLU.$(PSUFFIX) : trmv_thread.c ../../common.h +xtrmv_thread_RLU.$(SUFFIX) xtrmv_thread_RLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) -xtrmv_thread_RLN.$(SUFFIX) xtrmv_thread_RLN.$(PSUFFIX) : trmv_thread.c ../../common.h +xtrmv_thread_RLN.$(SUFFIX) xtrmv_thread_RLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) -xtrmv_thread_CLU.$(SUFFIX) xtrmv_thread_CLU.$(PSUFFIX) : trmv_thread.c ../../common.h +xtrmv_thread_CLU.$(SUFFIX) xtrmv_thread_CLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) -xtrmv_thread_CLN.$(SUFFIX) xtrmv_thread_CLN.$(PSUFFIX) : trmv_thread.c ../../common.h +xtrmv_thread_CLN.$(SUFFIX) xtrmv_thread_CLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) -xtrmv_thread_NLU.$(SUFFIX) xtrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h +xtrmv_thread_NLU.$(SUFFIX) xtrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) -xtrmv_thread_NLN.$(SUFFIX) xtrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h +xtrmv_thread_NLN.$(SUFFIX) xtrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) -xtrmv_thread_TUU.$(SUFFIX) xtrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h +xtrmv_thread_TUU.$(SUFFIX) xtrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) -xtrmv_thread_TUN.$(SUFFIX) xtrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h +xtrmv_thread_TUN.$(SUFFIX) xtrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) -xtrmv_thread_RUU.$(SUFFIX) xtrmv_thread_RUU.$(PSUFFIX) : trmv_thread.c ../../common.h +xtrmv_thread_RUU.$(SUFFIX) xtrmv_thread_RUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) -xtrmv_thread_RUN.$(SUFFIX) xtrmv_thread_RUN.$(PSUFFIX) : trmv_thread.c ../../common.h +xtrmv_thread_RUN.$(SUFFIX) xtrmv_thread_RUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) -xtrmv_thread_CUU.$(SUFFIX) xtrmv_thread_CUU.$(PSUFFIX) : trmv_thread.c ../../common.h +xtrmv_thread_CUU.$(SUFFIX) xtrmv_thread_CUU.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) -xtrmv_thread_CUN.$(SUFFIX) xtrmv_thread_CUN.$(PSUFFIX) : trmv_thread.c ../../common.h +xtrmv_thread_CUN.$(SUFFIX) xtrmv_thread_CUN.$(PSUFFIX) : trmv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) -strsv_NUU.$(SUFFIX) strsv_NUU.$(PSUFFIX) : trsv_U.c ../../param.h +strsv_NUU.$(SUFFIX) strsv_NUU.$(PSUFFIX) : trsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) -strsv_NUN.$(SUFFIX) strsv_NUN.$(PSUFFIX) : trsv_U.c ../../param.h +strsv_NUN.$(SUFFIX) strsv_NUN.$(PSUFFIX) : trsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) -strsv_TLU.$(SUFFIX) strsv_TLU.$(PSUFFIX) : trsv_U.c ../../param.h +strsv_TLU.$(SUFFIX) strsv_TLU.$(PSUFFIX) : trsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) -strsv_TLN.$(SUFFIX) strsv_TLN.$(PSUFFIX) : trsv_U.c ../../param.h +strsv_TLN.$(SUFFIX) strsv_TLN.$(PSUFFIX) : trsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) -strsv_NLU.$(SUFFIX) strsv_NLU.$(PSUFFIX) : trsv_L.c ../../param.h +strsv_NLU.$(SUFFIX) strsv_NLU.$(PSUFFIX) : trsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) -strsv_NLN.$(SUFFIX) strsv_NLN.$(PSUFFIX) : trsv_L.c ../../param.h +strsv_NLN.$(SUFFIX) strsv_NLN.$(PSUFFIX) : trsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) -strsv_TUU.$(SUFFIX) strsv_TUU.$(PSUFFIX) : trsv_L.c ../../param.h +strsv_TUU.$(SUFFIX) strsv_TUU.$(PSUFFIX) : trsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) -strsv_TUN.$(SUFFIX) strsv_TUN.$(PSUFFIX) : trsv_L.c ../../param.h +strsv_TUN.$(SUFFIX) strsv_TUN.$(PSUFFIX) : trsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) -dtrsv_NUU.$(SUFFIX) dtrsv_NUU.$(PSUFFIX) : trsv_U.c ../../param.h +dtrsv_NUU.$(SUFFIX) dtrsv_NUU.$(PSUFFIX) : trsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) -dtrsv_NUN.$(SUFFIX) dtrsv_NUN.$(PSUFFIX) : trsv_U.c ../../param.h +dtrsv_NUN.$(SUFFIX) dtrsv_NUN.$(PSUFFIX) : trsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) -dtrsv_TLU.$(SUFFIX) dtrsv_TLU.$(PSUFFIX) : trsv_U.c ../../param.h +dtrsv_TLU.$(SUFFIX) dtrsv_TLU.$(PSUFFIX) : trsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) -dtrsv_TLN.$(SUFFIX) dtrsv_TLN.$(PSUFFIX) : trsv_U.c ../../param.h +dtrsv_TLN.$(SUFFIX) dtrsv_TLN.$(PSUFFIX) : trsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) -dtrsv_NLU.$(SUFFIX) dtrsv_NLU.$(PSUFFIX) : trsv_L.c ../../param.h +dtrsv_NLU.$(SUFFIX) dtrsv_NLU.$(PSUFFIX) : trsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) -dtrsv_NLN.$(SUFFIX) dtrsv_NLN.$(PSUFFIX) : trsv_L.c ../../param.h +dtrsv_NLN.$(SUFFIX) dtrsv_NLN.$(PSUFFIX) : trsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) -dtrsv_TUU.$(SUFFIX) dtrsv_TUU.$(PSUFFIX) : trsv_L.c ../../param.h +dtrsv_TUU.$(SUFFIX) dtrsv_TUU.$(PSUFFIX) : trsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) -dtrsv_TUN.$(SUFFIX) dtrsv_TUN.$(PSUFFIX) : trsv_L.c ../../param.h +dtrsv_TUN.$(SUFFIX) dtrsv_TUN.$(PSUFFIX) : trsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) -qtrsv_NUU.$(SUFFIX) qtrsv_NUU.$(PSUFFIX) : trsv_U.c ../../param.h +qtrsv_NUU.$(SUFFIX) qtrsv_NUU.$(PSUFFIX) : trsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) -qtrsv_NUN.$(SUFFIX) qtrsv_NUN.$(PSUFFIX) : trsv_U.c ../../param.h +qtrsv_NUN.$(SUFFIX) qtrsv_NUN.$(PSUFFIX) : trsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) -qtrsv_TLU.$(SUFFIX) qtrsv_TLU.$(PSUFFIX) : trsv_U.c ../../param.h +qtrsv_TLU.$(SUFFIX) qtrsv_TLU.$(PSUFFIX) : trsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) -qtrsv_TLN.$(SUFFIX) qtrsv_TLN.$(PSUFFIX) : trsv_U.c ../../param.h +qtrsv_TLN.$(SUFFIX) qtrsv_TLN.$(PSUFFIX) : trsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) -qtrsv_NLU.$(SUFFIX) qtrsv_NLU.$(PSUFFIX) : trsv_L.c ../../param.h +qtrsv_NLU.$(SUFFIX) qtrsv_NLU.$(PSUFFIX) : trsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) -qtrsv_NLN.$(SUFFIX) qtrsv_NLN.$(PSUFFIX) : trsv_L.c ../../param.h +qtrsv_NLN.$(SUFFIX) qtrsv_NLN.$(PSUFFIX) : trsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) -qtrsv_TUU.$(SUFFIX) qtrsv_TUU.$(PSUFFIX) : trsv_L.c ../../param.h +qtrsv_TUU.$(SUFFIX) qtrsv_TUU.$(PSUFFIX) : trsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) -qtrsv_TUN.$(SUFFIX) qtrsv_TUN.$(PSUFFIX) : trsv_L.c ../../param.h +qtrsv_TUN.$(SUFFIX) qtrsv_TUN.$(PSUFFIX) : trsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) -ctrsv_NUU.$(SUFFIX) ctrsv_NUU.$(PSUFFIX) : ztrsv_U.c ../../param.h +ctrsv_NUU.$(SUFFIX) ctrsv_NUU.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) -ctrsv_NUN.$(SUFFIX) ctrsv_NUN.$(PSUFFIX) : ztrsv_U.c ../../param.h +ctrsv_NUN.$(SUFFIX) ctrsv_NUN.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) -ctrsv_TLU.$(SUFFIX) ctrsv_TLU.$(PSUFFIX) : ztrsv_U.c ../../param.h +ctrsv_TLU.$(SUFFIX) ctrsv_TLU.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) -ctrsv_TLN.$(SUFFIX) ctrsv_TLN.$(PSUFFIX) : ztrsv_U.c ../../param.h +ctrsv_TLN.$(SUFFIX) ctrsv_TLN.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) -ctrsv_RLU.$(SUFFIX) ctrsv_RLU.$(PSUFFIX) : ztrsv_L.c ../../param.h +ctrsv_RLU.$(SUFFIX) ctrsv_RLU.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) -ctrsv_RLN.$(SUFFIX) ctrsv_RLN.$(PSUFFIX) : ztrsv_L.c ../../param.h +ctrsv_RLN.$(SUFFIX) ctrsv_RLN.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) -ctrsv_CLU.$(SUFFIX) ctrsv_CLU.$(PSUFFIX) : ztrsv_U.c ../../param.h +ctrsv_CLU.$(SUFFIX) ctrsv_CLU.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) -ctrsv_CLN.$(SUFFIX) ctrsv_CLN.$(PSUFFIX) : ztrsv_U.c ../../param.h +ctrsv_CLN.$(SUFFIX) ctrsv_CLN.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) -ctrsv_NLU.$(SUFFIX) ctrsv_NLU.$(PSUFFIX) : ztrsv_L.c ../../param.h +ctrsv_NLU.$(SUFFIX) ctrsv_NLU.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) -ctrsv_NLN.$(SUFFIX) ctrsv_NLN.$(PSUFFIX) : ztrsv_L.c ../../param.h +ctrsv_NLN.$(SUFFIX) ctrsv_NLN.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) -ctrsv_TUU.$(SUFFIX) ctrsv_TUU.$(PSUFFIX) : ztrsv_L.c ../../param.h +ctrsv_TUU.$(SUFFIX) ctrsv_TUU.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) -ctrsv_TUN.$(SUFFIX) ctrsv_TUN.$(PSUFFIX) : ztrsv_L.c ../../param.h +ctrsv_TUN.$(SUFFIX) ctrsv_TUN.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) -ctrsv_RUU.$(SUFFIX) ctrsv_RUU.$(PSUFFIX) : ztrsv_U.c ../../param.h +ctrsv_RUU.$(SUFFIX) ctrsv_RUU.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) -ctrsv_RUN.$(SUFFIX) ctrsv_RUN.$(PSUFFIX) : ztrsv_U.c ../../param.h +ctrsv_RUN.$(SUFFIX) ctrsv_RUN.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) -ctrsv_CUU.$(SUFFIX) ctrsv_CUU.$(PSUFFIX) : ztrsv_L.c ../../param.h +ctrsv_CUU.$(SUFFIX) ctrsv_CUU.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) -ctrsv_CUN.$(SUFFIX) ctrsv_CUN.$(PSUFFIX) : ztrsv_L.c ../../param.h +ctrsv_CUN.$(SUFFIX) ctrsv_CUN.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) -ztrsv_NUU.$(SUFFIX) ztrsv_NUU.$(PSUFFIX) : ztrsv_U.c ../../param.h +ztrsv_NUU.$(SUFFIX) ztrsv_NUU.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) -ztrsv_NUN.$(SUFFIX) ztrsv_NUN.$(PSUFFIX) : ztrsv_U.c ../../param.h +ztrsv_NUN.$(SUFFIX) ztrsv_NUN.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) -ztrsv_TLU.$(SUFFIX) ztrsv_TLU.$(PSUFFIX) : ztrsv_U.c ../../param.h +ztrsv_TLU.$(SUFFIX) ztrsv_TLU.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) -ztrsv_TLN.$(SUFFIX) ztrsv_TLN.$(PSUFFIX) : ztrsv_U.c ../../param.h +ztrsv_TLN.$(SUFFIX) ztrsv_TLN.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) -ztrsv_RLU.$(SUFFIX) ztrsv_RLU.$(PSUFFIX) : ztrsv_L.c ../../param.h +ztrsv_RLU.$(SUFFIX) ztrsv_RLU.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) -ztrsv_RLN.$(SUFFIX) ztrsv_RLN.$(PSUFFIX) : ztrsv_L.c ../../param.h +ztrsv_RLN.$(SUFFIX) ztrsv_RLN.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) -ztrsv_CLU.$(SUFFIX) ztrsv_CLU.$(PSUFFIX) : ztrsv_U.c ../../param.h +ztrsv_CLU.$(SUFFIX) ztrsv_CLU.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) -ztrsv_CLN.$(SUFFIX) ztrsv_CLN.$(PSUFFIX) : ztrsv_U.c ../../param.h +ztrsv_CLN.$(SUFFIX) ztrsv_CLN.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) -ztrsv_NLU.$(SUFFIX) ztrsv_NLU.$(PSUFFIX) : ztrsv_L.c ../../param.h +ztrsv_NLU.$(SUFFIX) ztrsv_NLU.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) -ztrsv_NLN.$(SUFFIX) ztrsv_NLN.$(PSUFFIX) : ztrsv_L.c ../../param.h +ztrsv_NLN.$(SUFFIX) ztrsv_NLN.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) -ztrsv_TUU.$(SUFFIX) ztrsv_TUU.$(PSUFFIX) : ztrsv_L.c ../../param.h +ztrsv_TUU.$(SUFFIX) ztrsv_TUU.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) -ztrsv_TUN.$(SUFFIX) ztrsv_TUN.$(PSUFFIX) : ztrsv_L.c ../../param.h +ztrsv_TUN.$(SUFFIX) ztrsv_TUN.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) -ztrsv_RUU.$(SUFFIX) ztrsv_RUU.$(PSUFFIX) : ztrsv_U.c ../../param.h +ztrsv_RUU.$(SUFFIX) ztrsv_RUU.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) -ztrsv_RUN.$(SUFFIX) ztrsv_RUN.$(PSUFFIX) : ztrsv_U.c ../../param.h +ztrsv_RUN.$(SUFFIX) ztrsv_RUN.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) -ztrsv_CUU.$(SUFFIX) ztrsv_CUU.$(PSUFFIX) : ztrsv_L.c ../../param.h +ztrsv_CUU.$(SUFFIX) ztrsv_CUU.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) -ztrsv_CUN.$(SUFFIX) ztrsv_CUN.$(PSUFFIX) : ztrsv_L.c ../../param.h +ztrsv_CUN.$(SUFFIX) ztrsv_CUN.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) -xtrsv_NUU.$(SUFFIX) xtrsv_NUU.$(PSUFFIX) : ztrsv_U.c ../../param.h +xtrsv_NUU.$(SUFFIX) xtrsv_NUU.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) -xtrsv_NUN.$(SUFFIX) xtrsv_NUN.$(PSUFFIX) : ztrsv_U.c ../../param.h +xtrsv_NUN.$(SUFFIX) xtrsv_NUN.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) -xtrsv_TLU.$(SUFFIX) xtrsv_TLU.$(PSUFFIX) : ztrsv_U.c ../../param.h +xtrsv_TLU.$(SUFFIX) xtrsv_TLU.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) -xtrsv_TLN.$(SUFFIX) xtrsv_TLN.$(PSUFFIX) : ztrsv_U.c ../../param.h +xtrsv_TLN.$(SUFFIX) xtrsv_TLN.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) -xtrsv_RLU.$(SUFFIX) xtrsv_RLU.$(PSUFFIX) : ztrsv_L.c ../../param.h +xtrsv_RLU.$(SUFFIX) xtrsv_RLU.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) -xtrsv_RLN.$(SUFFIX) xtrsv_RLN.$(PSUFFIX) : ztrsv_L.c ../../param.h +xtrsv_RLN.$(SUFFIX) xtrsv_RLN.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) -xtrsv_CLU.$(SUFFIX) xtrsv_CLU.$(PSUFFIX) : ztrsv_U.c ../../param.h +xtrsv_CLU.$(SUFFIX) xtrsv_CLU.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) -xtrsv_CLN.$(SUFFIX) xtrsv_CLN.$(PSUFFIX) : ztrsv_U.c ../../param.h +xtrsv_CLN.$(SUFFIX) xtrsv_CLN.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) -xtrsv_NLU.$(SUFFIX) xtrsv_NLU.$(PSUFFIX) : ztrsv_L.c ../../param.h +xtrsv_NLU.$(SUFFIX) xtrsv_NLU.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) -xtrsv_NLN.$(SUFFIX) xtrsv_NLN.$(PSUFFIX) : ztrsv_L.c ../../param.h +xtrsv_NLN.$(SUFFIX) xtrsv_NLN.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) -xtrsv_TUU.$(SUFFIX) xtrsv_TUU.$(PSUFFIX) : ztrsv_L.c ../../param.h +xtrsv_TUU.$(SUFFIX) xtrsv_TUU.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) -xtrsv_TUN.$(SUFFIX) xtrsv_TUN.$(PSUFFIX) : ztrsv_L.c ../../param.h +xtrsv_TUN.$(SUFFIX) xtrsv_TUN.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) -xtrsv_RUU.$(SUFFIX) xtrsv_RUU.$(PSUFFIX) : ztrsv_U.c ../../param.h +xtrsv_RUU.$(SUFFIX) xtrsv_RUU.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) -xtrsv_RUN.$(SUFFIX) xtrsv_RUN.$(PSUFFIX) : ztrsv_U.c ../../param.h +xtrsv_RUN.$(SUFFIX) xtrsv_RUN.$(PSUFFIX) : ztrsv_U.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) -xtrsv_CUU.$(SUFFIX) xtrsv_CUU.$(PSUFFIX) : ztrsv_L.c ../../param.h +xtrsv_CUU.$(SUFFIX) xtrsv_CUU.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) -xtrsv_CUN.$(SUFFIX) xtrsv_CUN.$(PSUFFIX) : ztrsv_L.c ../../param.h +xtrsv_CUN.$(SUFFIX) xtrsv_CUN.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) include ../../Makefile.tail diff --git a/driver/level2/gbmv_k.c b/driver/level2/gbmv_k.c index 317d42047..4b29d70d1 100644 --- a/driver/level2/gbmv_k.c +++ b/driver/level2/gbmv_k.c @@ -84,12 +84,12 @@ void CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT alpha, #ifndef TRANS AXPYU_K(length, 0, 0, - alpha * X[i], + alpha * X[i], a + start, 1, Y + start - offset_u, 1, NULL, 0); #else Y[i] += alpha * DOTU_K(length, a + start, 1, X + start - offset_u, 1); #endif - + offset_u --; offset_l --; diff --git a/driver/level2/gbmv_thread.c b/driver/level2/gbmv_thread.c index 18aae26ae..9efe17092 100644 --- a/driver/level2/gbmv_thread.c +++ b/driver/level2/gbmv_thread.c @@ -105,13 +105,13 @@ static int gbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F args -> m, #else args -> n, -#endif - 0, 0, ZERO, +#endif + 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif - y, 1, NULL, 0, NULL, 0); - + y, 1, NULL, 0, NULL, 0); + offset_u = ku - n_from; offset_l = ku - n_from + args -> m; @@ -157,7 +157,7 @@ static int gbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F x += COMPSIZE; #endif - + y += COMPSIZE; offset_u --; @@ -190,7 +190,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -198,27 +198,27 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif #endif args.m = m; args.n = n; - + args.a = (void *)a; args.b = (void *)x; args.c = (void *)buffer; - + args.lda = lda; args.ldb = incx; args.ldc = ku; args.ldd = kl; num_cpu = 0; - + range_n[0] = 0; i = n; - + while (i > 0){ width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); @@ -227,7 +227,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT if (i < width) width = i; range_n[num_cpu + 1] = range_n[num_cpu] + width; - + #ifndef TRANSA range_m[num_cpu] = num_cpu * ((m + 15) & ~15); #else @@ -242,7 +242,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - + num_cpu ++; i -= width; } @@ -254,12 +254,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT #else queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE; #endif - + queue[num_cpu - 1].next = NULL; - + exec_blas(num_cpu, queue); } - + for (i = 1; i < num_cpu; i ++) { AXPYU_K( #ifndef TRANSA diff --git a/driver/level2/gemv_thread.c b/driver/level2/gemv_thread.c index 5f8abf26f..ddd475367 100644 --- a/driver/level2/gemv_thread.c +++ b/driver/level2/gemv_thread.c @@ -110,7 +110,7 @@ static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F *((FLOAT *)args -> alpha + 1), #endif a, lda, x, incx, y, incy, buffer); - + return 0; } @@ -134,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -142,17 +142,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif #endif args.m = m; args.n = n; - + args.a = (void *)a; args.b = (void *)x; args.c = (void *)y; - + args.lda = lda; args.ldb = incx; args.ldc = incy; @@ -164,14 +164,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x #endif num_cpu = 0; - + range[0] = 0; #ifndef TRANSA i = m; #else i = n; #endif - + while (i > 0){ width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); @@ -179,7 +179,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x if (i < width) width = i; range[num_cpu + 1] = range[num_cpu] + width; - + queue[num_cpu].mode = mode; queue[num_cpu].routine = gemv_kernel; queue[num_cpu].args = &args; @@ -193,7 +193,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - + num_cpu ++; i -= width; } @@ -202,9 +202,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x queue[0].sa = NULL; queue[0].sb = buffer; queue[num_cpu - 1].next = NULL; - + exec_blas(num_cpu, queue); } - + return 0; } diff --git a/driver/level2/ger_thread.c b/driver/level2/ger_thread.c index 9e2f520ef..0a5e14cef 100644 --- a/driver/level2/ger_thread.c +++ b/driver/level2/ger_thread.c @@ -102,7 +102,7 @@ static int ger_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL #endif #endif x, 1, a, 1, NULL, 0); - + y += incy * COMPSIZE; a += lda * COMPSIZE; } @@ -130,7 +130,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT * int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -138,17 +138,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT * int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif #endif args.m = m; args.n = n; - + args.a = (void *)x; args.b = (void *)y; args.c = (void *)a; - + args.lda = incx; args.ldb = incy; args.ldc = lda; @@ -160,18 +160,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT * #endif num_cpu = 0; - + range_n[0] = 0; i = n; - + while (i > 0){ - + width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); if (width < 4) width = 4; if (i < width) width = i; range_n[num_cpu + 1] = range_n[num_cpu] + width; - + queue[num_cpu].mode = mode; queue[num_cpu].routine = ger_kernel; queue[num_cpu].args = &args; @@ -179,19 +179,19 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT * queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - + num_cpu ++; i -= width; } - + if (num_cpu) { queue[0].sa = NULL; queue[0].sb = buffer; - + queue[num_cpu - 1].next = NULL; - + exec_blas(num_cpu, queue); } - + return 0; } diff --git a/driver/level2/sbmv_k.c b/driver/level2/sbmv_k.c index d0adc678a..ef7fa378c 100644 --- a/driver/level2/sbmv_k.c +++ b/driver/level2/sbmv_k.c @@ -72,7 +72,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha, if (length > k) length = k; AXPYU_K(length + 1, 0, 0, - alpha * X[i], + alpha * X[i], a + k - length, 1, Y + i - length, 1, NULL, 0); Y[i] += alpha * DOTU_K(length, a + k - length, 1, X + i - length, 1); #else @@ -80,11 +80,11 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha, if (n - i - 1 < k) length = n - i - 1; AXPYU_K(length + 1, 0, 0, - alpha * X[i], + alpha * X[i], a, 1, Y + i, 1, NULL, 0); Y[i] += alpha * DOTU_K(length, a + 1, 1, X + i + 1, 1); #endif - + a += lda; } diff --git a/driver/level2/sbmv_thread.c b/driver/level2/sbmv_thread.c index 7dfabfa81..5b7fc7332 100644 --- a/driver/level2/sbmv_thread.c +++ b/driver/level2/sbmv_thread.c @@ -76,7 +76,7 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F n_to = n; //Use y as each thread's n* COMPSIZE elements in sb buffer - y = buffer; + y = buffer; buffer += ((COMPSIZE * n + 1023) & ~1023); if (range_m) { @@ -94,12 +94,12 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F buffer += ((COMPSIZE * n + 1023) & ~1023); } - SCAL_K(n, 0, 0, ZERO, + SCAL_K(n, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif - y, 1, NULL, 0, NULL, 0); - + y, 1, NULL, 0, NULL, 0); + for (i = n_from; i < n_to; i++) { #ifndef LOWER @@ -193,7 +193,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -201,52 +201,52 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif #endif args.n = n; args.k = k; - + args.a = (void *)a; args.b = (void *)x; args.c = (void *)buffer; - + args.lda = lda; args.ldb = incx; args.ldc = incy; dnum = (double)n * (double)n / (double)nthreads; num_cpu = 0; - + if (n < 2 * k) { #ifndef LOWER range_m[MAX_CPU_NUMBER] = n; i = 0; - + while (i < n){ - + if (nthreads - num_cpu > 1) { - + double di = (double)(n - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { width = n - i; } - + if (width < 16) width = 16; if (width > n - i) width = n - i; - + } else { width = n - i; } - + range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); - + queue[num_cpu].mode = mode; queue[num_cpu].routine = sbmv_kernel; queue[num_cpu].args = &args; @@ -255,37 +255,37 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - + num_cpu ++; i += width; } - + #else - + range_m[0] = 0; i = 0; - + while (i < n){ - + if (nthreads - num_cpu > 1) { - + double di = (double)(n - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { width = n - i; } - + if (width < 16) width = 16; if (width > n - i) width = n - i; - + } else { width = n - i; } - + range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); - + queue[num_cpu].mode = mode; queue[num_cpu].routine = sbmv_kernel; queue[num_cpu].args = &args; @@ -294,29 +294,29 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - + num_cpu ++; i += width; } - + #endif - + } else { - + range_m[0] = 0; i = n; - + while (i > 0){ - + width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); - + if (width < 4) width = 4; if (i < width) width = i; - + range_m[num_cpu + 1] = range_m[num_cpu] + width; - + range_n[num_cpu] = num_cpu * ((n + 15) & ~15); - + queue[num_cpu].mode = mode; queue[num_cpu].routine = sbmv_kernel; queue[num_cpu].args = &args; @@ -325,7 +325,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - + num_cpu ++; i -= width; } @@ -335,10 +335,10 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x queue[0].sa = NULL; queue[0].sb = buffer; queue[num_cpu - 1].next = NULL; - + exec_blas(num_cpu, queue); } - + for (i = 1; i < num_cpu; i ++) { AXPYU_K(n, 0, 0, #ifndef COMPLEX @@ -356,6 +356,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x alpha[0], alpha[1], #endif buffer, 1, y, incy, NULL, 0); - + return 0; } diff --git a/driver/level2/spmv_k.c b/driver/level2/spmv_k.c index 07ec66095..8ce0abdf7 100644 --- a/driver/level2/spmv_k.c +++ b/driver/level2/spmv_k.c @@ -68,7 +68,7 @@ int CNAME(BLASLONG m, FLOAT alpha, FLOAT *a, if (i > 0) Y[i] += alpha * DOTU_K(i, a, 1, X, 1); AXPYU_K(i + 1, 0, 0, alpha * X[i], a, 1, Y, 1, NULL, 0); a += i + 1; - + #else Y[i] += alpha * DOTU_K(m - i, a + i, 1, X + i, 1); if (m - i > 1) AXPYU_K(m - i - 1, 0, 0, alpha * X[i], diff --git a/driver/level2/spmv_thread.c b/driver/level2/spmv_thread.c index 7717bbf2b..93a2f44d4 100644 --- a/driver/level2/spmv_thread.c +++ b/driver/level2/spmv_thread.c @@ -91,17 +91,17 @@ static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F } #ifndef LOWER - SCAL_K(m_to, 0, 0, ZERO, + SCAL_K(m_to, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif - y, 1, NULL, 0, NULL, 0); + y, 1, NULL, 0, NULL, 0); #else - SCAL_K(args -> m - m_from, 0, 0, ZERO, + SCAL_K(args -> m - m_from, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif - y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); + y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); #endif #ifndef LOWER @@ -139,7 +139,7 @@ static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F a, 1, y, 1, NULL, 0); a += (i + 1) * COMPSIZE; - + #else #if !defined(HEMV) && !defined(HEMVREV) result = MYDOT(args -> m - i , a + i * COMPSIZE, 1, x + i * COMPSIZE, 1); @@ -198,7 +198,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -206,31 +206,31 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif #endif args.m = m; - + args.a = (void *)a; args.b = (void *)x; args.c = (void *)buffer; - + args.ldb = incx; args.ldc = incy; dnum = (double)m * (double)m / (double)nthreads; num_cpu = 0; - + #ifndef LOWER range_m[MAX_CPU_NUMBER] = m; i = 0; - + while (i < m){ - + if (nthreads - num_cpu > 1) { - + double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; @@ -240,14 +240,14 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, if (width < 16) width = 16; if (width > m - i) width = m - i; - + } else { width = m - i; } - + range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); - + queue[num_cpu].mode = mode; queue[num_cpu].routine = spmv_kernel; queue[num_cpu].args = &args; @@ -256,20 +256,20 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - + num_cpu ++; i += width; } - + #else range_m[0] = 0; i = 0; - + while (i < m){ - + if (nthreads - num_cpu > 1) { - + double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; @@ -279,14 +279,14 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, if (width < 16) width = 16; if (width > m - i) width = m - i; - + } else { width = m - i; } - + range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); - + queue[num_cpu].mode = mode; queue[num_cpu].routine = spmv_kernel; queue[num_cpu].args = &args; @@ -295,44 +295,44 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - + num_cpu ++; i += width; } - + #endif if (num_cpu) { queue[0].sa = NULL; queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE; - + queue[num_cpu - 1].next = NULL; - + exec_blas(num_cpu, queue); } - + for (i = 1; i < num_cpu; i ++) { - + #ifndef LOWER - + AXPYU_K(range_m[MAX_CPU_NUMBER - i], 0, 0, ONE, #ifdef COMPLEX - ZERO, + ZERO, #endif buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0); - + #else - + AXPYU_K(m - range_m[i], 0, 0, ONE, #ifdef COMPLEX - ZERO, + ZERO, #endif buffer + (range_n[i] + range_m[i]) * COMPSIZE, 1, buffer + range_m[i] * COMPSIZE, 1, NULL, 0); - + #endif - + } - + AXPYU_K(m, 0, 0, #ifndef COMPLEX alpha, @@ -340,6 +340,6 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, alpha[0], alpha[1], #endif buffer, 1, y, incy, NULL, 0); - + return 0; } diff --git a/driver/level2/spr2_k.c b/driver/level2/spr2_k.c index 58e14ebe2..e742b246f 100644 --- a/driver/level2/spr2_k.c +++ b/driver/level2/spr2_k.c @@ -40,7 +40,7 @@ #include #include "common.h" -int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT *x, BLASLONG incx, +int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, FLOAT *buffer){ BLASLONG i; diff --git a/driver/level2/spr2_thread.c b/driver/level2/spr2_thread.c index b20eb055a..10edb1eb1 100644 --- a/driver/level2/spr2_thread.c +++ b/driver/level2/spr2_thread.c @@ -116,7 +116,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL #else if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { #ifndef LOWER - AXPYU_K(i + 1, 0, 0, + AXPYU_K(i + 1, 0, 0, alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], y, 1, a, 1, NULL, 0); @@ -129,7 +129,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL } if ((y[i * COMPSIZE + 0] != ZERO) || (y[i * COMPSIZE + 1] != ZERO)) { #ifndef LOWER - AXPYU_K(i + 1, 0, 0, + AXPYU_K(i + 1, 0, 0, alpha_r * y[i * COMPSIZE + 0] - alpha_i * y[i * COMPSIZE + 1], alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1], x, 1, a, 1, NULL, 0); @@ -145,7 +145,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { #ifndef HEMVREV #ifndef LOWER - AXPYU_K(i + 1, 0, 0, + AXPYU_K(i + 1, 0, 0, alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], - alpha_i * x[i * COMPSIZE + 0] - alpha_r * x[i * COMPSIZE + 1], y, 1, a, 1, NULL, 0); @@ -157,7 +157,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL #endif #else #ifndef LOWER - AXPYC_K(i + 1, 0, 0, + AXPYC_K(i + 1, 0, 0, alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], y, 1, a, 1, NULL, 0); @@ -172,7 +172,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL if ((y[i * COMPSIZE + 0] != ZERO) || (y[i * COMPSIZE + 1] != ZERO)) { #ifndef HEMVREV #ifndef LOWER - AXPYU_K(i + 1, 0, 0, + AXPYU_K(i + 1, 0, 0, alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1], alpha_i * y[i * COMPSIZE + 0] - alpha_r * y[i * COMPSIZE + 1], x, 1, a, 1, NULL, 0); @@ -184,7 +184,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL #endif #else #ifndef LOWER - AXPYC_K(i + 1, 0, 0, + AXPYC_K(i + 1, 0, 0, alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1], - alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1], x, 1, a, 1, NULL, 0); @@ -202,14 +202,14 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL a[ 1] = ZERO; #endif #endif - + #ifndef LOWER a += (i + 1) * COMPSIZE; #else a += (args -> m - i) * COMPSIZE; #endif } - + return 0; } @@ -236,7 +236,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -244,16 +244,16 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif #endif args.m = m; - + args.a = (void *)x; args.b = (void *)y; args.c = (void *)a; - + args.lda = incx; args.ldb = incy; #ifndef COMPLEX @@ -264,16 +264,16 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG dnum = (double)m * (double)m / (double)nthreads; num_cpu = 0; - + #ifndef LOWER range_m[MAX_CPU_NUMBER] = m; i = 0; - + while (i < m){ - + if (nthreads - num_cpu > 1) { - + double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; @@ -283,13 +283,13 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG if (width < 16) width = 16; if (width > m - i) width = m - i; - + } else { width = m - i; } - + range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; - + queue[num_cpu].mode = mode; queue[num_cpu].routine = syr_kernel; queue[num_cpu].args = &args; @@ -298,20 +298,20 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - + num_cpu ++; i += width; } - + #else range_m[0] = 0; i = 0; - + while (i < m){ - + if (nthreads - num_cpu > 1) { - + double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; @@ -321,13 +321,13 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG if (width < 16) width = 16; if (width > m - i) width = m - i; - + } else { width = m - i; } - + range_m[num_cpu + 1] = range_m[num_cpu] + width; - + queue[num_cpu].mode = mode; queue[num_cpu].routine = syr_kernel; queue[num_cpu].args = &args; @@ -336,21 +336,21 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - + num_cpu ++; i += width; } - + #endif if (num_cpu) { queue[0].sa = NULL; queue[0].sb = buffer; - + queue[num_cpu - 1].next = NULL; - + exec_blas(num_cpu, queue); } - + return 0; } diff --git a/driver/level2/spr_k.c b/driver/level2/spr_k.c index 996d9257e..84fb4e8fa 100644 --- a/driver/level2/spr_k.c +++ b/driver/level2/spr_k.c @@ -38,7 +38,7 @@ #include "common.h" -int CNAME(BLASLONG m, FLOAT alpha_r, +int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *buffer){ BLASLONG i; diff --git a/driver/level2/spr_thread.c b/driver/level2/spr_thread.c index f88950698..4a194cbd6 100644 --- a/driver/level2/spr_thread.c +++ b/driver/level2/spr_thread.c @@ -96,7 +96,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL #else if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { #ifndef LOWER - AXPYU_K(i + 1, 0, 0, + AXPYU_K(i + 1, 0, 0, alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], x, 1, a, 1, NULL, 0); @@ -112,7 +112,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { #ifndef HEMVREV #ifndef LOWER - AXPYU_K(i + 1, 0, 0, + AXPYU_K(i + 1, 0, 0, alpha_r * x[i * COMPSIZE + 0], - alpha_r * x[i * COMPSIZE + 1], x, 1, a, 1, NULL, 0); #else @@ -122,7 +122,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL #endif #else #ifndef LOWER - AXPYC_K(i + 1, 0, 0, + AXPYC_K(i + 1, 0, 0, alpha_r * x[i * COMPSIZE + 0], alpha_r * x[i * COMPSIZE + 1], x, 1, a, 1, NULL, 0); #else @@ -145,7 +145,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL a += (args -> m - i) * COMPSIZE; #endif } - + return 0; } @@ -172,7 +172,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *bu int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -180,15 +180,15 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *bu int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif #endif args.m = m; - + args.a = (void *)x; args.b = (void *)a; - + args.lda = incx; #if !defined(COMPLEX) || defined(HEMV) || defined(HEMVREV) @@ -199,16 +199,16 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *bu dnum = (double)m * (double)m / (double)nthreads; num_cpu = 0; - + #ifndef LOWER range_m[MAX_CPU_NUMBER] = m; i = 0; - + while (i < m){ - + if (nthreads - num_cpu > 1) { - + double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; @@ -218,13 +218,13 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *bu if (width < 16) width = 16; if (width > m - i) width = m - i; - + } else { width = m - i; } - + range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; - + queue[num_cpu].mode = mode; queue[num_cpu].routine = syr_kernel; queue[num_cpu].args = &args; @@ -233,20 +233,20 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *bu queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - + num_cpu ++; i += width; } - + #else range_m[0] = 0; i = 0; - + while (i < m){ - + if (nthreads - num_cpu > 1) { - + double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; @@ -256,13 +256,13 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *bu if (width < 16) width = 16; if (width > m - i) width = m - i; - + } else { width = m - i; } - + range_m[num_cpu + 1] = range_m[num_cpu] + width; - + queue[num_cpu].mode = mode; queue[num_cpu].routine = syr_kernel; queue[num_cpu].args = &args; @@ -271,21 +271,21 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *bu queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - + num_cpu ++; i += width; } - + #endif if (num_cpu) { queue[0].sa = NULL; queue[0].sb = buffer; - + queue[num_cpu - 1].next = NULL; - + exec_blas(num_cpu, queue); } - + return 0; } diff --git a/driver/level2/symv_thread.c b/driver/level2/symv_thread.c index cf0e2d0c0..95d6c9bb5 100644 --- a/driver/level2/symv_thread.c +++ b/driver/level2/symv_thread.c @@ -78,11 +78,11 @@ static int symv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F #ifndef LOWER - SCAL_K(m_to, 0, 0, ZERO, + SCAL_K(m_to, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif - y, 1, NULL, 0, NULL, 0); + y, 1, NULL, 0, NULL, 0); MYSYMV_U (m_to, m_to - m_from, ONE, #ifdef COMPLEX @@ -92,11 +92,11 @@ static int symv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F #else - SCAL_K(args -> m - m_from, 0, 0, ZERO, + SCAL_K(args -> m - m_from, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif - y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); + y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); MYSYMV_L (args -> m - m_from, m_to - m_from, ONE, #ifdef COMPLEX @@ -132,7 +132,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -140,45 +140,45 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif #endif args.m = m; - + args.a = (void *)a; args.b = (void *)x; args.c = (void *)buffer; - + args.lda = lda; args.ldb = incx; args.ldc = incy; dnum = (double)m * (double)m / (double)nthreads; num_cpu = 0; - + #ifndef LOWER range_m[0] = 0; i = 0; - + while (i < m){ - + if (nthreads - num_cpu > 1) { - + double di = (double)i; width = ((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask; if (width < 4) width = 4; if (width > m - i) width = m - i; - + } else { width = m - i; } - + range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); - + queue[MAX_CPU_NUMBER - num_cpu - 1].mode = mode; queue[MAX_CPU_NUMBER - num_cpu - 1].routine = symv_kernel; queue[MAX_CPU_NUMBER - num_cpu - 1].args = &args; @@ -187,29 +187,29 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i queue[MAX_CPU_NUMBER - num_cpu - 1].sa = NULL; queue[MAX_CPU_NUMBER - num_cpu - 1].sb = NULL; queue[MAX_CPU_NUMBER - num_cpu - 1].next = &queue[MAX_CPU_NUMBER - num_cpu]; - + num_cpu ++; i += width; } - + if (num_cpu) { queue[MAX_CPU_NUMBER - num_cpu].sa = NULL; queue[MAX_CPU_NUMBER - num_cpu].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE; - + queue[MAX_CPU_NUMBER - 1].next = NULL; - + exec_blas(num_cpu, &queue[MAX_CPU_NUMBER - num_cpu]); } - + #else range_m[0] = 0; i = 0; - + while (i < m){ - + if (nthreads - num_cpu > 1) { - + double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; @@ -219,14 +219,14 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i if (width < 4) width = 4; if (width > m - i) width = m - i; - + } else { width = m - i; } - + range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); - + queue[num_cpu].mode = mode; queue[num_cpu].routine = symv_kernel; queue[num_cpu].args = &args; @@ -235,32 +235,32 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - + num_cpu ++; i += width; } - + if (num_cpu) { queue[0].sa = NULL; queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE; - + queue[num_cpu - 1].next = NULL; - + exec_blas(num_cpu, queue); } - + #endif #ifndef LOWER for (i = 0; i < num_cpu - 1; i ++) { - + AXPYU_K(range_m[i + 1], 0, 0, ONE, #ifdef COMPLEX - ZERO, + ZERO, #endif buffer + range_n[i] * COMPSIZE, 1, buffer + range_n[num_cpu - 1] * COMPSIZE, 1, NULL, 0); - } + } AXPYU_K(m, 0, 0, #ifndef COMPLEX @@ -271,12 +271,12 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i buffer + range_n[num_cpu - 1] * COMPSIZE, 1, y, incy, NULL, 0); #else - + for (i = 1; i < num_cpu; i ++) { AXPYU_K(m - range_m[i], 0, 0, ONE, #ifdef COMPLEX - ZERO, + ZERO, #endif buffer + (range_n[i] + range_m[i]) * COMPSIZE, 1, buffer + range_m[i] * COMPSIZE, 1, NULL, 0); } @@ -288,8 +288,8 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i alpha[0], alpha[1], #endif buffer, 1, y, incy, NULL, 0); - + #endif - + return 0; } diff --git a/driver/level2/syr2_k.c b/driver/level2/syr2_k.c index bca8b3bca..5bbd47bdf 100644 --- a/driver/level2/syr2_k.c +++ b/driver/level2/syr2_k.c @@ -40,7 +40,7 @@ #include #include "common.h" -int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT *x, BLASLONG incx, +int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer){ BLASLONG i; diff --git a/driver/level2/syr2_thread.c b/driver/level2/syr2_thread.c index 130a62d3e..4c3294493 100644 --- a/driver/level2/syr2_thread.c +++ b/driver/level2/syr2_thread.c @@ -112,7 +112,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL #else if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { #ifndef LOWER - AXPYU_K(i + 1, 0, 0, + AXPYU_K(i + 1, 0, 0, alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], y, 1, a, 1, NULL, 0); @@ -125,7 +125,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL } if ((y[i * COMPSIZE + 0] != ZERO) || (y[i * COMPSIZE + 1] != ZERO)) { #ifndef LOWER - AXPYU_K(i + 1, 0, 0, + AXPYU_K(i + 1, 0, 0, alpha_r * y[i * COMPSIZE + 0] - alpha_i * y[i * COMPSIZE + 1], alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1], x, 1, a, 1, NULL, 0); @@ -141,7 +141,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { #ifndef HERREV #ifndef LOWER - AXPYU_K(i + 1, 0, 0, + AXPYU_K(i + 1, 0, 0, alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], - alpha_i * x[i * COMPSIZE + 0] - alpha_r * x[i * COMPSIZE + 1], y, 1, a, 1, NULL, 0); @@ -153,7 +153,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL #endif #else #ifndef LOWER - AXPYC_K(i + 1, 0, 0, + AXPYC_K(i + 1, 0, 0, alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], y, 1, a, 1, NULL, 0); @@ -168,7 +168,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL if ((y[i * COMPSIZE + 0] != ZERO) || (y[i * COMPSIZE + 1] != ZERO)) { #ifndef HERREV #ifndef LOWER - AXPYU_K(i + 1, 0, 0, + AXPYU_K(i + 1, 0, 0, alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1], alpha_i * y[i * COMPSIZE + 0] - alpha_r * y[i * COMPSIZE + 1], x, 1, a, 1, NULL, 0); @@ -180,7 +180,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL #endif #else #ifndef LOWER - AXPYC_K(i + 1, 0, 0, + AXPYC_K(i + 1, 0, 0, alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1], - alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1], x, 1, a, 1, NULL, 0); @@ -197,7 +197,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL a += lda * COMPSIZE; } - + return 0; } @@ -224,7 +224,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -232,16 +232,16 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif #endif args.m = m; - + args.a = (void *)x; args.b = (void *)y; args.c = (void *)a; - + args.lda = incx; args.ldb = incy; args.ldc = lda; @@ -253,16 +253,16 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG dnum = (double)m * (double)m / (double)nthreads; num_cpu = 0; - + #ifndef LOWER range_m[MAX_CPU_NUMBER] = m; i = 0; - + while (i < m){ - + if (nthreads - num_cpu > 1) { - + double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; @@ -272,13 +272,13 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG if (width < 16) width = 16; if (width > m - i) width = m - i; - + } else { width = m - i; } - + range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; - + queue[num_cpu].mode = mode; queue[num_cpu].routine = syr_kernel; queue[num_cpu].args = &args; @@ -287,20 +287,20 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - + num_cpu ++; i += width; } - + #else range_m[0] = 0; i = 0; - + while (i < m){ - + if (nthreads - num_cpu > 1) { - + double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; @@ -310,13 +310,13 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG if (width < 16) width = 16; if (width > m - i) width = m - i; - + } else { width = m - i; } - + range_m[num_cpu + 1] = range_m[num_cpu] + width; - + queue[num_cpu].mode = mode; queue[num_cpu].routine = syr_kernel; queue[num_cpu].args = &args; @@ -325,21 +325,21 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - + num_cpu ++; i += width; } - + #endif if (num_cpu) { queue[0].sa = NULL; queue[0].sb = buffer; - + queue[num_cpu - 1].next = NULL; - + exec_blas(num_cpu, queue); } - + return 0; } diff --git a/driver/level2/syr_k.c b/driver/level2/syr_k.c index a0d9a2fa0..4f18cc6d0 100644 --- a/driver/level2/syr_k.c +++ b/driver/level2/syr_k.c @@ -38,7 +38,7 @@ #include "common.h" -int CNAME(BLASLONG m, FLOAT alpha_r, +int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG lda, FLOAT *buffer){ BLASLONG i; diff --git a/driver/level2/syr_thread.c b/driver/level2/syr_thread.c index 250e8c006..0eb54289f 100644 --- a/driver/level2/syr_thread.c +++ b/driver/level2/syr_thread.c @@ -95,7 +95,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL #else if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { #ifndef LOWER - AXPYU_K(i + 1, 0, 0, + AXPYU_K(i + 1, 0, 0, alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], x, 1, a, 1, NULL, 0); @@ -111,7 +111,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { #ifndef HERREV #ifndef LOWER - AXPYU_K(i + 1, 0, 0, + AXPYU_K(i + 1, 0, 0, alpha_r * x[i * COMPSIZE + 0], -alpha_r * x[i * COMPSIZE + 1], x, 1, a, 1, NULL, 0); #else @@ -121,7 +121,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL #endif #else #ifndef LOWER - AXPYC_K(i + 1, 0, 0, + AXPYC_K(i + 1, 0, 0, alpha_r * x[i * COMPSIZE + 0], alpha_r * x[i * COMPSIZE + 1], x, 1, a, 1, NULL, 0); #else @@ -137,7 +137,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL a += lda * COMPSIZE; } - + return 0; } @@ -164,7 +164,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -172,15 +172,15 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif #endif args.m = m; - + args.a = (void *)x; args.b = (void *)a; - + args.lda = incx; args.ldb = lda; #if !defined(COMPLEX) || defined(HER) || defined(HERREV) @@ -191,16 +191,16 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG dnum = (double)m * (double)m / (double)nthreads; num_cpu = 0; - + #ifndef LOWER range_m[MAX_CPU_NUMBER] = m; i = 0; - + while (i < m){ - + if (nthreads - num_cpu > 1) { - + double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; @@ -210,13 +210,13 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG if (width < 16) width = 16; if (width > m - i) width = m - i; - + } else { width = m - i; } - + range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; - + queue[num_cpu].mode = mode; queue[num_cpu].routine = syr_kernel; queue[num_cpu].args = &args; @@ -225,20 +225,20 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - + num_cpu ++; i += width; } - + #else range_m[0] = 0; i = 0; - + while (i < m){ - + if (nthreads - num_cpu > 1) { - + double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; @@ -248,13 +248,13 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG if (width < 16) width = 16; if (width > m - i) width = m - i; - + } else { width = m - i; } - + range_m[num_cpu + 1] = range_m[num_cpu] + width; - + queue[num_cpu].mode = mode; queue[num_cpu].routine = syr_kernel; queue[num_cpu].args = &args; @@ -263,21 +263,21 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - + num_cpu ++; i += width; } - + #endif if (num_cpu) { queue[0].sa = NULL; queue[0].sb = buffer; - + queue[num_cpu - 1].next = NULL; - + exec_blas(num_cpu, queue); } - + return 0; } diff --git a/driver/level2/tbmv_L.c b/driver/level2/tbmv_L.c index 05e7cf869..b41b4141e 100644 --- a/driver/level2/tbmv_L.c +++ b/driver/level2/tbmv_L.c @@ -54,7 +54,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095); COPY_K(n, b, incb, buffer, 1); } - + a += (n - 1) * lda; for (i = n - 1; i >= 0; i--) { @@ -65,7 +65,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc if (length > 0) { AXPYU_K(length, 0, 0, - B[i], + B[i], a + 1, 1, B + i + 1, 1, NULL, 0); } #endif @@ -77,7 +77,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc B[i] *= a[k]; #endif #endif - + #ifdef TRANSA length = i; if (length > k) length = k; @@ -89,7 +89,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc a -= lda; } - + if (incb != 1) { COPY_K(n, buffer, 1, b, incb); } diff --git a/driver/level2/tbmv_U.c b/driver/level2/tbmv_U.c index 49d28dcf5..50c10326b 100644 --- a/driver/level2/tbmv_U.c +++ b/driver/level2/tbmv_U.c @@ -56,14 +56,14 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc } for (i = 0; i < n; i++) { - + #ifndef TRANSA length = i; if (length > k) length = k; if (length > 0) { AXPYU_K(length, 0, 0, - B[i], + B[i], a + k - length, 1, B + i - length, 1, NULL, 0); } #endif @@ -75,7 +75,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc B[i] *= a[0]; #endif #endif - + #ifdef TRANSA length = n - i - 1; if (length > k) length = k; diff --git a/driver/level2/tbmv_thread.c b/driver/level2/tbmv_thread.c index e3d058826..3c1249448 100644 --- a/driver/level2/tbmv_thread.c +++ b/driver/level2/tbmv_thread.c @@ -105,18 +105,18 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F if (incx != 1) { COPY_K(args -> n, x, incx, buffer, 1); - + x = buffer; buffer += ((args -> n * COMPSIZE + 1023) & ~1023); - } + } if (range_n) y += *range_n * COMPSIZE; - SCAL_K(args -> n, 0, 0, ZERO, + SCAL_K(args -> n, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif - y, 1, NULL, 0, NULL, 0); + y, 1, NULL, 0, NULL, 0); for (i = n_from; i < n_to; i++) { @@ -148,7 +148,7 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F #endif } #endif - + #ifndef COMPLEX #ifdef UNIT *(y + i * COMPSIZE) += *(x + i * COMPSIZE); @@ -183,19 +183,19 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F #endif #endif #endif - + #ifdef LOWER if (length > 0) { #ifndef TRANS MYAXPY(length, 0, 0, - *(x + i * COMPSIZE + 0), + *(x + i * COMPSIZE + 0), #ifdef COMPLEX *(x + i * COMPSIZE + 1), #endif a + COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0); #else result = MYDOT(length, a + COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1); - + #ifndef COMPLEX *(y + i * COMPSIZE + 0) += result; #else @@ -205,10 +205,10 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F #endif } #endif - + a += lda * COMPSIZE; } - + return 0; } @@ -236,7 +236,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -244,51 +244,51 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif #endif args.n = n; args.k = k; - + args.a = (void *)a; args.b = (void *)x; args.c = (void *)(buffer); - + args.lda = lda; args.ldb = incx; - + dnum = (double)n * (double)n / (double)nthreads; num_cpu = 0; - + if (n < 2 * k) { #ifndef LOWER - + range_m[MAX_CPU_NUMBER] = n; i = 0; - + while (i < n){ - + if (nthreads - num_cpu > 1) { - + double di = (double)(n - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { width = n - i; } - + if (width < 16) width = 16; if (width > n - i) width = n - i; - + } else { width = n - i; } - + range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); - + queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; queue[num_cpu].args = &args; @@ -297,37 +297,37 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - + num_cpu ++; i += width; } - + #else - + range_m[0] = 0; i = 0; - + while (i < n){ - + if (nthreads - num_cpu > 1) { - + double di = (double)(n - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { width = n - i; } - + if (width < 16) width = 16; if (width > n - i) width = n - i; - + } else { width = n - i; } - + range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); - + queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; queue[num_cpu].args = &args; @@ -336,27 +336,27 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - + num_cpu ++; i += width; } - + #endif } else { - + range_m[0] = 0; i = n; - + while (i > 0){ - + width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); - + if (width < 4) width = 4; if (i < width) width = i; - + range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); - + queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; queue[num_cpu].args = &args; @@ -365,7 +365,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - + num_cpu ++; i -= width; } @@ -376,20 +376,20 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc if (num_cpu) { queue[0].sa = NULL; queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE; - + queue[num_cpu - 1].next = NULL; - + exec_blas(num_cpu, queue); } - + for (i = 1; i < num_cpu; i ++) { AXPYU_K(n, 0, 0, ONE, #ifdef COMPLEX - ZERO, + ZERO, #endif buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0); } - + COPY_K(n, buffer, 1, x, incx); return 0; diff --git a/driver/level2/tbsv_L.c b/driver/level2/tbsv_L.c index e9c9158e4..0d036440d 100644 --- a/driver/level2/tbsv_L.c +++ b/driver/level2/tbsv_L.c @@ -56,7 +56,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc } for (i = 0; i < n; i++) { - + #ifdef TRANSA length = i; if (length > k) length = k; @@ -73,14 +73,14 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc B[i] /= a[0]; #endif #endif - + #ifndef TRANSA length = n - i - 1; if (length > k) length = k; if (length > 0) { AXPYU_K(length, 0, 0, - -B[i], + -B[i], a + 1, 1, B + i + 1, 1, NULL, 0); } #endif diff --git a/driver/level2/tbsv_U.c b/driver/level2/tbsv_U.c index 0b1fca8f0..1dc1a99e7 100644 --- a/driver/level2/tbsv_U.c +++ b/driver/level2/tbsv_U.c @@ -54,7 +54,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095); COPY_K(n, b, incb, buffer, 1); } - + a += (n - 1) * lda; for (i = n - 1; i >= 0; i--) { @@ -75,21 +75,21 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc B[i] /= a[k]; #endif #endif - + #ifndef TRANSA length = i; if (length > k) length = k; if (length > 0) { AXPYU_K(length, 0, 0, - - B[i], + - B[i], a + k - length, 1, B + i - length, 1, NULL, 0); } #endif a -= lda; } - + if (incb != 1) { COPY_K(n, buffer, 1, b, incb); } diff --git a/driver/level2/tpmv_L.c b/driver/level2/tpmv_L.c index c139eb79d..d01478c66 100644 --- a/driver/level2/tpmv_L.c +++ b/driver/level2/tpmv_L.c @@ -51,14 +51,14 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ B = buffer; COPY_K(m, b, incb, buffer, 1); } - + a += (m + 1) * m / 2 - 1; for (i = 0; i < m; i++) { #ifndef TRANSA if (i > 0) AXPYU_K(i, 0, 0, B[m - i - 1], a + 1, 1, B + m - i, 1, NULL, 0); #endif - + #ifndef UNIT B[m - i - 1] *= a[0]; #endif @@ -73,7 +73,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ a -= (m - i); #endif } - + if (incb != 1) { COPY_K(m, buffer, 1, b, incb); } diff --git a/driver/level2/tpmv_U.c b/driver/level2/tpmv_U.c index 6d69df6f0..5d311f8bd 100644 --- a/driver/level2/tpmv_U.c +++ b/driver/level2/tpmv_U.c @@ -53,11 +53,11 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ } for (i = 0; i < m; i++) { - + #ifndef TRANSA if (i > 0) AXPYU_K(i, 0, 0, B[i], a, 1, B, 1, NULL, 0); #endif - + #ifndef UNIT #ifndef TRANSA B[i] *= a[i]; @@ -65,7 +65,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ B[i] *= a[0]; #endif #endif - + #ifdef TRANSA if (i < m - 1) B[i] += DOTU_K(m - i - 1, a + 1, 1, B + i + 1, 1); #endif diff --git a/driver/level2/tpmv_thread.c b/driver/level2/tpmv_thread.c index 64b725f86..3b91cee45 100644 --- a/driver/level2/tpmv_thread.c +++ b/driver/level2/tpmv_thread.c @@ -110,35 +110,35 @@ static int tpmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F #else COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1); #endif - + x = buffer; buffer += ((COMPSIZE * args -> m + 1023) & ~1023); - } + } #ifndef TRANS if (range_n) y += *range_n * COMPSIZE; #ifndef LOWER - SCAL_K(m_to, 0, 0, ZERO, + SCAL_K(m_to, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif - y, 1, NULL, 0, NULL, 0); + y, 1, NULL, 0, NULL, 0); #else - SCAL_K(args -> m - m_from, 0, 0, ZERO, + SCAL_K(args -> m - m_from, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif - y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); + y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); #endif #else - SCAL_K(m_to - m_from, 0, 0, ZERO, + SCAL_K(m_to - m_from, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif - y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); + y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); #endif @@ -154,9 +154,9 @@ static int tpmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F if (i > 0) { #ifndef TRANS MYAXPY(i, 0, 0, - *(x + i * COMPSIZE + 0), + *(x + i * COMPSIZE + 0), #ifdef COMPLEX - *(x + i * COMPSIZE + 1), + *(x + i * COMPSIZE + 1), #endif a, 1, y, 1, NULL, 0); #else @@ -202,7 +202,7 @@ static int tpmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F #ifdef LOWER if (args -> m > i + 1) { #ifndef TRANS - MYAXPY(args -> m - i - 1, 0, 0, + MYAXPY(args -> m - i - 1, 0, 0, *(x + i * COMPSIZE + 0), #ifdef COMPLEX *(x + i * COMPSIZE + 1), @@ -258,7 +258,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -266,31 +266,31 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif #endif args.m = m; - + args.a = (void *)a; args.b = (void *)x; args.c = (void *)(buffer); - + args.ldb = incx; args.ldc = incx; - + dnum = (double)m * (double)m / (double)nthreads; num_cpu = 0; - + #ifndef LOWER range_m[MAX_CPU_NUMBER] = m; i = 0; - + while (i < m){ - + if (nthreads - num_cpu > 1) { - + double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; @@ -300,14 +300,14 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr if (width < 16) width = 16; if (width > m - i) width = m - i; - + } else { width = m - i; } - + range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); - + queue[num_cpu].mode = mode; queue[num_cpu].routine = tpmv_kernel; queue[num_cpu].args = &args; @@ -316,20 +316,20 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - + num_cpu ++; i += width; } - + #else range_m[0] = 0; i = 0; - + while (i < m){ - + if (nthreads - num_cpu > 1) { - + double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; @@ -339,14 +339,14 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr if (width < 16) width = 16; if (width > m - i) width = m - i; - + } else { width = m - i; } - + range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); - + queue[num_cpu].mode = mode; queue[num_cpu].routine = tpmv_kernel; queue[num_cpu].args = &args; @@ -355,46 +355,46 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - + num_cpu ++; i += width; } - + #endif if (num_cpu) { queue[0].sa = NULL; queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE; - + queue[num_cpu - 1].next = NULL; - + exec_blas(num_cpu, queue); } - + #ifndef TRANS for (i = 1; i < num_cpu; i ++) { - + #ifndef LOWER - + AXPYU_K(range_m[MAX_CPU_NUMBER - i], 0, 0, ONE, #ifdef COMPLEX - ZERO, + ZERO, #endif buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0); - + #else - + AXPYU_K(m - range_m[i], 0, 0, ONE, #ifdef COMPLEX - ZERO, + ZERO, #endif buffer + (range_n[i] + range_m[i]) * COMPSIZE, 1, buffer + range_m[i] * COMPSIZE, 1, NULL, 0); - + #endif } #endif - + COPY_K(m, buffer, 1, x, incx); return 0; diff --git a/driver/level2/tpsv_L.c b/driver/level2/tpsv_L.c index 9f76181e1..3fafa9054 100644 --- a/driver/level2/tpsv_L.c +++ b/driver/level2/tpsv_L.c @@ -41,7 +41,7 @@ #include "common.h" int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ - + BLASLONG i; FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; @@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ #ifdef TRANSA if (i > 0) B[i] -= DOTU_K(i, a, 1, B, 1); #endif - + #ifndef UNIT #ifndef TRANSA B[i] /= a[0]; @@ -64,7 +64,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ B[i] /= a[i]; #endif #endif - + #ifndef TRANSA if (i < m - 1) { AXPYU_K(m - i - 1 , 0, 0, - B[i], @@ -78,7 +78,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ a += (i + 1); #endif } - + if (incb != 1) { COPY_K(m, buffer, 1, b, incb); } diff --git a/driver/level2/tpsv_U.c b/driver/level2/tpsv_U.c index 7a0958021..fb5ef02b2 100644 --- a/driver/level2/tpsv_U.c +++ b/driver/level2/tpsv_U.c @@ -51,18 +51,18 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095); COPY_K(m, b, incb, buffer, 1); } - + a += (m + 1) * m / 2 - 1; for (i = 0; i < m; i++) { #ifdef TRANSA if (i > 0) B[m - i - 1] -= DOTU_K(i, a + 1, 1, B + m - i, 1); #endif - + #ifndef UNIT B[m - i - 1] /= a[0]; #endif - + #ifndef TRANSA if (i < m - 1) AXPYU_K(m - i - 1, 0, 0, -B[m - i - 1], a - (m - i - 1), 1, B, 1, NULL, 0); #endif diff --git a/driver/level2/trmv_L.c b/driver/level2/trmv_L.c index e515ba60b..0de48a697 100644 --- a/driver/level2/trmv_L.c +++ b/driver/level2/trmv_L.c @@ -53,14 +53,14 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095); COPY_K(m, b, incb, buffer, 1); } - + for (is = m; is > 0; is -= DTB_ENTRIES){ min_i = MIN(is, DTB_ENTRIES); - + #ifndef TRANSA if (m - is > 0){ - GEMV_N(m - is, min_i, 0, dp1, + GEMV_N(m - is, min_i, 0, dp1, a + is + (is - min_i) * lda, lda, B + is - min_i, 1, B + is, 1, gemvbuffer); @@ -83,10 +83,10 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu if (i < min_i - 1) BB[0] += DOTU_K(min_i - i - 1, AA - (min_i - i - 1), 1, BB - (min_i - i - 1), 1); #endif } - + #ifdef TRANSA if (is - min_i > 0){ - GEMV_T(is - min_i, min_i, 0, dp1, + GEMV_T(is - min_i, min_i, 0, dp1, a + (is - min_i) * lda, lda, B, 1, B + is - min_i, 1, gemvbuffer); diff --git a/driver/level2/trmv_U.c b/driver/level2/trmv_U.c index 3c36f77d9..a0aa7ef0e 100644 --- a/driver/level2/trmv_U.c +++ b/driver/level2/trmv_U.c @@ -55,12 +55,12 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu } for (is = 0; is < m; is += DTB_ENTRIES){ - + min_i = MIN(m - is, DTB_ENTRIES); #ifndef TRANSA if (is > 0){ - GEMV_N(is, min_i, 0, dp1, + GEMV_N(is, min_i, 0, dp1, a + is * lda, lda, B + is, 1, B, 1, gemvbuffer); @@ -70,7 +70,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu for (i = 0; i < min_i; i++) { FLOAT *AA = a + is + (i + is) * lda; FLOAT *BB = B + is; - + #ifndef TRANSA if (i > 0) AXPYU_K(i, 0, 0, BB[i], AA, 1, BB, 1, NULL, 0); #endif @@ -86,7 +86,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu #ifdef TRANSA if (m - is > min_i){ - GEMV_T(m - is - min_i, min_i, 0, dp1, + GEMV_T(m - is - min_i, min_i, 0, dp1, a + is + min_i + is * lda, lda, B + is + min_i, 1, B + is, 1, gemvbuffer); diff --git a/driver/level2/trmv_thread.c b/driver/level2/trmv_thread.c index 4f5b27c69..29e9799f6 100644 --- a/driver/level2/trmv_thread.c +++ b/driver/level2/trmv_thread.c @@ -117,40 +117,40 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F #else COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1); #endif - + x = buffer; buffer += ((COMPSIZE * args -> m + 1023) & ~1023); - } + } #ifndef TRANS if (range_n) y += *range_n * COMPSIZE; #ifndef LOWER - SCAL_K(m_to, 0, 0, ZERO, + SCAL_K(m_to, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif - y, 1, NULL, 0, NULL, 0); + y, 1, NULL, 0, NULL, 0); #else - SCAL_K(args -> m - m_from, 0, 0, ZERO, + SCAL_K(args -> m - m_from, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif - y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); + y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); #endif #else - SCAL_K(m_to - m_from, 0, 0, ZERO, + SCAL_K(m_to - m_from, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif - y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); + y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); #endif for (is = m_from; is < m_to; is += DTB_ENTRIES){ - + min_i = MIN(m_to - is, DTB_ENTRIES); #ifndef LOWER @@ -178,13 +178,13 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F if (i - is > 0) { #ifndef TRANS MYAXPY(i - is, 0, 0, - *(x + i * COMPSIZE + 0), + *(x + i * COMPSIZE + 0), #ifdef COMPLEX - *(x + i * COMPSIZE + 1), + *(x + i * COMPSIZE + 1), #endif a + (is + i * lda) * COMPSIZE, 1, y + is * COMPSIZE, 1, NULL, 0); #else - + result = MYDOT(i - is, a + (is + i * lda) * COMPSIZE, 1, x + is * COMPSIZE, 1); #ifndef COMPLEX @@ -227,7 +227,7 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F #ifdef LOWER if (is + min_i > i + 1) { #ifndef TRANS - MYAXPY(is + min_i - i - 1, 0, 0, + MYAXPY(is + min_i - i - 1, 0, 0, *(x + i * COMPSIZE + 0), #ifdef COMPLEX *(x + i * COMPSIZE + 1), @@ -248,7 +248,7 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F } #endif } - + #ifdef LOWER if (args -> m > is + min_i){ MYGEMV(args -> m - is - min_i, min_i, 0, @@ -259,9 +259,9 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F a + (is + min_i + is * lda) * COMPSIZE, lda, #ifndef TRANS x + is * COMPSIZE, 1, - y + (is + min_i) * COMPSIZE, 1, + y + (is + min_i) * COMPSIZE, 1, #else - x + (is + min_i) * COMPSIZE, 1, + x + (is + min_i) * COMPSIZE, 1, y + is * COMPSIZE, 1, #endif buffer); @@ -296,7 +296,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -304,32 +304,32 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif #endif args.m = m; - + args.a = (void *)a; args.b = (void *)x; args.c = (void *)(buffer); - + args.lda = lda; args.ldb = incx; args.ldc = incx; - + dnum = (double)m * (double)m / (double)nthreads; num_cpu = 0; - + #ifndef LOWER range_m[MAX_CPU_NUMBER] = m; i = 0; - + while (i < m){ - + if (nthreads - num_cpu > 1) { - + double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; @@ -339,14 +339,14 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu if (width < 16) width = 16; if (width > m - i) width = m - i; - + } else { width = m - i; } - + range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); - + queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; queue[num_cpu].args = &args; @@ -355,20 +355,20 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - + num_cpu ++; i += width; } - + #else range_m[0] = 0; i = 0; - + while (i < m){ - + if (nthreads - num_cpu > 1) { - + double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; @@ -378,14 +378,14 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu if (width < 16) width = 16; if (width > m - i) width = m - i; - + } else { width = m - i; } - + range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); - + queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; queue[num_cpu].args = &args; @@ -394,46 +394,46 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - + num_cpu ++; i += width; } - + #endif if (num_cpu) { queue[0].sa = NULL; queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE; - + queue[num_cpu - 1].next = NULL; - + exec_blas(num_cpu, queue); } - + #ifndef TRANS for (i = 1; i < num_cpu; i ++) { - + #ifndef LOWER - + AXPYU_K(range_m[MAX_CPU_NUMBER - i], 0, 0, ONE, #ifdef COMPLEX - ZERO, + ZERO, #endif buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0); - + #else - + AXPYU_K(m - range_m[i], 0, 0, ONE, #ifdef COMPLEX - ZERO, + ZERO, #endif buffer + (range_n[i] + range_m[i]) * COMPSIZE, 1, buffer + range_m[i] * COMPSIZE, 1, NULL, 0); - + #endif } #endif - + COPY_K(m, buffer, 1, x, incx); return 0; diff --git a/driver/level2/trsv_L.c b/driver/level2/trsv_L.c index 44bcfe398..95ec57213 100644 --- a/driver/level2/trsv_L.c +++ b/driver/level2/trsv_L.c @@ -46,7 +46,7 @@ const static FLOAT dm1 = -1.; #define GEMV_UNROLL DTB_ENTRIES int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ - + BLASLONG i, is, min_i; FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; @@ -58,14 +58,14 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buf } for (is = 0; is < m; is += GEMV_UNROLL){ - + min_i = MIN(m - is, GEMV_UNROLL); #ifdef TRANSA if (is > 0){ - GEMV_T(is, min_i, 0, dm1, + GEMV_T(is, min_i, 0, dm1, a + is * lda , lda, - B, 1, + B, 1, B + is, 1, gemvbuffer); } #endif @@ -89,12 +89,12 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buf } #endif } - + #ifndef TRANSA if (m - is > min_i){ - GEMV_N(m - is - min_i, min_i, 0, dm1, + GEMV_N(m - is - min_i, min_i, 0, dm1, a + is + min_i + is * lda, lda, - B + is, 1, + B + is, 1, B + (is + min_i), 1, gemvbuffer); } #endif diff --git a/driver/level2/trsv_U.c b/driver/level2/trsv_U.c index f02512bbb..823ca2e43 100644 --- a/driver/level2/trsv_U.c +++ b/driver/level2/trsv_U.c @@ -53,20 +53,20 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buf gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095); COPY_K(m, b, incb, buffer, 1); } - + for (is = m; is > 0; is -= DTB_ENTRIES){ min_i = MIN(is, DTB_ENTRIES); #ifdef TRANSA if (m - is > 0){ - GEMV_T(m - is, min_i, 0, dm1, + GEMV_T(m - is, min_i, 0, dm1, a + is + (is - min_i) * lda, lda, B + is, 1, B + is - min_i, 1, gemvbuffer); } #endif - + for (i = 0; i < min_i; i++) { FLOAT *AA = a + (is - i - 1) + (is - i - 1) * lda; FLOAT *BB = B + (is - i - 1); @@ -86,13 +86,13 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buf #ifndef TRANSA if (is - min_i > 0){ - GEMV_N(is - min_i, min_i, 0, dm1, + GEMV_N(is - min_i, min_i, 0, dm1, a + (is - min_i) * lda, lda, B + is - min_i, 1, B, 1, gemvbuffer); } #endif - + } if (incb != 1) { diff --git a/driver/level2/zgbmv_k.c b/driver/level2/zgbmv_k.c index 7832a7ea5..68d6045bd 100644 --- a/driver/level2/zgbmv_k.c +++ b/driver/level2/zgbmv_k.c @@ -129,7 +129,7 @@ void CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT alpha_r, FLOA Y[i * 2 + 1] += alpha_i * CREAL(temp) - alpha_r * CIMAG(temp); #endif #endif - + offset_u --; offset_l --; diff --git a/driver/level2/zhbmv_k.c b/driver/level2/zhbmv_k.c index 8771942d0..70e92e050 100644 --- a/driver/level2/zhbmv_k.c +++ b/driver/level2/zhbmv_k.c @@ -81,8 +81,8 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, if (length > 0) { AXPYU_K(length, 0, 0, - alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], - alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], a + offset * COMPSIZE, 1, Y + (i - length) * COMPSIZE, 1, NULL, 0); } @@ -106,8 +106,8 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, if (length > 0) { AXPYU_K(length, 0, 0, - alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], - alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], a + COMPSIZE, 1, Y + (i + 1) * COMPSIZE, 1, NULL, 0); } @@ -131,8 +131,8 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, if (length > 0) { AXPYC_K(length, 0, 0, - alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], - alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], a + offset * COMPSIZE, 1, Y + (i - length) * COMPSIZE, 1, NULL, 0); } @@ -156,8 +156,8 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, if (length > 0) { AXPYC_K(length, 0, 0, - alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], - alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], a + COMPSIZE, 1, Y + (i + 1) * COMPSIZE, 1, NULL, 0); } @@ -176,7 +176,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, #endif #endif - + a += lda * 2; } diff --git a/driver/level2/zher2_k.c b/driver/level2/zher2_k.c index 3e924582f..94a8b7c61 100644 --- a/driver/level2/zher2_k.c +++ b/driver/level2/zher2_k.c @@ -41,7 +41,7 @@ #include "common.h" int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, - FLOAT *x, BLASLONG incx, + FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer){ BLASLONG i; @@ -65,7 +65,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, for (i = 0; i < m; i++){ #ifndef HEMVREV #ifndef LOWER - AXPYU_K(i + 1, 0, 0, + AXPYU_K(i + 1, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], - alpha_i * X[i * 2 + 0] - alpha_r * X[i * 2 + 1], Y, 1, a, 1, NULL, 0); @@ -89,7 +89,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, #endif #else #ifndef LOWER - AXPYC_K(i + 1, 0, 0, + AXPYC_K(i + 1, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], Y, 1, a, 1, NULL, 0); diff --git a/driver/level2/zhpmv_k.c b/driver/level2/zhpmv_k.c index 5f95ce7bd..96bceaaf2 100644 --- a/driver/level2/zhpmv_k.c +++ b/driver/level2/zhpmv_k.c @@ -40,7 +40,7 @@ #include #include "common.h" -int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, +int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){ BLASLONG i; @@ -70,7 +70,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, #ifndef LOWER if (i > 0) { FLOAT _Complex result = DOTC_K(i, a, 1, X, 1); - + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); } @@ -83,18 +83,18 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, if (i > 0) { AXPYU_K(i, 0, 0, - alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], - alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], a, 1, Y, 1, NULL, 0); } a += (i + 1) * 2; - + #else if (m - i > 1) { FLOAT _Complex result = DOTC_K(m - i - 1, a + (i + 1) * 2, 1, X + (i + 1) * 2, 1); - + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); } @@ -107,8 +107,8 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, if (m - i > 1) { AXPYU_K(m - i - 1, 0, 0, - alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], - alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], a + (i + 1) * 2, 1, Y + (i + 1) * 2, 1, NULL, 0); } @@ -119,7 +119,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, #ifndef LOWER if (i > 0) { FLOAT _Complex result = DOTU_K(i, a, 1, X, 1); - + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); } @@ -132,18 +132,18 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, if (i > 0) { AXPYC_K(i, 0, 0, - alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], - alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], a, 1, Y, 1, NULL, 0); } a += (i + 1) * 2; - + #else if (m - i > 1) { FLOAT _Complex result = DOTU_K(m - i - 1, a + (i + 1) * 2, 1, X + (i + 1) * 2, 1); - + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); } @@ -156,8 +156,8 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, if (m - i > 1) { AXPYC_K(m - i - 1, 0, 0, - alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], - alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], a + (i + 1) * 2, 1, Y + (i + 1) * 2, 1, NULL, 0); } @@ -167,7 +167,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, #endif } - + if (incy != 1) { COPY_K(m, Y, 1, y, incy); } diff --git a/driver/level2/zhpr2_k.c b/driver/level2/zhpr2_k.c index f4608ff9d..cb7113f60 100644 --- a/driver/level2/zhpr2_k.c +++ b/driver/level2/zhpr2_k.c @@ -41,7 +41,7 @@ #include "common.h" int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, - FLOAT *x, BLASLONG incx, + FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, FLOAT *buffer){ BLASLONG i; @@ -63,7 +63,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, for (i = 0; i < m; i++){ #ifndef HEMVREV #ifndef LOWER - AXPYU_K(i + 1, 0, 0, + AXPYU_K(i + 1, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], - alpha_i * X[i * 2 + 0] - alpha_r * X[i * 2 + 1], Y, 1, a, 1, NULL, 0); @@ -87,7 +87,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, #endif #else #ifndef LOWER - AXPYC_K(i + 1, 0, 0, + AXPYC_K(i + 1, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], Y, 1, a, 1, NULL, 0); diff --git a/driver/level2/zsbmv_k.c b/driver/level2/zsbmv_k.c index de5dfdde2..30e2f91c3 100644 --- a/driver/level2/zsbmv_k.c +++ b/driver/level2/zsbmv_k.c @@ -78,8 +78,8 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, length = k - offset; AXPYU_K(length + 1, 0, 0, - alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], - alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], a + offset * COMPSIZE, 1, Y + (i - length) * COMPSIZE, 1, NULL, 0); if (length > 0) { @@ -95,18 +95,18 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, if (n - i - 1 < k) length = n - i - 1; AXPYU_K(length + 1, 0, 0, - alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], - alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], a, 1, Y + i * COMPSIZE, 1, NULL, 0); if (length > 0) { FLOAT _Complex result = DOTU_K(length, a + COMPSIZE, 1, X + (i + 1) * COMPSIZE, 1); - + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); } #endif - + a += lda * 2; } diff --git a/driver/level2/zspmv_k.c b/driver/level2/zspmv_k.c index c93b1e17e..76657eab9 100644 --- a/driver/level2/zspmv_k.c +++ b/driver/level2/zspmv_k.c @@ -69,29 +69,29 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, if (i > 0) { result = DOTU_K(i, a, 1, X, 1); - + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); } AXPYU_K(i + 1, 0, 0, - alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], - alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], a, 1, Y, 1, NULL, 0); a += (i + 1) * 2; - + #else result = DOTU_K(m - i, a + i * 2, 1, X + i * 2, 1); - + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); - + if (m - i > 1) AXPYU_K(m - i - 1, 0, 0, - alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], - alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], a + (i + 1) * 2, 1, Y + (i + 1) * 2, 1, NULL, 0); a += (m - i - 1) * 2; diff --git a/driver/level2/zspr2_k.c b/driver/level2/zspr2_k.c index 48c81a366..e41a8de3c 100644 --- a/driver/level2/zspr2_k.c +++ b/driver/level2/zspr2_k.c @@ -40,7 +40,7 @@ #include #include "common.h" -int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG incx, +int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, FLOAT *buffer){ BLASLONG i; @@ -61,7 +61,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG incx, for (i = 0; i < m; i++){ #ifndef LOWER - AXPYU_K(i + 1, 0, 0, + AXPYU_K(i + 1, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], Y, 1, a, 1, NULL, 0); diff --git a/driver/level2/zspr_k.c b/driver/level2/zspr_k.c index a187bdbfa..d888a81ee 100644 --- a/driver/level2/zspr_k.c +++ b/driver/level2/zspr_k.c @@ -54,7 +54,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, for (i = 0; i < m; i++){ #ifndef LOWER if ((X[i * 2 + 0] != ZERO) && (X[i * 2 + 1] != ZERO)) { - AXPYU_K(i + 1, 0, 0, + AXPYU_K(i + 1, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], X, 1, a, 1, NULL, 0); diff --git a/driver/level2/zsyr2_k.c b/driver/level2/zsyr2_k.c index f7bbbb2f2..03daf923d 100644 --- a/driver/level2/zsyr2_k.c +++ b/driver/level2/zsyr2_k.c @@ -40,7 +40,7 @@ #include #include "common.h" -int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG incx, +int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer){ BLASLONG i; @@ -63,7 +63,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG incx, for (i = 0; i < m; i++){ #ifndef LOWER - AXPYU_K(i + 1, 0, 0, + AXPYU_K(i + 1, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], Y, 1, a, 1, NULL, 0); diff --git a/driver/level2/zsyr_k.c b/driver/level2/zsyr_k.c index 9d800d37d..57d1769c3 100644 --- a/driver/level2/zsyr_k.c +++ b/driver/level2/zsyr_k.c @@ -55,7 +55,7 @@ int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, for (i = 0; i < m; i++){ #ifndef LOWER if ((X[i * 2 + 0] != ZERO) || (X[i * 2 + 1] != ZERO)) { - AXPYU_K(i + 1, 0, 0, + AXPYU_K(i + 1, 0, 0, alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], X, 1, a, 1, NULL, 0); diff --git a/driver/level2/ztbmv_L.c b/driver/level2/ztbmv_L.c index 9b604c04f..74ff0bce1 100644 --- a/driver/level2/ztbmv_L.c +++ b/driver/level2/ztbmv_L.c @@ -60,7 +60,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE+ 4095) & ~4095); COPY_K(n, b, incb, buffer, 1); } - + a += (n - 1) * lda * COMPSIZE; for (i = n - 1; i >= 0; i--) { @@ -102,7 +102,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc B[i * 2 + 1] = atemp1 * btemp2 - atemp2 * btemp1; #endif #endif - + #if (TRANSA == 2) || (TRANSA == 4) length = i; if (length > k) length = k; @@ -121,7 +121,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc a -= lda * COMPSIZE; } - + if (incb != 1) { COPY_K(n, buffer, 1, b, incb); } diff --git a/driver/level2/ztbmv_U.c b/driver/level2/ztbmv_U.c index 4e86f4fb1..933275de3 100644 --- a/driver/level2/ztbmv_U.c +++ b/driver/level2/ztbmv_U.c @@ -62,7 +62,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc } for (i = 0; i < n; i++) { - + #if (TRANSA == 1) || (TRANSA == 3) length = i; if (length > k) length = k; diff --git a/driver/level2/ztbsv_L.c b/driver/level2/ztbsv_L.c index f32ddff24..0726bbd16 100644 --- a/driver/level2/ztbsv_L.c +++ b/driver/level2/ztbsv_L.c @@ -62,7 +62,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc } for (i = 0; i < n; i++) { - + #if (TRANSA == 2) || (TRANSA == 4) length = i; if (length > k) length = k; @@ -87,11 +87,11 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc ar = a[k * 2 + 0]; ai = a[k * 2 + 1]; #endif - + if (fabs(ar) >= fabs(ai)){ ratio = ai / ar; den = 1./(ar * ( 1 + ratio * ratio)); - + ar = den; #if TRANSA < 3 ai = -ratio * den; @@ -108,10 +108,10 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc ai = den; #endif } - + br = B[i * 2 + 0]; bi = B[i * 2 + 1]; - + B[i * 2 + 0] = ar*br - ai*bi; B[i * 2 + 1] = ar*bi + ai*br; #endif diff --git a/driver/level2/ztbsv_U.c b/driver/level2/ztbsv_U.c index 252f3bace..d022650bc 100644 --- a/driver/level2/ztbsv_U.c +++ b/driver/level2/ztbsv_U.c @@ -60,7 +60,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE+ 4095) & ~4095); COPY_K(n, b, incb, buffer, 1); } - + a += (n - 1) * lda * COMPSIZE; for (i = n - 1; i >= 0; i--) { @@ -89,11 +89,11 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc ar = a[0]; ai = a[1]; #endif - + if (fabs(ar) >= fabs(ai)){ ratio = ai / ar; den = 1./(ar * ( 1 + ratio * ratio)); - + ar = den; #if TRANSA < 3 ai = -ratio * den; @@ -110,10 +110,10 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc ai = den; #endif } - + br = B[i * 2 + 0]; bi = B[i * 2 + 1]; - + B[i * 2 + 0] = ar*br - ai*bi; B[i * 2 + 1] = ar*bi + ai*br; #endif @@ -138,7 +138,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc a -= lda * COMPSIZE; } - + if (incb != 1) { COPY_K(n, buffer, 1, b, incb); } diff --git a/driver/level2/ztpmv_L.c b/driver/level2/ztpmv_L.c index 62b9dc6ce..12c254c12 100644 --- a/driver/level2/ztpmv_L.c +++ b/driver/level2/ztpmv_L.c @@ -61,14 +61,14 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ a += (m + 1) * m - 2; for (i = 0; i < m; i++) { - + #if (TRANSA == 1) || (TRANSA == 3) #if TRANSA == 1 - if (i > 0) AXPYU_K (i, 0, 0, + if (i > 0) AXPYU_K (i, 0, 0, B[(m - i - 1) * 2 + 0], B[(m - i - 1) * 2 + 1], a + 2, 1, B + (m - i) * 2, 1, NULL, 0); #else - if (i > 0) AXPYC_K(i, 0, 0, + if (i > 0) AXPYC_K(i, 0, 0, B[(m - i - 1) * 2 + 0], B[(m - i - 1) * 2 + 1], a + 2, 1, B + (m - i) * 2, 1, NULL, 0); #endif @@ -110,7 +110,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ #endif } - + if (incb != 1) { COPY_K(m, buffer, 1, b, incb); diff --git a/driver/level2/ztpmv_U.c b/driver/level2/ztpmv_U.c index 2ff3bfb56..59708b8b8 100644 --- a/driver/level2/ztpmv_U.c +++ b/driver/level2/ztpmv_U.c @@ -41,7 +41,7 @@ #include "common.h" int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ - + BLASLONG i; #if (TRANSA == 2) || (TRANSA == 4) FLOAT _Complex temp; @@ -114,7 +114,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ a += (m - i) * 2; #endif } - + if (incb != 1) { COPY_K(m, buffer, 1, b, incb); } diff --git a/driver/level2/ztpsv_L.c b/driver/level2/ztpsv_L.c index e9317fbdd..3b8e562ce 100644 --- a/driver/level2/ztpsv_L.c +++ b/driver/level2/ztpsv_L.c @@ -43,7 +43,7 @@ const static FLOAT dm1 = -1.; int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ - + BLASLONG i; #if (TRANSA == 2) || (TRANSA == 4) FLOAT _Complex result; @@ -61,7 +61,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ } for (i = 0; i < m; i++) { - + #if (TRANSA == 2) || (TRANSA == 4) if (i > 0) { #if TRANSA == 2 @@ -69,7 +69,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ #else result = DOTC_K(i, a, 1, B, 1); #endif - + B[i * COMPSIZE + 0] -= CREAL(result); B[i * COMPSIZE + 1] -= CIMAG(result); } @@ -83,11 +83,11 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ ar = a[i * COMPSIZE + 0]; ai = a[i * COMPSIZE + 1]; #endif - + if (fabs(ar) >= fabs(ai)){ ratio = ai / ar; den = 1./(ar * ( 1 + ratio * ratio)); - + ar = den; #if TRANSA < 3 ai = -ratio * den; @@ -107,7 +107,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ br = B[i * COMPSIZE + 0]; bi = B[i * COMPSIZE + 1]; - + B[i * COMPSIZE + 0] = ar*br - ai*bi; B[i * COMPSIZE + 1] = ar*bi + ai*br; #endif diff --git a/driver/level2/ztpsv_U.c b/driver/level2/ztpsv_U.c index 54903dc30..601ac2f9d 100644 --- a/driver/level2/ztpsv_U.c +++ b/driver/level2/ztpsv_U.c @@ -61,7 +61,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ a += (m + 1) * m - 2; for (i = 0; i < m; i++) { - + #if (TRANSA == 2) || (TRANSA == 4) if (i > 0) { #if TRANSA == 2 @@ -69,20 +69,20 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ #else result = DOTC_K(i, a + 2, 1, B + (m - i) * 2, 1); #endif - + B[(m - i - 1) * 2 + 0] -= CREAL(result); B[(m - i - 1) * 2 + 1] -= CIMAG(result); } #endif - + #ifndef UNIT ar = a[0]; ai = a[1]; - + if (fabs(ar) >= fabs(ai)){ ratio = ai / ar; den = 1./(ar * ( 1 + ratio * ratio)); - + ar = den; #if (TRANSA == 1) || (TRANSA == 2) ai = -ratio * den; @@ -99,10 +99,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ ai = den; #endif } - + br = B[(m - i - 1) * 2 + 0]; bi = B[(m - i - 1) * 2 + 1]; - + B[(m - i - 1) * 2 + 0] = ar*br - ai*bi; B[(m - i - 1) * 2 + 1] = ar*bi + ai*br; #endif diff --git a/driver/level2/ztrmv_L.c b/driver/level2/ztrmv_L.c index 3688f588e..63522cf81 100644 --- a/driver/level2/ztrmv_L.c +++ b/driver/level2/ztrmv_L.c @@ -122,7 +122,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu #endif } - + #if (TRANSA == 2) || (TRANSA == 4) if (is - min_i > 0){ #if TRANSA == 2 diff --git a/driver/level2/ztrmv_U.c b/driver/level2/ztrmv_U.c index a9fb6d1d0..8a4494fd7 100644 --- a/driver/level2/ztrmv_U.c +++ b/driver/level2/ztrmv_U.c @@ -43,7 +43,7 @@ static FLOAT dp1 = 1.; int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *buffer){ - + BLASLONG i, is, min_i; #if (TRANSA == 2) || (TRANSA == 4) FLOAT _Complex temp; @@ -61,7 +61,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu } for (is =0; is < m; is += DTB_ENTRIES){ - + min_i = MIN(m - is, DTB_ENTRIES); #if (TRANSA) == 1 || (TRANSA == 3) @@ -128,7 +128,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu #endif } - + #if (TRANSA) == 2 || (TRANSA == 4) if (m - is > min_i){ #if TRANSA == 2 diff --git a/driver/level2/ztrsv_L.c b/driver/level2/ztrsv_L.c index f825c61f5..90f1c2c7d 100644 --- a/driver/level2/ztrsv_L.c +++ b/driver/level2/ztrsv_L.c @@ -43,7 +43,7 @@ const static FLOAT dm1 = -1.; int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ - + BLASLONG i, is, min_i; #if (TRANSA == 2) || (TRANSA == 4) FLOAT _Complex result; @@ -100,11 +100,11 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buf #ifndef UNIT ar = AA[i * COMPSIZE + 0]; ai = AA[i * COMPSIZE + 1]; - + if (fabs(ar) >= fabs(ai)){ ratio = ai / ar; den = 1./(ar * ( 1 + ratio * ratio)); - + ar = den; #if TRANSA < 3 ai = -ratio * den; @@ -124,7 +124,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buf br = BB[i * COMPSIZE + 0]; bi = BB[i * COMPSIZE + 1]; - + BB[i * COMPSIZE + 0] = ar*br - ai*bi; BB[i * COMPSIZE + 1] = ar*bi + ai*br; #endif diff --git a/driver/level2/ztrsv_U.c b/driver/level2/ztrsv_U.c index 3b750a29f..bec8114f3 100644 --- a/driver/level2/ztrsv_U.c +++ b/driver/level2/ztrsv_U.c @@ -100,11 +100,11 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buf #ifndef UNIT ar = AA[0]; ai = AA[1]; - + if (fabs(ar) >= fabs(ai)){ ratio = ai / ar; den = 1./(ar * ( 1 + ratio * ratio)); - + ar = den; #if TRANSA < 3 ai = -ratio * den; @@ -124,7 +124,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buf br = BB[0]; bi = BB[1]; - + BB[0] = ar*br - ai*bi; BB[1] = ar*bi + ai*br; #endif diff --git a/driver/level3/Makefile b/driver/level3/Makefile index 7d7d72339..4c004ee80 100644 --- a/driver/level3/Makefile +++ b/driver/level3/Makefile @@ -1094,7 +1094,7 @@ ssymm_LU.$(SUFFIX) : symm_k.c level3.c ../../param.h ssymm_LL.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + ssymm_RU.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -1106,7 +1106,7 @@ dsymm_LU.$(SUFFIX) : symm_k.c level3.c ../../param.h dsymm_LL.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + dsymm_RU.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -1118,7 +1118,7 @@ qsymm_LU.$(SUFFIX) : symm_k.c level3.c ../../param.h qsymm_LL.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + qsymm_RU.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -1130,7 +1130,7 @@ csymm_LU.$(SUFFIX) : symm_k.c level3.c ../../param.h csymm_LL.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + csymm_RU.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -1142,7 +1142,7 @@ zsymm_LU.$(SUFFIX) : symm_k.c level3.c ../../param.h zsymm_LL.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + zsymm_RU.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -1154,7 +1154,7 @@ xsymm_LU.$(SUFFIX) : symm_k.c level3.c ../../param.h xsymm_LL.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + xsymm_RU.$(SUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -1166,7 +1166,7 @@ ssymm_thread_LU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h ssymm_thread_LL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + ssymm_thread_RU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -1178,7 +1178,7 @@ dsymm_thread_LU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h dsymm_thread_LL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + dsymm_thread_RU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -1190,7 +1190,7 @@ qsymm_thread_LU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h qsymm_thread_LL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + qsymm_thread_RU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -1202,7 +1202,7 @@ csymm_thread_LU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h csymm_thread_LL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + csymm_thread_RU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -1214,7 +1214,7 @@ zsymm_thread_LU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h zsymm_thread_LL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + zsymm_thread_RU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -1226,7 +1226,7 @@ xsymm_thread_LU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h xsymm_thread_LL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + xsymm_thread_RU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -1529,7 +1529,7 @@ chemm_LU.$(SUFFIX) : zhemm_k.c ../../param.h chemm_LL.$(SUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + chemm_RU.$(SUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) @@ -1541,7 +1541,7 @@ zhemm_LU.$(SUFFIX) : zhemm_k.c ../../param.h zhemm_LL.$(SUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + zhemm_RU.$(SUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) @@ -1553,7 +1553,7 @@ xhemm_LU.$(SUFFIX) : zhemm_k.c ../../param.h xhemm_LL.$(SUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + xhemm_RU.$(SUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) @@ -1565,7 +1565,7 @@ chemm_thread_LU.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h chemm_thread_LL.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + chemm_thread_RU.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) @@ -1577,7 +1577,7 @@ zhemm_thread_LU.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h zhemm_thread_LL.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + zhemm_thread_RU.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) @@ -1589,7 +1589,7 @@ xhemm_thread_LU.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h xhemm_thread_LL.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + xhemm_thread_RU.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) @@ -1776,76 +1776,76 @@ xher2k_kernel_LN.$(SUFFIX) : zher2k_kernel.c xher2k_kernel_LC.$(SUFFIX) : zher2k_kernel.c $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) -cgemm3m_nn.$(SUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_nn.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $(@F) -cgemm3m_nt.$(SUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_nt.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $(@F) -cgemm3m_nr.$(SUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_nr.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $(@F) -cgemm3m_nc.$(SUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_nc.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $(@F) -cgemm3m_tn.$(SUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_tn.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $(@F) -cgemm3m_tt.$(SUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_tt.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $(@F) -cgemm3m_tr.$(SUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_tr.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $(@F) -cgemm3m_tc.$(SUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_tc.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $(@F) -cgemm3m_rn.$(SUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_rn.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $(@F) -cgemm3m_rt.$(SUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_rt.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $(@F) -cgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) -cgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) -cgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) -cgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) -cgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) -cgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) -zgemm3m_nn.$(SUFFIX) : gemm3m.c gemm3m_level3.c +zgemm3m_nn.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $(@F) -zgemm3m_nt.$(SUFFIX) : gemm3m.c gemm3m_level3.c +zgemm3m_nt.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $(@F) -zgemm3m_nr.$(SUFFIX) : gemm3m.c gemm3m_level3.c +zgemm3m_nr.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $(@F) -zgemm3m_nc.$(SUFFIX) : gemm3m.c gemm3m_level3.c +zgemm3m_nc.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $(@F) -zgemm3m_tn.$(SUFFIX) : gemm3m.c gemm3m_level3.c +zgemm3m_tn.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $(@F) -zgemm3m_tt.$(SUFFIX) : gemm3m.c gemm3m_level3.c +zgemm3m_tt.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $(@F) -zgemm3m_tr.$(SUFFIX) : gemm3m.c gemm3m_level3.c +zgemm3m_tr.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $(@F) -zgemm3m_tc.$(SUFFIX) : gemm3m.c gemm3m_level3.c +zgemm3m_tc.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $(@F) zgemm3m_rn.$(SUFFIX) : gemm3m.c gemm3m_level3.c @@ -2078,7 +2078,7 @@ csymm3m_LU.$(SUFFIX) : symm3m_k.c ../../param.h csymm3m_LL.$(SUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + csymm3m_RU.$(SUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -2090,7 +2090,7 @@ zsymm3m_LU.$(SUFFIX) : symm3m_k.c ../../param.h zsymm3m_LL.$(SUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + zsymm3m_RU.$(SUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -2102,7 +2102,7 @@ xsymm3m_LU.$(SUFFIX) : symm3m_k.c ../../param.h xsymm3m_LL.$(SUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + xsymm3m_RU.$(SUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -2114,7 +2114,7 @@ csymm3m_thread_LU.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h csymm3m_thread_LL.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + csymm3m_thread_RU.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -2126,7 +2126,7 @@ zsymm3m_thread_LU.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h zsymm3m_thread_LL.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + zsymm3m_thread_RU.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -2138,7 +2138,7 @@ xsymm3m_thread_LU.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h xsymm3m_thread_LL.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + xsymm3m_thread_RU.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -2150,7 +2150,7 @@ chemm3m_LU.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h chemm3m_LL.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + chemm3m_RU.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -2162,7 +2162,7 @@ zhemm3m_LU.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h zhemm3m_LL.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + zhemm3m_RU.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -2174,7 +2174,7 @@ xhemm3m_LU.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h xhemm3m_LL.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + xhemm3m_RU.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -2186,7 +2186,7 @@ chemm3m_thread_LU.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h chemm3m_thread_LL.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + chemm3m_thread_RU.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -2198,7 +2198,7 @@ zhemm3m_thread_LU.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h zhemm3m_thread_LL.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + zhemm3m_thread_RU.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -2210,7 +2210,7 @@ xhemm3m_thread_LU.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h xhemm3m_thread_LL.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + xhemm3m_thread_RU.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -3463,7 +3463,7 @@ ssymm_LU.$(PSUFFIX) : symm_k.c level3.c ../../param.h ssymm_LL.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + ssymm_RU.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -3475,7 +3475,7 @@ dsymm_LU.$(PSUFFIX) : symm_k.c level3.c ../../param.h dsymm_LL.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + dsymm_RU.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -3487,7 +3487,7 @@ qsymm_LU.$(PSUFFIX) : symm_k.c level3.c ../../param.h qsymm_LL.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + qsymm_RU.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -3499,7 +3499,7 @@ csymm_LU.$(PSUFFIX) : symm_k.c level3.c ../../param.h csymm_LL.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + csymm_RU.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -3511,7 +3511,7 @@ zsymm_LU.$(PSUFFIX) : symm_k.c level3.c ../../param.h zsymm_LL.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + zsymm_RU.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -3523,7 +3523,7 @@ xsymm_LU.$(PSUFFIX) : symm_k.c level3.c ../../param.h xsymm_LL.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + xsymm_RU.$(PSUFFIX) : symm_k.c level3.c ../../param.h $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -3535,7 +3535,7 @@ ssymm_thread_LU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h ssymm_thread_LL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + ssymm_thread_RU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -3547,7 +3547,7 @@ dsymm_thread_LU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h dsymm_thread_LL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + dsymm_thread_RU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -3559,7 +3559,7 @@ qsymm_thread_LU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h qsymm_thread_LL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + qsymm_thread_RU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -3571,7 +3571,7 @@ csymm_thread_LU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h csymm_thread_LL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + csymm_thread_RU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -3583,7 +3583,7 @@ zsymm_thread_LU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h zsymm_thread_LL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + zsymm_thread_RU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -3595,7 +3595,7 @@ xsymm_thread_LU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h xsymm_thread_LL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + xsymm_thread_RU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -3898,7 +3898,7 @@ chemm_LU.$(PSUFFIX) : zhemm_k.c ../../param.h chemm_LL.$(PSUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + chemm_RU.$(PSUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) @@ -3910,7 +3910,7 @@ zhemm_LU.$(PSUFFIX) : zhemm_k.c ../../param.h zhemm_LL.$(PSUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + zhemm_RU.$(PSUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) @@ -3922,7 +3922,7 @@ xhemm_LU.$(PSUFFIX) : zhemm_k.c ../../param.h xhemm_LL.$(PSUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + xhemm_RU.$(PSUFFIX) : zhemm_k.c ../../param.h $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) @@ -3934,7 +3934,7 @@ chemm_thread_LU.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h chemm_thread_LL.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + chemm_thread_RU.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) @@ -3946,7 +3946,7 @@ zhemm_thread_LU.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h zhemm_thread_LL.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + zhemm_thread_RU.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) @@ -3958,7 +3958,7 @@ xhemm_thread_LU.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h xhemm_thread_LL.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + xhemm_thread_RU.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) @@ -4145,76 +4145,76 @@ xher2k_kernel_LN.$(PSUFFIX) : zher2k_kernel.c xher2k_kernel_LC.$(PSUFFIX) : zher2k_kernel.c $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) -cgemm3m_nn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_nn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $(@F) -cgemm3m_nt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_nt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $(@F) -cgemm3m_nr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_nr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $(@F) -cgemm3m_nc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_nc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $(@F) -cgemm3m_tn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_tn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $(@F) -cgemm3m_tt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_tt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $(@F) -cgemm3m_tr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_tr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $(@F) -cgemm3m_tc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_tc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $(@F) -cgemm3m_rn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_rn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $(@F) -cgemm3m_rt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_rt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $(@F) -cgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) -cgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) -cgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) -cgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) -cgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) -cgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c +cgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) -zgemm3m_nn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c +zgemm3m_nn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $(@F) -zgemm3m_nt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c +zgemm3m_nt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $(@F) -zgemm3m_nr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c +zgemm3m_nr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $(@F) -zgemm3m_nc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c +zgemm3m_nc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $(@F) -zgemm3m_tn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c +zgemm3m_tn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $(@F) -zgemm3m_tt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c +zgemm3m_tt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $(@F) -zgemm3m_tr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c +zgemm3m_tr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $(@F) -zgemm3m_tc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c +zgemm3m_tc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $(@F) zgemm3m_rn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c @@ -4447,7 +4447,7 @@ csymm3m_LU.$(PSUFFIX) : symm3m_k.c ../../param.h csymm3m_LL.$(PSUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + csymm3m_RU.$(PSUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -4459,7 +4459,7 @@ zsymm3m_LU.$(PSUFFIX) : symm3m_k.c ../../param.h zsymm3m_LL.$(PSUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + zsymm3m_RU.$(PSUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -4471,7 +4471,7 @@ xsymm3m_LU.$(PSUFFIX) : symm3m_k.c ../../param.h xsymm3m_LL.$(PSUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + xsymm3m_RU.$(PSUFFIX) : symm3m_k.c ../../param.h $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -4483,7 +4483,7 @@ csymm3m_thread_LU.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h csymm3m_thread_LL.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + csymm3m_thread_RU.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -4495,7 +4495,7 @@ zsymm3m_thread_LU.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h zsymm3m_thread_LL.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + zsymm3m_thread_RU.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -4507,7 +4507,7 @@ xsymm3m_thread_LU.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h xsymm3m_thread_LL.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + xsymm3m_thread_RU.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -4519,7 +4519,7 @@ chemm3m_LU.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h chemm3m_LL.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + chemm3m_RU.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -4531,7 +4531,7 @@ zhemm3m_LU.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h zhemm3m_LL.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + zhemm3m_RU.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -4543,7 +4543,7 @@ xhemm3m_LU.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h xhemm3m_LL.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + xhemm3m_RU.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -4555,7 +4555,7 @@ chemm3m_thread_LU.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h chemm3m_thread_LL.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + chemm3m_thread_RU.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -4567,7 +4567,7 @@ zhemm3m_thread_LU.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h zhemm3m_thread_LL.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + zhemm3m_thread_RU.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) @@ -4579,7 +4579,7 @@ xhemm3m_thread_LU.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h xhemm3m_thread_LL.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) - + xhemm3m_thread_RU.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) diff --git a/driver/level3/gemm3m_level3.c b/driver/level3/gemm3m_level3.c index df4d723ab..064968298 100644 --- a/driver/level3/gemm3m_level3.c +++ b/driver/level3/gemm3m_level3.c @@ -306,10 +306,10 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(js = n_from; js < n_to; js += GEMM3M_R){ min_j = n_to - js; if (min_j > GEMM3M_R) min_j = GEMM3M_R; - + for(ls = 0; ls < k; ls += min_l){ min_l = k - ls; - + if (min_l >= GEMM3M_Q * 2) { min_l = GEMM3M_Q; } else { @@ -320,7 +320,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #endif } } - + min_i = m_to - m_from; if (min_i >= GEMM3M_P * 2) { min_i = GEMM3M_P; @@ -331,53 +331,53 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } START_RPCC(); - + ICOPYB_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); - + STOP_RPCC(innercost); - + for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; - + START_RPCC(); - + #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || defined(RN) || defined(RT) || defined(CN) || defined(CT) OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js)); #else OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js)); #endif - + STOP_RPCC(outercost); - + START_RPCC(); - + KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA5, ALPHA6, sa, sb + min_l * (jjs - js), c, ldc, m_from, jjs); - + STOP_RPCC(kernelcost); - - } - + + } + for(is = m_from + min_i; is < m_to; is += min_i){ min_i = m_to - is; if (min_i >= GEMM3M_P * 2) { min_i = GEMM3M_P; - } else + } else if (min_i > GEMM3M_P) { min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); } - + START_RPCC(); - + ICOPYB_OPERATION(min_l, min_i, a, lda, ls, is, sa); - + STOP_RPCC(innercost); - + START_RPCC(); - + KERNEL_OPERATION(min_i, min_j, min_l, ALPHA5, ALPHA6, sa, sb, c, ldc, is, js); - + STOP_RPCC(kernelcost); } @@ -389,19 +389,19 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); } } - + START_RPCC(); - + ICOPYR_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); - + STOP_RPCC(innercost); - + for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; - + START_RPCC(); - + #if defined(NN) || defined(NT) || defined(TN) || defined(TT) OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js)); #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) @@ -413,37 +413,37 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #endif STOP_RPCC(outercost); - + START_RPCC(); - + KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA11, ALPHA12, sa, sb + min_l * (jjs - js), c, ldc, m_from, jjs); - + STOP_RPCC(kernelcost); - - } - + + } + for(is = m_from + min_i; is < m_to; is += min_i){ min_i = m_to - is; if (min_i >= GEMM3M_P * 2) { min_i = GEMM3M_P; - } else + } else if (min_i > GEMM3M_P) { min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); } - + START_RPCC(); - + ICOPYR_OPERATION(min_l, min_i, a, lda, ls, is, sa); - + STOP_RPCC(innercost); - + START_RPCC(); - + KERNEL_OPERATION(min_i, min_j, min_l, ALPHA11, ALPHA12, sa, sb, c, ldc, is, js); - + STOP_RPCC(kernelcost); - + } min_i = m_to - m_from; @@ -454,20 +454,20 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); } } - + START_RPCC(); - + ICOPYI_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); - + STOP_RPCC(innercost); - + for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; - + START_RPCC(); - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js)); #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js)); @@ -478,42 +478,42 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #endif STOP_RPCC(outercost); - + START_RPCC(); - + KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA17, ALPHA18, sa, sb + min_l * (jjs - js), c, ldc, m_from, jjs); - + STOP_RPCC(kernelcost); - - } - + + } + for(is = m_from + min_i; is < m_to; is += min_i){ min_i = m_to - is; if (min_i >= GEMM3M_P * 2) { min_i = GEMM3M_P; - } else + } else if (min_i > GEMM3M_P) { min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); } - + START_RPCC(); - + ICOPYI_OPERATION(min_l, min_i, a, lda, ls, is, sa); - + STOP_RPCC(innercost); - + START_RPCC(); - + KERNEL_OPERATION(min_i, min_j, min_l, ALPHA17, ALPHA18, sa, sb, c, ldc, is, js); - + STOP_RPCC(kernelcost); - + } } /* end of js */ } /* end of ls */ - + #ifdef TIMING total = (double)outercost + (double)innercost + (double)kernelcost; @@ -526,6 +526,6 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, ((double)(m_to - m_from) * (double)(n_to - n_from) * (double)k) / (double)kernelcost / 2 * 100, 2400. * (2. * (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k) / (double)kernelcost); #endif - + return 0; } diff --git a/driver/level3/gemm_thread_m.c b/driver/level3/gemm_thread_m.c index 52c9b2d3e..8813e5529 100644 --- a/driver/level3/gemm_thread_m.c +++ b/driver/level3/gemm_thread_m.c @@ -58,7 +58,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( num_cpu = 0; while (i > 0){ - + width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); i -= width; @@ -76,15 +76,15 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; } - + if (num_cpu) { queue[0].sa = sa; queue[0].sb = sb; queue[num_cpu - 1].next = NULL; - + exec_blas(num_cpu, queue); } - + return 0; } diff --git a/driver/level3/gemm_thread_mn.c b/driver/level3/gemm_thread_mn.c index b81c6fa40..2966eac82 100644 --- a/driver/level3/gemm_thread_mn.c +++ b/driver/level3/gemm_thread_mn.c @@ -40,7 +40,7 @@ #include #include "common.h" -static const int divide_rule[][2] = +static const int divide_rule[][2] = {{ 0, 0}, { 1, 1}, { 1, 2}, { 1, 3}, { 2, 2}, { 1, 5}, { 2, 3}, { 1, 7}, { 2, 4}, @@ -84,7 +84,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( num_cpu_m = 0; while (i > 0){ - + width = blas_quickdivide(i + divM - num_cpu_m - 1, divM - num_cpu_m); i -= width; @@ -106,7 +106,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( num_cpu_n = 0; while (i > 0){ - + width = blas_quickdivide(i + divN - num_cpu_n - 1, divN - num_cpu_n); i -= width; @@ -134,15 +134,15 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( procs ++; } } - + if (procs) { queue[0].sa = sa; queue[0].sb = sb; queue[procs - 1].next = NULL; - + exec_blas(procs, queue); } - + return 0; } diff --git a/driver/level3/gemm_thread_n.c b/driver/level3/gemm_thread_n.c index 3e11f9aba..9668841bb 100644 --- a/driver/level3/gemm_thread_n.c +++ b/driver/level3/gemm_thread_n.c @@ -54,11 +54,11 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( range[0] = range_n[0]; i = range_n[1] - range_n[0]; } - + num_cpu = 0; while (i > 0){ - + width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); i -= width; @@ -81,7 +81,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; } - + if (num_cpu) { #if 0 //defined(LOONGSON3A) queue[0].sa = sa; @@ -91,10 +91,10 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( queue[0].sb = sb; #endif queue[num_cpu - 1].next = NULL; - + exec_blas(num_cpu, queue); } - + return 0; } diff --git a/driver/level3/gemm_thread_variable.c b/driver/level3/gemm_thread_variable.c index 9ffe17040..162a75f70 100644 --- a/driver/level3/gemm_thread_variable.c +++ b/driver/level3/gemm_thread_variable.c @@ -62,7 +62,7 @@ int CNAME(int mode, num_cpu_m = 0; while (i > 0){ - + width = blas_quickdivide(i + divM - num_cpu_m - 1, divM - num_cpu_m); i -= width; @@ -84,7 +84,7 @@ int CNAME(int mode, num_cpu_n = 0; while (i > 0){ - + width = blas_quickdivide(i + divN - num_cpu_n - 1, divN - num_cpu_n); i -= width; @@ -112,7 +112,7 @@ int CNAME(int mode, procs ++; } } - + if (procs) { queue[0].sa = sa; queue[0].sb = sb; @@ -121,7 +121,7 @@ int CNAME(int mode, exec_blas(procs, queue); } - + return 0; } diff --git a/driver/level3/level3.c b/driver/level3/level3.c index 5f746642c..261204099 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -241,7 +241,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, ) { #if defined(XDOUBLE) && defined(QUAD_PRECISION) xidouble xbeta; - + qtox(&xbeta, beta); #endif BETA_OPERATION(m_from, m_to, n_from, n_to, beta, c, ldc); @@ -287,7 +287,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(js = n_from; js < n_to; js += GEMM_R){ min_j = n_to - js; if (min_j > GEMM_R) min_j = GEMM_R; - + for(ls = 0; ls < k; ls += min_l){ min_l = k - ls; @@ -302,11 +302,11 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, gemm_p = ((l2size / min_l + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1)); while (gemm_p * min_l > l2size) gemm_p -= GEMM_UNROLL_M; } - + /* First, we have to move data A to L2 cache */ min_i = m_to - m_from; l1stride = 1; - + if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else { @@ -316,13 +316,13 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, l1stride = 0; } } - + START_RPCC(); - + ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); - + STOP_RPCC(innercost); - + #if defined(FUSED_GEMM) && !defined(TIMING) FUSED_KERNEL_OPERATION(min_i, min_j, min_l, alpha, @@ -344,16 +344,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif - + START_RPCC(); - - OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, + + OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE * l1stride); - + STOP_RPCC(outercost); - + START_RPCC(); - + #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) KERNEL_OPERATION(min_i, min_jj, min_l, alpha, sa, sb + min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); @@ -363,39 +363,39 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #endif STOP_RPCC(kernelcost); - } + } #endif - + for(is = m_from + min_i; is < m_to; is += min_i){ min_i = m_to - is; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; - } else + } else if (min_i > GEMM_P) { min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1); } - + START_RPCC(); - + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); - + STOP_RPCC(innercost); - + START_RPCC(); - + #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js); #else KERNEL_OPERATION(min_i, min_j, min_l, (void *)&xalpha, sa, sb, c, ldc, is, js); #endif - + STOP_RPCC(kernelcost); } /* end of is */ } /* end of js */ } /* end of ls */ - + #ifdef TIMING total = (double)outercost + (double)innercost + (double)kernelcost; diff --git a/driver/level3/level3_gemm3m_thread.c b/driver/level3/level3_gemm3m_thread.c index bcb0f9dd9..02bf57ee2 100644 --- a/driver/level3/level3_gemm3m_thread.c +++ b/driver/level3/level3_gemm3m_thread.c @@ -49,7 +49,7 @@ #endif //The array of job_t may overflow the stack. -//Instead, use malloc to alloc job_t. +//Instead, use malloc to alloc job_t. #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD #define USE_ALLOC_HEAP #endif @@ -362,12 +362,12 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #endif div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; - + buffer[0] = sb; for (i = 1; i < DIVIDE_RATE; i++) { buffer[i] = buffer[i - 1] + GEMM3M_Q * ((div_n + GEMM3M_UNROLL_N - 1) & ~(GEMM3M_UNROLL_N - 1)); } - + for(ls = 0; ls < k; ls += min_l){ min_l = k - ls; if (min_l >= GEMM3M_Q * 2) { @@ -379,7 +379,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } min_i = m_to - m_from; - + if (min_i >= GEMM3M_P * 2) { min_i = GEMM3M_P; } else { @@ -390,73 +390,73 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, START_RPCC(); - + ICOPYB_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); - + STOP_RPCC(copy_A); - + div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; - + for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) { - + START_RPCC(); - + /* Make sure if no one is using another buffer */ for (i = 0; i < args -> nthreads; i++) while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; - + STOP_RPCC(waiting1); - + for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(n_to, xxx + div_n) - jjs; if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; - + START_RPCC(); - + #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || defined(RN) || defined(RT) || defined(CN) || defined(CT) OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); #else OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); #endif - + STOP_RPCC(copy_B); - + START_RPCC(); - + KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA5, ALPHA6, sa, buffer[bufferside] + min_l * (jjs - xxx), c, ldc, m_from, jjs); - + STOP_RPCC(kernel); #ifdef TIMING ops += 2 * min_i * min_jj * min_l; #endif } - + for (i = 0; i < args -> nthreads; i++) job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; } - + current = mypos; - + do { current ++; if (current >= args -> nthreads) current = 0; - + div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; - + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { - + if (current != mypos) { - + START_RPCC(); - + /* thread has to wait */ while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; - + STOP_RPCC(waiting2); - + START_RPCC(); @@ -469,42 +469,42 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l; #endif } - + if (m_to - m_from == min_i) { job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; } } } while (current != mypos); - + for(is = m_from + min_i; is < m_to; is += min_i){ min_i = m_to - is; if (min_i >= GEMM3M_P * 2) { min_i = GEMM3M_P; - } else + } else if (min_i > GEMM3M_P) { min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); } - + START_RPCC(); - + ICOPYB_OPERATION(min_l, min_i, a, lda, ls, is, sa); - + STOP_RPCC(copy_A); - + current = mypos; do { - + div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; - + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { - + START_RPCC(); - + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA5, ALPHA6, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, is, xxx); - + STOP_RPCC(kernel); #ifdef TIMING ops += 2 * min_i * (range_n[current + 1] - range_n[current] - div_n) * min_l; @@ -514,38 +514,38 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; } } - + current ++; if (current >= args -> nthreads) current = 0; - + } while (current != mypos); - + } /* end of is */ - + START_RPCC(); - + ICOPYR_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); - + STOP_RPCC(copy_A); - + div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; - + for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) { - + START_RPCC(); - + /* Make sure if no one is using another buffer */ for (i = 0; i < args -> nthreads; i++) while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; - + STOP_RPCC(waiting1); - + for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(n_to, xxx + div_n) - jjs; if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; - + START_RPCC(); - + #if defined(NN) || defined(NT) || defined(TN) || defined(TT) OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) @@ -557,43 +557,43 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #endif STOP_RPCC(copy_B); - + START_RPCC(); - + KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA11, ALPHA12, sa, buffer[bufferside] + min_l * (jjs - xxx), c, ldc, m_from, jjs); - + STOP_RPCC(kernel); #ifdef TIMING ops += 2 * min_i * min_jj * min_l; #endif } - + for (i = 0; i < args -> nthreads; i++) job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; } - + current = mypos; - + do { current ++; if (current >= args -> nthreads) current = 0; - + div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; - + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { - + if (current != mypos) { - + START_RPCC(); - + /* thread has to wait */ while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; - + STOP_RPCC(waiting2); - + START_RPCC(); KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA11, ALPHA12, @@ -605,41 +605,41 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l; #endif } - + if (m_to - m_from == min_i) { job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; } } } while (current != mypos); - + for(is = m_from + min_i; is < m_to; is += min_i){ min_i = m_to - is; if (min_i >= GEMM3M_P * 2) { min_i = GEMM3M_P; - } else + } else if (min_i > GEMM3M_P) { min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); } - + START_RPCC(); - + ICOPYR_OPERATION(min_l, min_i, a, lda, ls, is, sa); - + STOP_RPCC(copy_A); - + current = mypos; do { - + div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; - + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { - + START_RPCC(); KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA11, ALPHA12, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, is, xxx); - + STOP_RPCC(kernel); #ifdef TIMING ops += 2 * min_i * (range_n[current + 1] - range_n[current] - div_n) * min_l; @@ -649,40 +649,40 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; } } - + current ++; if (current >= args -> nthreads) current = 0; - + } while (current != mypos); - + } /* end of is */ - + START_RPCC(); - + ICOPYI_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); - + STOP_RPCC(copy_A); - + div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; - + for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) { - + START_RPCC(); - + /* Make sure if no one is using another buffer */ for (i = 0; i < args -> nthreads; i++) while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; - + STOP_RPCC(waiting1); - + for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(n_to, xxx + div_n) - jjs; if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; - + START_RPCC(); - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); @@ -693,43 +693,43 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #endif STOP_RPCC(copy_B); - + START_RPCC(); - + KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA17, ALPHA18, sa, buffer[bufferside] + min_l * (jjs - xxx), c, ldc, m_from, jjs); - + STOP_RPCC(kernel); #ifdef TIMING ops += 2 * min_i * min_jj * min_l; #endif } - + for (i = 0; i < args -> nthreads; i++) job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; } - + current = mypos; - + do { current ++; if (current >= args -> nthreads) current = 0; - + div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; - + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { - + if (current != mypos) { - + START_RPCC(); - + /* thread has to wait */ while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; - + STOP_RPCC(waiting2); - + START_RPCC(); KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA17, ALPHA18, @@ -741,41 +741,41 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l; #endif } - + if (m_to - m_from == min_i) { job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; } } } while (current != mypos); - + for(is = m_from + min_i; is < m_to; is += min_i){ min_i = m_to - is; if (min_i >= GEMM3M_P * 2) { min_i = GEMM3M_P; - } else + } else if (min_i > GEMM3M_P) { min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); } - + START_RPCC(); - + ICOPYI_OPERATION(min_l, min_i, a, lda, ls, is, sa); - + STOP_RPCC(copy_A); - + current = mypos; do { - + div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; - + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { - + START_RPCC(); - + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA17, ALPHA18, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, is, xxx); - + STOP_RPCC(kernel); #ifdef TIMING ops += 2 * min_i * (range_n[current + 1] - range_n[current] - div_n) * min_l; @@ -785,16 +785,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; } } - + current ++; if (current >= args -> nthreads) current = 0; - + } while (current != mypos); - + } /* end of is */ } - + START_RPCC(); for (i = 0; i < args -> nthreads; i++) { @@ -862,7 +862,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG mode = BLAS_DOUBLE | BLAS_REAL | BLAS_NODE; #else mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE; -#endif +#endif newarg.m = args -> m; newarg.n = args -> n; @@ -886,7 +886,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG #endif newarg.common = (void *)job; - + if (!range_m) { range_M[0] = 0; m = args -> m; @@ -898,7 +898,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG num_cpu_m = 0; while (m > 0){ - + width = blas_quickdivide(m + nthreads - num_cpu_m - 1, nthreads - num_cpu_m); m -= width; @@ -919,10 +919,10 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG queue[i].sb = NULL; queue[i].next = &queue[i + 1]; } - + queue[0].sa = sa; queue[0].sb = sb; - + if (!range_n) { n_from = 0; n_to = args -> n; @@ -934,23 +934,23 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG for(js = n_from; js < n_to; js += GEMM_R * nthreads){ n = n_to - js; if (n > GEMM_R * nthreads) n = GEMM_R * nthreads; - + range_N[0] = js; num_cpu_n = 0; while (n > 0){ - + width = blas_quickdivide(n + nthreads - num_cpu_n - 1, nthreads - num_cpu_n); - + n -= width; if (n < 0) width = width + n; - + range_N[num_cpu_n + 1] = range_N[num_cpu_n] + width; - + num_cpu_n ++; } - + for (j = 0; j < num_cpu_m; j++) { for (i = 0; i < num_cpu_m; i++) { for (k = 0; k < DIVIDE_RATE; k++) { @@ -958,9 +958,9 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG } } } - + queue[num_cpu_m - 1].next = NULL; - + exec_blas(num_cpu_m, queue); } @@ -978,7 +978,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO BLASLONG nthreads = args -> nthreads; BLASLONG divN, divT; int mode; - + if (range_m) { BLASLONG m_from = *(((BLASLONG *)range_m) + 0); BLASLONG m_to = *(((BLASLONG *)range_m) + 1); @@ -1020,8 +1020,8 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif - +#endif + #if defined(TN) || defined(TT) || defined(TR) || defined(TC) || \ defined(CN) || defined(CT) || defined(CR) || defined(CC) mode |= (BLAS_TRANSA_T); @@ -1030,8 +1030,8 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO defined(NC) || defined(TC) || defined(RC) || defined(CC) mode |= (BLAS_TRANSB_T); #endif - - gemm_thread_n(mode, args, range_m, range_n, gemm_driver, sa, sb, divN); + + gemm_thread_n(mode, args, range_m, range_n, gemm_driver, sa, sb, divN); } return 0; diff --git a/driver/level3/level3_syr2k.c b/driver/level3/level3_syr2k.c index 2db18578b..a75d379d7 100644 --- a/driver/level3/level3_syr2k.c +++ b/driver/level3/level3_syr2k.c @@ -178,16 +178,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO min_l = k - ls; if (min_l >= GEMM_Q * 2) { min_l = GEMM_Q; - } else + } else if (min_l > GEMM_Q) { min_l = (min_l + 1) / 2; } min_i = m_end - m_start; - + if (min_i >= GEMM_P * 2) { min_i = GEMM_P; - } else + } else if (min_i > GEMM_P) { min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); } @@ -195,44 +195,44 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO #ifndef LOWER if (m_start >= js) { - + ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa); aa = sb + min_l * (m_start - js) * COMPSIZE; - + OCOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, aa); - + KERNEL_OPERATION(min_i, min_i, min_l, alpha, sa, aa, c, ldc, m_start, m_start, 1); - + jjs = m_start + min_i; } else { - + ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa); jjs = js; } - + for(; jjs < js + min_j; jjs += GEMM_UNROLL_MN){ min_jj = min_j + js - jjs; if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; - + OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); - + KERNEL_OPERATION(min_i, min_jj, min_l, alpha, sa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs, 1); } - + for(is = m_start + min_i; is < m_end; is += min_i){ min_i = m_end - is; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; - } else + } else if (min_i > GEMM_P) { min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); } - + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 1); @@ -243,50 +243,50 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO if (min_i >= GEMM_P * 2) { min_i = GEMM_P; - } else + } else if (min_i > GEMM_P) { min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); } if (m_start >= js) { - + ICOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, sa); aa = sb + min_l * (m_start - js) * COMPSIZE; - + OCOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, aa); - + KERNEL_OPERATION_C(min_i, min_i, min_l, alpha, sa, aa, c, ldc, m_start, m_start, 0); - + jjs = m_start + min_i; } else { - + ICOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, sa); jjs = js; } - + for(; jjs < js + min_j; jjs += GEMM_UNROLL_MN){ min_jj = min_j + js - jjs; if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; - + OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); - + KERNEL_OPERATION_C(min_i, min_jj, min_l, alpha, sa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs, 0); } - + for(is = m_start + min_i; is < m_end; is += min_i){ min_i = m_end - is; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; - } else + } else if (min_i > GEMM_P) { min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); } - + ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa); KERNEL_OPERATION_C(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 0); @@ -300,49 +300,49 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa); OCOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, aa); - + KERNEL_OPERATION(min_i, MIN(min_i, min_j + js - m_start), min_l, alpha, sa, aa, c, ldc, m_start, m_start, 1); for(jjs = js; jjs < m_start; jjs += GEMM_UNROLL_MN){ min_jj = m_start - jjs; if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; - + OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); - + KERNEL_OPERATION(min_i, min_jj, min_l, alpha, sa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs, 1); } for(is = m_start + min_i; is < m_end; is += min_i){ - + min_i = m_end - is; - + if (min_i >= GEMM_P * 2) { min_i = GEMM_P; - } else + } else if (min_i > GEMM_P) { min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); } - + aa = sb + min_l * (is - js) * COMPSIZE; if (is < js + min_j) { - + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); - + OCOPY_OPERATION(min_l, min_i, b, ldb, ls, is, aa); - + KERNEL_OPERATION(min_i, MIN(min_i, min_j - is + js), min_l, alpha, sa, aa, c, ldc, is, is, 1); - + KERNEL_OPERATION(min_i, is - js, min_l, alpha, sa, sb, c, ldc, is, js, 1); - + } else { - + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); - + KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 1); - + } } @@ -351,7 +351,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO if (min_i >= GEMM_P * 2) { min_i = GEMM_P; - } else + } else if (min_i > GEMM_P) { min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); } @@ -361,49 +361,49 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO ICOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, sa); OCOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, aa); - + KERNEL_OPERATION_C(min_i, MIN(min_i, min_j + js - m_start), min_l, alpha, sa, aa, c, ldc, m_start, m_start, 0); for(jjs = js; jjs < m_start; jjs += GEMM_UNROLL_MN){ min_jj = m_start - jjs; if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; - + OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); - + KERNEL_OPERATION_C(min_i, min_jj, min_l, alpha, sa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs, 0); } for(is = m_start + min_i; is < m_end; is += min_i){ - + min_i = m_end - is; - + if (min_i >= GEMM_P * 2) { min_i = GEMM_P; - } else + } else if (min_i > GEMM_P) { min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); } - + aa = sb + min_l * (is - js) * COMPSIZE; if (is < js + min_j) { - + ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa); - + OCOPY_OPERATION(min_l, min_i, a, lda, ls, is, aa); - + KERNEL_OPERATION_C(min_i, MIN(min_i, min_j - is + js), min_l, alpha, sa, aa, c, ldc, is, is, 0); - + KERNEL_OPERATION_C(min_i, is - js, min_l, alpha, sa, sb, c, ldc, is, js, 0); - + } else { - + ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa); - + KERNEL_OPERATION_C(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 0); - + } } diff --git a/driver/level3/level3_syrk.c b/driver/level3/level3_syrk.c index 249c140cd..ba544a00d 100644 --- a/driver/level3/level3_syrk.c +++ b/driver/level3/level3_syrk.c @@ -187,16 +187,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO min_l = k - ls; if (min_l >= GEMM_Q * 2) { min_l = GEMM_Q; - } else + } else if (min_l > GEMM_Q) { min_l = (min_l + 1) / 2; } min_i = m_end - m_start; - + if (min_i >= GEMM_P * 2) { min_i = GEMM_P; - } else + } else if (min_i > GEMM_P) { min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); } @@ -207,29 +207,29 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO aa = sb + min_l * MAX(m_start - js, 0) * COMPSIZE; if (!shared) aa = sa; - + for(jjs = MAX(m_start, js); jjs < js + min_j; jjs += min_jj){ min_jj = js + min_j - jjs; if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; - + if (!shared && (jjs - MAX(m_start, js) < min_i)) { START_RPCC(); - + ICOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sa + min_l * (jjs - js) * COMPSIZE); - + STOP_RPCC(innercost); } - + START_RPCC(); - + OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); - + STOP_RPCC(outercost); - + START_RPCC(); - + KERNEL_OPERATION(min_i, min_jj, min_l, alpha, aa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, MAX(m_start, js), jjs); - + STOP_RPCC(kernelcost); } @@ -237,30 +237,30 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO min_i = m_end - is; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; - } else + } else if (min_i > GEMM_P) { min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); } - + aa = sb + min_l * (is - js) * COMPSIZE; - + if (!shared) { - + START_RPCC(); - + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); - + STOP_RPCC(innercost); aa = sa; } START_RPCC(); - + KERNEL_OPERATION(min_i, min_j, min_l, alpha, aa, sb, c, ldc, is, js); - + STOP_RPCC(kernelcost); - + } } @@ -268,27 +268,27 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO if (m_start < js) { if (m_end < js) { - + START_RPCC(); - + ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa); - + STOP_RPCC(innercost); - + for(jjs = js; jjs < js + min_j; jjs += GEMM_UNROLL_MN){ min_jj = min_j + js - jjs; if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; - + START_RPCC(); - + OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); - + STOP_RPCC(outercost); - + START_RPCC(); - + KERNEL_OPERATION(min_i, min_jj, min_l, alpha, sa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs); - + STOP_RPCC(kernelcost); } @@ -301,180 +301,180 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO min_i = MIN(m_end, js)- is; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; - } else + } else if (min_i > GEMM_P) { min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); } - + START_RPCC(); - + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); - + STOP_RPCC(innercost); - + START_RPCC(); - + KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js); - + STOP_RPCC(kernelcost); - + } } #else if (m_start < js + min_j) { - + aa = sb + min_l * (m_start - js) * COMPSIZE; - + if (!shared) { START_RPCC(); - + ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa); - + STOP_RPCC(innercost); - + } START_RPCC(); - + OCOPY_OPERATION(min_l, (shared? (min_i) : MIN(min_i, min_j + js - m_start)), a, lda, ls, m_start, aa); - + STOP_RPCC(outercost); START_RPCC(); - + KERNEL_OPERATION(min_i, MIN(min_i, min_j + js - m_start), min_l, alpha, (shared? (aa) : (sa)), aa, c, ldc, m_start, m_start); - + STOP_RPCC(kernelcost); for(jjs = js; jjs < m_start; jjs += GEMM_UNROLL_N){ min_jj = m_start - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - + START_RPCC(); - + OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); - + STOP_RPCC(outercost); - + START_RPCC(); KERNEL_OPERATION(min_i, min_jj, min_l, alpha, (shared? (aa) : (sa)), sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs); - + STOP_RPCC(kernelcost); - + } for(is = m_start + min_i; is < m_end; is += min_i){ - + min_i = m_end - is; - + if (min_i >= GEMM_P * 2) { min_i = GEMM_P; - } else + } else if (min_i > GEMM_P) { min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); } - + if (is < js + min_j) { - + if (!shared) { START_RPCC(); - + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); - + STOP_RPCC(innercost); } aa = sb + min_l * (is - js) * COMPSIZE; - + START_RPCC(); - + OCOPY_OPERATION(min_l, (shared? (min_i) : MIN(min_i, min_j - is + js)), a, lda, ls, is, aa); - + STOP_RPCC(outercost); - + START_RPCC(); - + KERNEL_OPERATION(min_i, MIN(min_i, min_j - is + js), min_l, alpha, (shared? (aa) : (sa)), aa, c, ldc, is, is); - + STOP_RPCC(kernelcost); - + START_RPCC(); - + KERNEL_OPERATION(min_i, is - js, min_l, alpha, (shared? (aa) : (sa)), sb, c, ldc, is, js); - + STOP_RPCC(kernelcost); - + } else { - + START_RPCC(); - + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); - + STOP_RPCC(innercost); - + START_RPCC(); - + KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js); - + STOP_RPCC(kernelcost); - + } - + } } else { START_RPCC(); - + ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa); - + STOP_RPCC(innercost); - + for(jjs = js; jjs < min_j; jjs += GEMM_UNROLL_N){ min_jj = min_j - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - + START_RPCC(); - + OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); - + STOP_RPCC(outercost); - + START_RPCC(); - + KERNEL_OPERATION(min_i, min_jj, min_l, alpha, sa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs); - + STOP_RPCC(kernelcost); - + } - + for(is = m_start + min_i; is < m_end; is += min_i){ - + min_i = m_end - is; - + if (min_i >= GEMM_P * 2) { min_i = GEMM_P; - } else + } else if (min_i > GEMM_P) { min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); } - + START_RPCC(); - + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); - + STOP_RPCC(innercost); - + START_RPCC(); - + KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js); - + STOP_RPCC(kernelcost); - + } } #endif diff --git a/driver/level3/level3_syrk_threaded.c b/driver/level3/level3_syrk_threaded.c index 4a3f7a89f..01c7b23ed 100644 --- a/driver/level3/level3_syrk_threaded.c +++ b/driver/level3/level3_syrk_threaded.c @@ -49,7 +49,7 @@ #endif //The array of job_t may overflow the stack. -//Instead, use malloc to alloc job_t. +//Instead, use malloc to alloc job_t. #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD #define USE_ALLOC_HEAP #endif @@ -217,7 +217,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for (i = 1; i < DIVIDE_RATE; i++) { buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE; } - + for(ls = 0; ls < k; ls += min_l){ min_l = k - ls; @@ -228,7 +228,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } min_i = m_to - m_from; - + if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else { @@ -244,22 +244,22 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #endif START_RPCC(); - + #ifndef LOWER ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); #else ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_to - min_i, sa); #endif - + STOP_RPCC(copy_A); - + div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); - + for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) { - + START_RPCC(); - + /* Make sure if no one is using buffer */ #ifndef LOWER for (i = 0; i < mypos; i++) @@ -267,9 +267,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for (i = mypos + 1; i < args -> nthreads; i++) #endif while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; - + STOP_RPCC(waiting1); - + #ifndef LOWER for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){ @@ -281,16 +281,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } else { if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; } - + START_RPCC(); - - OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, + + OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE); - + STOP_RPCC(copy_B); - + START_RPCC(); - + KERNEL_OPERATION(min_i, min_jj, min_l, alpha, sa, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE, c, ldc, m_from, jjs); @@ -310,20 +310,20 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, min_jj = MIN(m_to, xxx + div_n) - jjs; if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; - + START_RPCC(); - - OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, + + OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE); - + STOP_RPCC(copy_B); - + START_RPCC(); - + KERNEL_OPERATION(min_i, min_jj, min_l, alpha, sa, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE, c, ldc, m_to - min_i, jjs); - + STOP_RPCC(kernel); #ifdef TIMING @@ -333,7 +333,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } #endif - + #ifndef LOWER for (i = 0; i <= mypos; i++) #else @@ -344,7 +344,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, WMB; } - + #ifndef LOWER current = mypos + 1; while (current < args -> nthreads) { @@ -355,42 +355,42 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, div_n = ((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); - + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { - + START_RPCC(); - + /* thread has to wait */ while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; - + STOP_RPCC(waiting2); - + START_RPCC(); - + #ifndef LOWER KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], - c, ldc, + c, ldc, m_from, xxx); #else KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], - c, ldc, + c, ldc, m_to - min_i, xxx); #endif - + STOP_RPCC(kernel); #ifdef TIMING ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l; #endif - + if (m_to - m_from == min_i) { job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; } } - + #ifndef LOWER current ++; #else @@ -410,38 +410,38 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, if (min_i >= GEMM_P * 2) { min_i = GEMM_P; - } else + } else if (min_i > GEMM_P) { min_i = ((min_i + 1) / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); } START_RPCC(); - + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); - + STOP_RPCC(copy_A); - + current = mypos; do { - + div_n = ((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); - + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { - + START_RPCC(); KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, is, xxx); - + STOP_RPCC(kernel); #ifdef TIMING ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l; #endif - + #ifndef LOWER if (is + min_i >= m_to) { #else @@ -452,7 +452,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, WMB; } } - + #ifndef LOWER current ++; } while (current != args -> nthreads); @@ -460,11 +460,11 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, current --; } while (current >= 0); #endif - - + + } } - + START_RPCC(); for (i = 0; i < args -> nthreads; i++) { @@ -528,7 +528,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO double dnum; if ((nthreads == 1) || (args -> n < nthreads * SWITCH_RATIO)) { - SYRK_LOCAL(args, range_m, range_n, sa, sb, 0); + SYRK_LOCAL(args, range_m, range_n, sa, sb, 0); return 0; } @@ -542,7 +542,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO #else mode = BLAS_SINGLE | BLAS_REAL; mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; -#endif +#endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -553,7 +553,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO #else mode = BLAS_SINGLE | BLAS_COMPLEX; mask = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1; -#endif +#endif #endif newarg.m = args -> m; @@ -577,7 +577,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO #endif newarg.common = (void *)job; - + if (!range_n) { n_from = 0; n_to = args -> n; @@ -597,17 +597,17 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO dnum = (double)n * (double)n /(double)nthreads; while (i < n){ - + if (nthreads - num_cpu > 1) { - + double di = (double)i; - + width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask); - + if (num_cpu == 0) width = n - ((n - width) & ~mask); - + if ((width > n - i) || (width < mask)) width = n - i; - + } else { width = n - i; } @@ -622,7 +622,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - + num_cpu ++; i += width; } @@ -639,21 +639,21 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO dnum = (double)n * (double)n /(double)nthreads; while (i < n){ - + if (nthreads - num_cpu > 1) { - + double di = (double)i; - + width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask); - + if ((width > n - i) || (width < mask)) width = n - i; - + } else { width = n - i; } range[num_cpu + 1] = range[num_cpu] + width; - + queue[num_cpu].mode = mode; queue[num_cpu].routine = inner_thread; queue[num_cpu].args = &newarg; @@ -662,7 +662,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - + num_cpu ++; i += width; } @@ -680,14 +680,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO } } } - + queue[0].sa = sa; queue[0].sb = sb; queue[num_cpu - 1].next = NULL; - + exec_blas(num_cpu, queue); } - + #ifdef USE_ALLOC_HEAP free(job); #endif diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index ee1a8db7c..95860d0c0 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -49,7 +49,7 @@ #endif //The array of job_t may overflow the stack. -//Instead, use malloc to alloc job_t. +//Instead, use malloc to alloc job_t. #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD #define USE_ALLOC_HEAP #endif @@ -309,12 +309,12 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #endif div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; - + buffer[0] = sb; for (i = 1; i < DIVIDE_RATE; i++) { buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1)) * COMPSIZE; } - + for(ls = 0; ls < k; ls += min_l){ @@ -328,7 +328,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, l1stride = 1; min_i = m_to - m_from; - + if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else { @@ -340,23 +340,23 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } START_RPCC(); - + ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); - + STOP_RPCC(copy_A); - + div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; - + for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) { - + START_RPCC(); - + /* Make sure if no one is using buffer */ for (i = 0; i < args -> nthreads; i++) while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; - + STOP_RPCC(waiting1); - + #if defined(FUSED_GEMM) && !defined(TIMING) FUSED_KERNEL_OPERATION(min_i, MIN(n_to, xxx + div_n) - xxx, min_l, alpha, @@ -376,21 +376,21 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; -#endif +#endif START_RPCC(); - - OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, + + OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE * l1stride); - + STOP_RPCC(copy_B); - + START_RPCC(); - + KERNEL_OPERATION(min_i, min_jj, min_l, alpha, sa, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE * l1stride, c, ldc, m_from, jjs); - + STOP_RPCC(kernel); #ifdef TIMING @@ -399,30 +399,30 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } #endif - + for (i = 0; i < args -> nthreads; i++) job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; WMB; } current = mypos; - + do { current ++; if (current >= args -> nthreads) current = 0; - + div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; - + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { - + if (current != mypos) { - + START_RPCC(); - + /* thread has to wait */ while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; - + STOP_RPCC(waiting2); - + START_RPCC(); KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha, @@ -434,43 +434,43 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l; #endif } - + if (m_to - m_from == min_i) { job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; } } } while (current != mypos); - + for(is = m_from + min_i; is < m_to; is += min_i){ min_i = m_to - is; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; - } else + } else if (min_i > GEMM_P) { min_i = ((min_i + 1) / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1); } - + START_RPCC(); - + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); - + STOP_RPCC(copy_A); - + current = mypos; do { - + div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; - + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { - + START_RPCC(); - + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, is, xxx); - + STOP_RPCC(kernel); #ifdef TIMING @@ -483,16 +483,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, WMB; } } - + current ++; if (current >= args -> nthreads) current = 0; - + } while (current != mypos); - + } - + } - + START_RPCC(); for (i = 0; i < args -> nthreads; i++) { @@ -561,7 +561,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG mode = BLAS_DOUBLE | BLAS_REAL | BLAS_NODE; #else mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE; -#endif +#endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX | BLAS_NODE; @@ -569,7 +569,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG mode = BLAS_DOUBLE | BLAS_COMPLEX | BLAS_NODE; #else mode = BLAS_SINGLE | BLAS_COMPLEX | BLAS_NODE; -#endif +#endif #endif newarg.m = args -> m; @@ -594,7 +594,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG #endif newarg.common = (void *)job; - + #ifdef PARAMTEST newarg.gemm_p = args -> gemm_p; newarg.gemm_q = args -> gemm_q; @@ -612,7 +612,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG num_cpu_m = 0; while (m > 0){ - + width = blas_quickdivide(m + nthreads - num_cpu_m - 1, nthreads - num_cpu_m); m -= width; @@ -633,10 +633,10 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG queue[i].sb = NULL; queue[i].next = &queue[i + 1]; } - + queue[0].sa = sa; queue[0].sb = sb; - + if (!range_n) { n_from = 0; n_to = args -> n; @@ -648,23 +648,23 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG for(js = n_from; js < n_to; js += GEMM_R * nthreads){ n = n_to - js; if (n > GEMM_R * nthreads) n = GEMM_R * nthreads; - + range_N[0] = js; num_cpu_n = 0; while (n > 0){ - + width = blas_quickdivide(n + nthreads - num_cpu_n - 1, nthreads - num_cpu_n); - + n -= width; if (n < 0) width = width + n; - + range_N[num_cpu_n + 1] = range_N[num_cpu_n] + width; - + num_cpu_n ++; } - + for (j = 0; j < num_cpu_m; j++) { for (i = 0; i < num_cpu_m; i++) { for (k = 0; k < DIVIDE_RATE; k++) { @@ -672,7 +672,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG } } } - + queue[num_cpu_m - 1].next = NULL; exec_blas(num_cpu_m, queue); @@ -692,9 +692,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO BLASLONG nthreads = args -> nthreads; BLASLONG divN, divT; int mode; - + if (nthreads == 1) { - GEMM_LOCAL(args, range_m, range_n, sa, sb, 0); + GEMM_LOCAL(args, range_m, range_n, sa, sb, 0); return 0; } @@ -745,7 +745,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -753,9 +753,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif #endif - +#endif + #if defined(TN) || defined(TT) || defined(TR) || defined(TC) || \ defined(CN) || defined(CT) || defined(CR) || defined(CC) mode |= (BLAS_TRANSA_T); @@ -764,11 +764,11 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO defined(NC) || defined(TC) || defined(RC) || defined(CC) mode |= (BLAS_TRANSB_T); #endif - + #ifdef OS_WINDOWS - gemm_thread_n(mode, args, range_m, range_n, GEMM_LOCAL, sa, sb, divN); + gemm_thread_n(mode, args, range_m, range_n, GEMM_LOCAL, sa, sb, divN); #else - gemm_thread_n(mode, args, range_m, range_n, gemm_driver, sa, sb, divN); + gemm_thread_n(mode, args, range_m, range_n, gemm_driver, sa, sb, divN); #endif } diff --git a/driver/level3/syr2k_k.c b/driver/level3/syr2k_k.c index 01251d483..8df0f122f 100644 --- a/driver/level3/syr2k_k.c +++ b/driver/level3/syr2k_k.c @@ -78,7 +78,7 @@ static inline int syrk_beta(BLASLONG m_from, BLASLONG m_to, BLASLONG n_from, BLA #else - SCAL_K(MIN(m_to - i + m_from - n_from, m_to), 0, 0, alpha[0], + SCAL_K(MIN(m_to - i + m_from - n_from, m_to), 0, 0, alpha[0], #ifdef COMPLEX alpha[1], #endif diff --git a/driver/level3/syr2k_kernel.c b/driver/level3/syr2k_kernel.c index 8c476f50c..f9e4a4cda 100644 --- a/driver/level3/syr2k_kernel.c +++ b/driver/level3/syr2k_kernel.c @@ -56,7 +56,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, #ifdef COMPLEX alpha_i, #endif - a, b, c, ldc); + a, b, c, ldc); #endif return 0; } @@ -68,7 +68,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, #ifdef COMPLEX alpha_i, #endif - a, b, c, ldc); + a, b, c, ldc); #endif return 0; } @@ -81,7 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, #ifdef COMPLEX alpha_i, #endif - a, b, c, ldc); + a, b, c, ldc); #endif b += offset * k * COMPSIZE; c += offset * ldc * COMPSIZE; @@ -100,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, #endif a, b + (m + offset) * k * COMPSIZE, - c + (m + offset) * ldc * COMPSIZE, ldc); + c + (m + offset) * ldc * COMPSIZE, ldc); #endif n = m + offset; @@ -115,7 +115,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, #ifdef COMPLEX alpha_i, #endif - a, b, c, ldc); + a, b, c, ldc); #endif a -= offset * k * COMPSIZE; c -= offset * COMPSIZE; @@ -134,53 +134,53 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, #endif a + (n - offset) * k * COMPSIZE, b, - c + (n - offset) * COMPSIZE, ldc); + c + (n - offset) * COMPSIZE, ldc); #endif m = n + offset; if (m <= 0) return 0; } for (loop = 0; loop < n; loop += GEMM_UNROLL_MN) { - + int mm, nn; - + mm = (loop & ~(GEMM_UNROLL_MN - 1)); nn = MIN(GEMM_UNROLL_MN, n - loop); - + #ifndef LOWER GEMM_KERNEL_N(mm, nn, k, alpha_r, #ifdef COMPLEX alpha_i, #endif - a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc); + a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc); #endif - + if (flag) { - GEMM_BETA(nn, nn, 0, ZERO, + GEMM_BETA(nn, nn, 0, ZERO, #ifdef COMPLEX ZERO, #endif NULL, 0, NULL, 0, subbuffer, nn); - + GEMM_KERNEL_N(nn, nn, k, alpha_r, #ifdef COMPLEX alpha_i, #endif - a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn); + a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn); #ifndef LOWER - + for (j = 0; j < nn; j ++) { for (i = 0; i <= j; i ++) { #ifndef COMPLEX c[i + loop + (j + loop) * ldc] += subbuffer[i + j * nn] + subbuffer[j + i * nn]; #else - c[(i + loop + (j + loop) * ldc) * 2 + 0] += + c[(i + loop + (j + loop) * ldc) * 2 + 0] += subbuffer[(i + j * nn) * 2 + 0] + subbuffer[(j + i * nn) * 2 + 0]; - c[(i + loop + (j + loop) * ldc) * 2 + 1] += + c[(i + loop + (j + loop) * ldc) * 2 + 1] += subbuffer[(i + j * nn) * 2 + 1] + subbuffer[(j + i * nn) * 2 + 1]; #endif } @@ -189,7 +189,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, for (j = 0; j < nn; j ++) { for (i = j; i < nn; i ++) { #ifndef COMPLEX - c[i + loop + (j + loop) * ldc] += + c[i + loop + (j + loop) * ldc] += subbuffer[i + j * nn] + subbuffer[j + i * nn]; #else c[(i + loop + (j + loop) * ldc) * 2 + 0] += @@ -201,15 +201,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, } #endif } - + #ifdef LOWER GEMM_KERNEL_N(m - mm - nn, nn, k, alpha_r, #ifdef COMPLEX alpha_i, #endif - a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE, - c + (mm + nn + loop * ldc) * COMPSIZE, ldc); + a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE, + c + (mm + nn + loop * ldc) * COMPSIZE, ldc); #endif } diff --git a/driver/level3/syrk_k.c b/driver/level3/syrk_k.c index 9c9700ef3..08751dc8b 100644 --- a/driver/level3/syrk_k.c +++ b/driver/level3/syrk_k.c @@ -80,7 +80,7 @@ static inline int syrk_beta(BLASLONG m_from, BLASLONG m_to, BLASLONG n_from, BLA #else - SCAL_K(MIN(m_to - i + m_from - n_from, m_to), 0, 0, alpha[0], + SCAL_K(MIN(m_to - i + m_from - n_from, m_to), 0, 0, alpha[0], #ifdef COMPLEX alpha[1], #endif diff --git a/driver/level3/syrk_kernel.c b/driver/level3/syrk_kernel.c index 65d108a49..434d2f630 100644 --- a/driver/level3/syrk_kernel.c +++ b/driver/level3/syrk_kernel.c @@ -71,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, #ifdef COMPLEX alpha_i, #endif - a, b, c, ldc); + a, b, c, ldc); #endif return 0; } @@ -83,7 +83,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, #ifdef COMPLEX alpha_i, #endif - a, b, c, ldc); + a, b, c, ldc); #endif return 0; } @@ -95,7 +95,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, #ifdef COMPLEX alpha_i, #endif - a, b, c, ldc); + a, b, c, ldc); #endif b += offset * k * COMPSIZE; c += offset * ldc * COMPSIZE; @@ -114,7 +114,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, #endif a, b + (m + offset) * k * COMPSIZE, - c + (m + offset) * ldc * COMPSIZE, ldc); + c + (m + offset) * ldc * COMPSIZE, ldc); #endif n = m + offset; @@ -128,7 +128,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, #ifdef COMPLEX alpha_i, #endif - a, b, c, ldc); + a, b, c, ldc); #endif a -= offset * k * COMPSIZE; c -= offset * COMPSIZE; @@ -147,7 +147,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, #endif a + (n - offset) * k * COMPSIZE, b, - c + (n - offset) * COMPSIZE, ldc); + c + (n - offset) * COMPSIZE, ldc); #endif m = n + offset; @@ -167,21 +167,21 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, #ifdef COMPLEX alpha_i, #endif - a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc); + a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc); #endif - GEMM_BETA(nn, nn, 0, ZERO, + GEMM_BETA(nn, nn, 0, ZERO, #ifdef COMPLEX ZERO, #endif NULL, 0, NULL, 0, subbuffer, nn); - + GEMM_KERNEL(nn, nn, k, alpha_r, #ifdef COMPLEX alpha_i, #endif - a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn); + a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn); cc = c + (loop + loop * ldc) * COMPSIZE; ss = subbuffer; @@ -220,8 +220,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, #ifdef COMPLEX alpha_i, #endif - a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE, - c + (mm + nn + loop * ldc) * COMPSIZE, ldc); + a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE, + c + (mm + nn + loop * ldc) * COMPSIZE, ldc); #endif } diff --git a/driver/level3/syrk_thread.c b/driver/level3/syrk_thread.c index 837670b9f..0d9bdf209 100644 --- a/driver/level3/syrk_thread.c +++ b/driver/level3/syrk_thread.c @@ -52,7 +52,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( int num_cpu; int mask = 0; - + if (!(mode & BLAS_COMPLEX)) { switch (mode & BLAS_PREC) { @@ -83,7 +83,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( #endif } } - + n_from = 0; n_to = arg -> n; @@ -96,29 +96,29 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( nf = (double)(n_from); nt = (double)(n_to); - + dnum = (nt * nt - nf * nf) / (double)nthreads; - + num_cpu = 0; - + range[0] = n_from; i = n_from; - + while (i < n_to){ - + if (nthreads - num_cpu > 1) { - + di = (double)i; width = ((BLASLONG)( sqrt(di * di + dnum) - di) + mask) & ~mask; - + if ((width <= 0) || (width > n_to - i)) width = n_to - i; - + } else { width = n_to - i; } - + range[num_cpu + 1] = range[num_cpu] + width; - + queue[num_cpu].mode = mode; queue[num_cpu].routine = function; queue[num_cpu].args = arg; @@ -127,38 +127,38 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - + num_cpu ++; i += width; } - + } else { nf = (double)(arg -> n - n_from); nt = (double)(arg -> n - n_to); dnum = (nt * nt - nf * nf) / (double)nthreads; - + num_cpu = 0; - + range[0] = n_from; i = n_from; - + while (i < n_to){ - + if (nthreads - num_cpu > 1) { - + di = (double)(arg -> n - i); width = ((BLASLONG)(-sqrt(di * di + dnum) + di) + mask) & ~mask; - + if ((width <= 0) || (width > n_to - i)) width = n_to - i; - + } else { width = n_to - i; } - + range[num_cpu + 1] = range[num_cpu] + width; - + queue[num_cpu].mode = mode; queue[num_cpu].routine = function; queue[num_cpu].args = arg; @@ -167,7 +167,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - + num_cpu ++; i += width; } @@ -178,9 +178,9 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( queue[0].sa = sa; queue[0].sb = sb; queue[num_cpu - 1].next = NULL; - + exec_blas(num_cpu, queue); } - + return 0; } diff --git a/driver/level3/trmm_L.c b/driver/level3/trmm_L.c index 9e46df05c..c0a822b51 100644 --- a/driver/level3/trmm_L.c +++ b/driver/level3/trmm_L.c @@ -122,7 +122,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = min_l; if (min_i > GEMM_P) min_i = GEMM_P; - + START_RPCC(); #ifndef TRANSA @@ -130,7 +130,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO #else TRMM_ILNCOPY(min_l, min_i, a, lda, 0, 0, sa); #endif - + STOP_RPCC(innercost); for(jjs = js; jjs < js + min_j; jjs += min_jj){ @@ -140,16 +140,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO START_RPCC(); GEMM_ONCOPY(min_l, min_jj, b + (jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE); - + STOP_RPCC(outercost); - + START_RPCC(); - + TRMM_KERNEL_N(min_i, min_jj, min_l, dp1, #ifdef COMPLEX ZERO, #endif - sa, sb + min_l * (jjs - js) * COMPSIZE, b + (jjs * ldb) * COMPSIZE, ldb, 0); + sa, sb + min_l * (jjs - js) * COMPSIZE, b + (jjs * ldb) * COMPSIZE, ldb, 0); STOP_RPCC(trmmcost); } @@ -158,7 +158,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(is = min_i; is < min_l; is += GEMM_P){ min_i = min_l - is; if (min_i > GEMM_P) min_i = GEMM_P; - + START_RPCC(); #ifndef TRANSA @@ -166,16 +166,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO #else TRMM_ILNCOPY(min_l, min_i, a, lda, 0, is, sa); #endif - + STOP_RPCC(innercost); - + START_RPCC(); - + TRMM_KERNEL_N(min_i, min_j, min_l, dp1, #ifdef COMPLEX ZERO, #endif - sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is); + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is); STOP_RPCC(trmmcost); @@ -186,7 +186,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = ls; if (min_i > GEMM_P) min_i = GEMM_P; - + START_RPCC(); #ifndef TRANSA @@ -200,21 +200,21 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - + START_RPCC(); GEMM_ONCOPY(min_l, min_jj, b + (ls + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE); - + STOP_RPCC(gemmcost); - + START_RPCC(); - GEMM_KERNEL(min_i, min_jj, min_l, dp1, + GEMM_KERNEL(min_i, min_jj, min_l, dp1, #ifdef COMPLEX ZERO, #endif - sa, sb + min_l * (jjs - js) * COMPSIZE, - b + (jjs * ldb) * COMPSIZE, ldb); + sa, sb + min_l * (jjs - js) * COMPSIZE, + b + (jjs * ldb) * COMPSIZE, ldb); STOP_RPCC(gemmcost); } @@ -222,7 +222,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(is = min_i; is < ls; is += GEMM_P){ min_i = ls - is; if (min_i > GEMM_P) min_i = GEMM_P; - + START_RPCC(); #ifndef TRANSA @@ -235,19 +235,19 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO START_RPCC(); - GEMM_KERNEL(min_i, min_j, min_l, dp1, + GEMM_KERNEL(min_i, min_j, min_l, dp1, #ifdef COMPLEX ZERO, #endif - sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); STOP_RPCC(gemmcost); } - + for(is = ls; is < ls + min_l; is += GEMM_P){ min_i = ls + min_l - is; if (min_i > GEMM_P) min_i = GEMM_P; - + START_RPCC(); #ifndef TRANSA @@ -255,7 +255,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO #else TRMM_ILNCOPY(min_l, min_i, a, lda, ls, is, sa); #endif - + STOP_RPCC(innercost); START_RPCC(); @@ -264,7 +264,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO #ifdef COMPLEX ZERO, #endif - sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - ls); + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - ls); STOP_RPCC(trmmcost); } @@ -275,7 +275,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = min_l; if (min_i > GEMM_P) min_i = GEMM_P; - + START_RPCC(); #ifndef TRANSA @@ -283,20 +283,20 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO #else TRMM_IUNCOPY(min_l, min_i, a, lda, m - min_l, m - min_l, sa); #endif - + STOP_RPCC(innercost); - + for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - + START_RPCC(); GEMM_ONCOPY(min_l, min_jj, b + (m - min_l + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE); - + STOP_RPCC(outercost); - + START_RPCC(); TRMM_KERNEL_T(min_i, min_jj, min_l, dp1, @@ -304,7 +304,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO ZERO, #endif sa, sb + min_l * (jjs - js) * COMPSIZE, - b + (m - min_l + jjs * ldb) * COMPSIZE, ldb, 0); + b + (m - min_l + jjs * ldb) * COMPSIZE, ldb, 0); STOP_RPCC(trmmcost); } @@ -312,7 +312,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(is = m - min_l + min_i; is < m; is += GEMM_P){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; - + START_RPCC(); #ifndef TRANSA @@ -320,16 +320,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO #else TRMM_IUNCOPY(min_l, min_i, a, lda, m - min_l, is, sa); #endif - + STOP_RPCC(innercost); - + START_RPCC(); TRMM_KERNEL_T(min_i, min_j, min_l, dp1, #ifdef COMPLEX ZERO, #endif - sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - m + min_l); + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - m + min_l); STOP_RPCC(trmmcost); } @@ -339,7 +339,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = min_l; if (min_i > GEMM_P) min_i = GEMM_P; - + START_RPCC(); #ifndef TRANSA @@ -347,18 +347,18 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO #else TRMM_IUNCOPY(min_l, min_i, a, lda, ls - min_l, ls - min_l, sa); #endif - + STOP_RPCC(innercost); for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - + START_RPCC(); GEMM_ONCOPY(min_l, min_jj, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE); - + STOP_RPCC(outercost); START_RPCC(); @@ -368,7 +368,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO ZERO, #endif sa, sb + min_l * (jjs - js) * COMPSIZE, - b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb, 0); + b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb, 0); STOP_RPCC(trmmcost); } @@ -376,7 +376,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(is = ls - min_l + min_i; is < ls; is += GEMM_P){ min_i = ls - is; if (min_i > GEMM_P) min_i = GEMM_P; - + START_RPCC(); #ifndef TRANSA @@ -384,7 +384,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO #else TRMM_IUNCOPY(min_l, min_i, a, lda, ls - min_l, is, sa); #endif - + STOP_RPCC(innercost); START_RPCC(); @@ -393,7 +393,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO #ifdef COMPLEX ZERO, #endif - sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - ls + min_l); + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - ls + min_l); STOP_RPCC(trmmcost); } @@ -402,7 +402,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(is = ls; is < m; is += GEMM_P){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; - + START_RPCC(); #ifndef TRANSA @@ -415,11 +415,11 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO START_RPCC(); - GEMM_KERNEL(min_i, min_j, min_l, dp1, + GEMM_KERNEL(min_i, min_j, min_l, dp1, #ifdef COMPLEX ZERO, #endif - sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); STOP_RPCC(gemmcost); } diff --git a/driver/level3/trmm_R.c b/driver/level3/trmm_R.c index e46553c3f..6012386c8 100644 --- a/driver/level3/trmm_R.c +++ b/driver/level3/trmm_R.c @@ -114,9 +114,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = m; if (min_i > GEMM_P) min_i = GEMM_P; - + GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); - + for(jjs = 0; jjs < ls - js; jjs += min_jj){ min_jj = ls - js - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; @@ -126,54 +126,54 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO #else GEMM_OTCOPY(min_l, min_jj, a + ((js + jjs) + ls * lda) * COMPSIZE, lda, sb + min_l * jjs * COMPSIZE); #endif - - GEMM_KERNEL(min_i, min_jj, min_l, dp1, + + GEMM_KERNEL(min_i, min_jj, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb + min_l * jjs * COMPSIZE, - b + ((js + jjs) * ldb) * COMPSIZE, ldb); + b + ((js + jjs) * ldb) * COMPSIZE, ldb); } for(jjs = 0; jjs < min_l; jjs += min_jj){ min_jj = min_l - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - + #ifndef TRANSA TRMM_OLNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * (ls - js + jjs) * COMPSIZE); #else TRMM_OUTCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * (ls - js + jjs) * COMPSIZE); #endif - + TRMM_KERNEL_T(min_i, min_jj, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb + (ls - js + jjs) * min_l * COMPSIZE, - b + ((ls + jjs) * ldb) * COMPSIZE, ldb, -jjs); + b + ((ls + jjs) * ldb) * COMPSIZE, ldb, -jjs); } for(is = min_i; is < m; is += GEMM_P){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; - + GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); - - GEMM_KERNEL(min_i, ls - js, min_l, dp1, + + GEMM_KERNEL(min_i, ls - js, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb, - b + (is + js * ldb) * COMPSIZE, ldb); - + b + (is + js * ldb) * COMPSIZE, ldb); + TRMM_KERNEL_T(min_i, min_l, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb + (ls - js) * min_l * COMPSIZE, - b + (is + ls * ldb) * COMPSIZE, ldb, 0); + b + (is + ls * ldb) * COMPSIZE, ldb, 0); } } @@ -183,9 +183,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = m; if (min_i > GEMM_P) min_i = GEMM_P; - + GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); - + for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; @@ -195,26 +195,26 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO #else GEMM_OTCOPY(min_l, min_jj, a + (jjs + ls * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); #endif - - GEMM_KERNEL(min_i, min_jj, min_l, dp1, + + GEMM_KERNEL(min_i, min_jj, min_l, dp1, #ifdef COMPLEX ZERO, #endif - sa, sb + min_l * (jjs - js) * COMPSIZE, - b + (jjs * ldb) * COMPSIZE, ldb); + sa, sb + min_l * (jjs - js) * COMPSIZE, + b + (jjs * ldb) * COMPSIZE, ldb); } - + for(is = min_i; is < m; is += GEMM_P){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; - + GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); - - GEMM_KERNEL(min_i, min_j, min_l, dp1, + + GEMM_KERNEL(min_i, min_j, min_l, dp1, #ifdef COMPLEX ZERO, #endif - sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); } } } @@ -225,7 +225,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(js = n; js > 0; js -= GEMM_R){ min_j = js; if (min_j > GEMM_R) min_j = GEMM_R; - + start_ls = js - min_j; while (start_ls + GEMM_Q < js) start_ls += GEMM_Q; @@ -234,7 +234,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = m; if (min_i > GEMM_P) min_i = GEMM_P; - + GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); for(jjs = 0; jjs < min_l; jjs += min_jj){ @@ -246,20 +246,20 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO #else TRMM_OLTCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * jjs * COMPSIZE); #endif - + TRMM_KERNEL_N(min_i, min_jj, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb + min_l * jjs * COMPSIZE, - b + ((ls + jjs) * ldb) * COMPSIZE, ldb, -jjs); + b + ((ls + jjs) * ldb) * COMPSIZE, ldb, -jjs); } - + for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){ min_jj = js - ls - min_l - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - + #ifndef TRANSA GEMM_ONCOPY(min_l, min_jj, a + (ls + (ls + min_l + jjs) * lda) * COMPSIZE, lda, sb + min_l * (min_l + jjs) * COMPSIZE); @@ -267,20 +267,20 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO GEMM_OTCOPY(min_l, min_jj, a + ((ls + min_l + jjs) + ls * lda) * COMPSIZE, lda, sb + min_l * (min_l + jjs) * COMPSIZE); #endif - - GEMM_KERNEL(min_i, min_jj, min_l, dp1, + + GEMM_KERNEL(min_i, min_jj, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb + min_l * (min_l + jjs) * COMPSIZE, - b + ((ls + min_l + jjs) * ldb) * COMPSIZE, ldb); + b + ((ls + min_l + jjs) * ldb) * COMPSIZE, ldb); } - + for(is = min_i; is < m; is += GEMM_P){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; - + GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); TRMM_KERNEL_N(min_i, min_l, min_l, dp1, @@ -289,16 +289,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO #endif sa, sb, - b + (is + ls * ldb) * COMPSIZE, ldb, 0); + b + (is + ls * ldb) * COMPSIZE, ldb, 0); if (js - ls - min_l > 0) { - GEMM_KERNEL(min_i, js - ls - min_l, min_l, dp1, + GEMM_KERNEL(min_i, js - ls - min_l, min_l, dp1, #ifdef COMPLEX ZERO, #endif sa, sb + min_l * min_l * COMPSIZE, - b + (is + (ls + min_l) * ldb) * COMPSIZE, ldb); + b + (is + (ls + min_l) * ldb) * COMPSIZE, ldb); } } } @@ -308,38 +308,38 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = m; if (min_i > GEMM_P) min_i = GEMM_P; - + GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); - + for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - + #ifndef TRANSA GEMM_ONCOPY(min_l, min_jj, a + (ls + (jjs - min_j) * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); #else GEMM_OTCOPY(min_l, min_jj, a + ((jjs - min_j) + ls * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); #endif - - GEMM_KERNEL(min_i, min_jj, min_l, dp1, + + GEMM_KERNEL(min_i, min_jj, min_l, dp1, #ifdef COMPLEX ZERO, #endif - sa, sb + min_l * (jjs - js) * COMPSIZE, - b + ((jjs - min_j) * ldb) * COMPSIZE, ldb); + sa, sb + min_l * (jjs - js) * COMPSIZE, + b + ((jjs - min_j) * ldb) * COMPSIZE, ldb); } - + for(is = min_i; is < m; is += GEMM_P){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; - + GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); - - GEMM_KERNEL(min_i, min_j, min_l, dp1, + + GEMM_KERNEL(min_i, min_j, min_l, dp1, #ifdef COMPLEX ZERO, #endif - sa, sb, b + (is + (js - min_j) * ldb) * COMPSIZE, ldb); + sa, sb, b + (is + (js - min_j) * ldb) * COMPSIZE, ldb); } } } diff --git a/driver/level3/trsm_L.c b/driver/level3/trsm_L.c index 2c3006f09..fa3b0d580 100644 --- a/driver/level3/trsm_L.c +++ b/driver/level3/trsm_L.c @@ -112,20 +112,20 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(js = 0; js < n; js += GEMM_R){ min_j = n - js; if (min_j > GEMM_R) min_j = GEMM_R; - + #if (!defined(UPPER) && !defined(TRANSA)) || (defined(UPPER) && defined(TRANSA)) for(ls = 0; ls < m; ls += GEMM_Q){ min_l = m - ls; if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = min_l; if (min_i > GEMM_P) min_i = GEMM_P; - + #ifndef TRANSA TRSM_ILTCOPY(min_l, min_i, a + (ls + ls * lda) * COMPSIZE, lda, 0, sa); #else TRSM_IUNCOPY(min_l, min_i, a + (ls + ls * lda) * COMPSIZE, lda, 0, sa); #endif - + for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; @@ -136,43 +136,43 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO #ifdef COMPLEX ZERO, #endif - sa, sb + min_l * (jjs - js) * COMPSIZE, + sa, sb + min_l * (jjs - js) * COMPSIZE, b + (ls + jjs * ldb) * COMPSIZE, ldb, 0); } for(is = ls + min_i; is < ls + min_l; is += GEMM_P){ min_i = ls + min_l - is; if (min_i > GEMM_P) min_i = GEMM_P; - + #ifndef TRANSA TRSM_ILTCOPY(min_l, min_i, a + (is + ls * lda) * COMPSIZE, lda, is - ls, sa); #else TRSM_IUNCOPY(min_l, min_i, a + (ls + is * lda) * COMPSIZE, lda, is - ls, sa); #endif - + TRSM_KERNEL(min_i, min_j, min_l, dm1, #ifdef COMPLEX ZERO, #endif sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - ls); } - + for(is = ls + min_l; is < m; is += GEMM_P){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; - + #ifndef TRANSA GEMM_ITCOPY(min_l, min_i, a + (is + ls * lda) * COMPSIZE, lda, sa); #else GEMM_INCOPY(min_l, min_i, a + (ls + is * lda) * COMPSIZE, lda, sa); #endif - - GEMM_KERNEL(min_i, min_j, min_l, dm1, + + GEMM_KERNEL(min_i, min_j, min_l, dm1, #ifdef COMPLEX ZERO, #endif - sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); } } #else @@ -197,19 +197,19 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; GEMM_ONCOPY(min_l, min_jj, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE); - + TRSM_KERNEL(min_i, min_jj, min_l, dm1, #ifdef COMPLEX ZERO, #endif - sa, sb + min_l * (jjs - js) * COMPSIZE, + sa, sb + min_l * (jjs - js) * COMPSIZE, b + (start_is + jjs * ldb) * COMPSIZE, ldb, start_is - ls + min_l); } - + for(is = start_is - GEMM_P; is >= ls - min_l; is -= GEMM_P){ min_i = ls - is; if (min_i > GEMM_P) min_i = GEMM_P; - + #ifndef TRANSA TRSM_IUTCOPY(min_l, min_i, a + (is + (ls - min_l) * lda) * COMPSIZE, lda, is - (ls - min_l), sa); #else @@ -219,26 +219,26 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO #ifdef COMPLEX ZERO, #endif - sa, sb, + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, + is - (ls - min_l) ); } - + for(is = 0; is < ls - min_l; is += GEMM_P){ min_i = ls - min_l - is; if (min_i > GEMM_P) min_i = GEMM_P; - + #ifndef TRANSA GEMM_ITCOPY(min_l, min_i, a + (is + (ls - min_l) * lda) * COMPSIZE, lda, sa); #else GEMM_INCOPY(min_l, min_i, a + ((ls - min_l) + is * lda) * COMPSIZE, lda, sa); #endif - GEMM_KERNEL(min_i, min_j, min_l, dm1, + GEMM_KERNEL(min_i, min_j, min_l, dm1, #ifdef COMPLEX ZERO, #endif - sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); } } diff --git a/driver/level3/trsm_R.c b/driver/level3/trsm_R.c index 0964d7860..b6ee95654 100644 --- a/driver/level3/trsm_R.c +++ b/driver/level3/trsm_R.c @@ -112,15 +112,15 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(js = 0; js < n; js += GEMM_R){ min_j = n - js; if (min_j > GEMM_R) min_j = GEMM_R; - + for(ls = 0; ls < js; ls += GEMM_Q){ min_l = js - ls; if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = m; if (min_i > GEMM_P) min_i = GEMM_P; - + GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); - + for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; @@ -131,25 +131,25 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO GEMM_OTCOPY(min_l, min_jj, a + (jjs + ls * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); #endif - GEMM_KERNEL(min_i, min_jj, min_l, dm1, + GEMM_KERNEL(min_i, min_jj, min_l, dm1, #ifdef COMPLEX ZERO, #endif sa, sb + min_l * (jjs - js) * COMPSIZE, - b + (jjs * ldb) * COMPSIZE, ldb); + b + (jjs * ldb) * COMPSIZE, ldb); } for(is = min_i; is < m; is += GEMM_P){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; - + GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); - - GEMM_KERNEL(min_i, min_j, min_l, dm1, + + GEMM_KERNEL(min_i, min_j, min_l, dm1, #ifdef COMPLEX ZERO, #endif - sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); } } @@ -160,25 +160,25 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO if (min_i > GEMM_P) min_i = GEMM_P; GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); - + #ifndef TRANSA TRSM_OUNCOPY(min_l, min_l, a + (ls + ls * lda) * COMPSIZE, lda, 0, sb); #else TRSM_OLTCOPY(min_l, min_l, a + (ls + ls * lda) * COMPSIZE, lda, 0, sb); #endif - + TRSM_KERNEL(min_i, min_l, min_l, dm1, #ifdef COMPLEX ZERO, #endif sa, - sb, + sb, b + (ls * ldb) * COMPSIZE, ldb, 0); - + for(jjs = 0; jjs < min_j - min_l - ls + js; jjs += min_jj){ min_jj = min_j - min_l - ls + js - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - + #ifndef TRANSA GEMM_ONCOPY (min_l, min_jj, a + (ls + (ls + min_l + jjs) * lda) * COMPSIZE, lda, sb + min_l * (min_l + jjs) * COMPSIZE); @@ -187,36 +187,36 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO sb + min_l * (min_l + jjs) * COMPSIZE); #endif - GEMM_KERNEL(min_i, min_jj, min_l, dm1, + GEMM_KERNEL(min_i, min_jj, min_l, dm1, #ifdef COMPLEX ZERO, #endif - sa, + sa, sb + min_l * (min_l + jjs) * COMPSIZE, - b + (min_l + ls + jjs) * ldb * COMPSIZE, ldb); + b + (min_l + ls + jjs) * ldb * COMPSIZE, ldb); } for(is = min_i; is < m; is += GEMM_P){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; - + GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); - + TRSM_KERNEL(min_i, min_l, min_l, dm1, #ifdef COMPLEX ZERO, #endif sa, - sb, + sb, b + (is + ls * ldb) * COMPSIZE, ldb, 0); - - GEMM_KERNEL(min_i, min_j - min_l + js - ls, min_l, dm1, + + GEMM_KERNEL(min_i, min_j - min_l + js - ls, min_l, dm1, #ifdef COMPLEX ZERO, #endif - sa, + sa, sb + min_l * min_l * COMPSIZE, - b + (is + ( min_l + ls) * ldb) * COMPSIZE, ldb); + b + (is + ( min_l + ls) * ldb) * COMPSIZE, ldb); } } } @@ -235,48 +235,48 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO if (min_i > GEMM_P) min_i = GEMM_P; GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); - + for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - + #ifndef TRANSA GEMM_ONCOPY(min_l, min_jj, a + (ls + (jjs - min_j) * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); #else GEMM_OTCOPY(min_l, min_jj, a + ((jjs - min_j) + ls * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); #endif - GEMM_KERNEL(min_i, min_jj, min_l, dm1, + GEMM_KERNEL(min_i, min_jj, min_l, dm1, #ifdef COMPLEX ZERO, #endif sa, sb + min_l * (jjs - js) * COMPSIZE, - b + (jjs - min_j) * ldb * COMPSIZE, ldb); + b + (jjs - min_j) * ldb * COMPSIZE, ldb); } for(is = min_i; is < m; is += GEMM_P){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; - + GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); - - GEMM_KERNEL(min_i, min_j, min_l, dm1, + + GEMM_KERNEL(min_i, min_j, min_l, dm1, #ifdef COMPLEX ZERO, #endif - sa, sb, b + (is + (js - min_j) * ldb) * COMPSIZE, ldb); + sa, sb, b + (is + (js - min_j) * ldb) * COMPSIZE, ldb); } } start_ls = js - min_j; while (start_ls + GEMM_Q < js) start_ls += GEMM_Q; - + for(ls = start_ls; ls >= js - min_j; ls -= GEMM_Q){ min_l = js - ls; if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = m; if (min_i > GEMM_P) min_i = GEMM_P; - + GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); #ifndef TRANSA @@ -286,63 +286,63 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO TRSM_OUTCOPY(min_l, min_l, a + (ls + ls * lda) * COMPSIZE, lda, 0, sb + min_l * (min_j - js + ls) * COMPSIZE); #endif - + TRSM_KERNEL(min_i, min_l, min_l, dm1, #ifdef COMPLEX ZERO, #endif sa, - sb + min_l * (min_j - js + ls) * COMPSIZE, + sb + min_l * (min_j - js + ls) * COMPSIZE, b + (ls * ldb) * COMPSIZE, ldb, 0); - + for(jjs = 0; jjs < min_j - js + ls; jjs += min_jj){ min_jj = min_j - js + ls - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - + #ifndef TRANSA GEMM_ONCOPY (min_l, min_jj, a + (ls + (js - min_j + jjs) * lda) * COMPSIZE, lda, sb + min_l * jjs * COMPSIZE); #else - GEMM_OTCOPY (min_l, min_jj, a + ((js - min_j + jjs) + ls * lda) * COMPSIZE, lda, + GEMM_OTCOPY (min_l, min_jj, a + ((js - min_j + jjs) + ls * lda) * COMPSIZE, lda, sb + min_l * jjs * COMPSIZE); #endif - - GEMM_KERNEL(min_i, min_jj, min_l, dm1, + + GEMM_KERNEL(min_i, min_jj, min_l, dm1, #ifdef COMPLEX ZERO, #endif - sa, + sa, sb + min_l * jjs * COMPSIZE, - b + (js - min_j + jjs) * ldb * COMPSIZE, ldb); + b + (js - min_j + jjs) * ldb * COMPSIZE, ldb); } for(is = min_i; is < m; is += GEMM_P){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; - + GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); - + TRSM_KERNEL(min_i, min_l, min_l, dm1, #ifdef COMPLEX ZERO, #endif sa, - sb + min_l * (min_j - js + ls) * COMPSIZE, + sb + min_l * (min_j - js + ls) * COMPSIZE, b + (is + ls * ldb) * COMPSIZE, ldb, 0); - - GEMM_KERNEL(min_i, min_j - js + ls, min_l, dm1, + + GEMM_KERNEL(min_i, min_j - js + ls, min_l, dm1, #ifdef COMPLEX ZERO, #endif - sa, + sa, sb, - b + (is + (js - min_j) * ldb) * COMPSIZE, ldb); + b + (is + (js - min_j) * ldb) * COMPSIZE, ldb); } } } - + #endif - + return 0; } diff --git a/driver/level3/zher2k_k.c b/driver/level3/zher2k_k.c index 93bb781f1..54c76d7f5 100644 --- a/driver/level3/zher2k_k.c +++ b/driver/level3/zher2k_k.c @@ -130,7 +130,7 @@ static inline int syrk_beta(BLASLONG m_from, BLASLONG m_to, BLASLONG n_from, BLA SCAL_K(MIN(i + n_from - m_from + 1, m_to) * COMPSIZE, 0, 0, alpha[0], c, 1, NULL, 0, NULL, 0); - if (i + n_from - m_from + 1 <= m_to) + if (i + n_from - m_from + 1 <= m_to) *(c + (i + n_from - m_from) * COMPSIZE + 1) = ZERO; c += ldc * COMPSIZE; diff --git a/driver/level3/zher2k_kernel.c b/driver/level3/zher2k_kernel.c index 9b4c45033..92aef8880 100644 --- a/driver/level3/zher2k_kernel.c +++ b/driver/level3/zher2k_kernel.c @@ -61,7 +61,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, #ifdef COMPLEX alpha_i, #endif - a, b, c, ldc); + a, b, c, ldc); #endif return 0; } @@ -73,7 +73,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, #ifdef COMPLEX alpha_i, #endif - a, b, c, ldc); + a, b, c, ldc); #endif return 0; } @@ -86,7 +86,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, #ifdef COMPLEX alpha_i, #endif - a, b, c, ldc); + a, b, c, ldc); #endif b += offset * k * COMPSIZE; c += offset * ldc * COMPSIZE; @@ -105,7 +105,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, #endif a, b + (m + offset) * k * COMPSIZE, - c + (m + offset) * ldc * COMPSIZE, ldc); + c + (m + offset) * ldc * COMPSIZE, ldc); #endif n = m + offset; @@ -120,7 +120,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, #ifdef COMPLEX alpha_i, #endif - a, b, c, ldc); + a, b, c, ldc); #endif a -= offset * k * COMPSIZE; c -= offset * COMPSIZE; @@ -139,30 +139,30 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, #endif a + (n - offset) * k * COMPSIZE, b, - c + (n - offset) * COMPSIZE, ldc); + c + (n - offset) * COMPSIZE, ldc); #endif m = n + offset; if (m <= 0) return 0; } for (loop = 0; loop < n; loop += GEMM_UNROLL_MN) { - + int mm, nn; - + mm = (loop & ~(GEMM_UNROLL_MN - 1)); nn = MIN(GEMM_UNROLL_MN, n - loop); - + #ifndef LOWER GEMM_KERNEL(mm, nn, k, alpha_r, #ifdef COMPLEX alpha_i, #endif - a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc); + a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc); #endif - + if (flag) { - GEMM_BETA(nn, nn, 0, ZERO, + GEMM_BETA(nn, nn, 0, ZERO, #ifdef COMPLEX ZERO, #endif @@ -173,17 +173,17 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, #ifdef COMPLEX alpha_i, #endif - a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn); + a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn); #ifndef LOWER - + for (j = 0; j < nn; j ++) { for (i = 0; i <= j; i ++) { - c[(i + loop + (j + loop) * ldc) * 2 + 0] += + c[(i + loop + (j + loop) * ldc) * 2 + 0] += subbuffer[(i + j * nn) * 2 + 0] + subbuffer[(j + i * nn) * 2 + 0]; if (i != j) { - c[(i + loop + (j + loop) * ldc) * 2 + 1] += + c[(i + loop + (j + loop) * ldc) * 2 + 1] += subbuffer[(i + j * nn) * 2 + 1] - subbuffer[(j + i * nn) * 2 + 1]; } else { c[(i + loop + (j + loop) * ldc) * 2 + 1] = ZERO; @@ -205,15 +205,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, } #endif } - + #ifdef LOWER GEMM_KERNEL(m - mm - nn, nn, k, alpha_r, #ifdef COMPLEX alpha_i, #endif - a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE, - c + (mm + nn + loop * ldc) * COMPSIZE, ldc); + a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE, + c + (mm + nn + loop * ldc) * COMPSIZE, ldc); #endif } diff --git a/driver/level3/zherk_k.c b/driver/level3/zherk_k.c index d1ffbdb12..2203fc5c1 100644 --- a/driver/level3/zherk_k.c +++ b/driver/level3/zherk_k.c @@ -128,7 +128,7 @@ static inline int syrk_beta(BLASLONG m_from, BLASLONG m_to, BLASLONG n_from, BLA SCAL_K(MIN(i + n_from - m_from + 1, m_to) * COMPSIZE, 0, 0, alpha[0], c, 1, NULL, 0, NULL, 0); - if (i + n_from - m_from + 1 <= m_to) + if (i + n_from - m_from + 1 <= m_to) *(c + (i + n_from - m_from) * COMPSIZE + 1) = ZERO; c += ldc * COMPSIZE; diff --git a/driver/level3/zherk_kernel.c b/driver/level3/zherk_kernel.c index fd8ff9cf3..e4c9e27c4 100644 --- a/driver/level3/zherk_kernel.c +++ b/driver/level3/zherk_kernel.c @@ -59,7 +59,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, #ifndef LOWER GEMM_KERNEL(m, n, k, alpha_r, ZERO, - a, b, c, ldc); + a, b, c, ldc); #endif return 0; } @@ -68,7 +68,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, #ifdef LOWER GEMM_KERNEL(m, n, k, alpha_r, ZERO, - a, b, c, ldc); + a, b, c, ldc); #endif return 0; } @@ -78,7 +78,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, #ifdef LOWER GEMM_KERNEL(m, offset, k, alpha_r, ZERO, - a, b, c, ldc); + a, b, c, ldc); #endif b += offset * k * COMPSIZE; c += offset * ldc * COMPSIZE; @@ -94,7 +94,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, alpha_r, ZERO, a, b + (m + offset) * k * COMPSIZE, - c + (m + offset) * ldc * COMPSIZE, ldc); + c + (m + offset) * ldc * COMPSIZE, ldc); #endif n = m + offset; @@ -106,7 +106,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, #ifndef LOWER GEMM_KERNEL(-offset, n, k, alpha_r, ZERO, - a, b, c, ldc); + a, b, c, ldc); #endif a -= offset * k * COMPSIZE; c -= offset * COMPSIZE; @@ -122,7 +122,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, alpha_r, ZERO, a + (n - offset) * k * COMPSIZE, b, - c + (n - offset) * COMPSIZE, ldc); + c + (n - offset) * COMPSIZE, ldc); #endif m = n + offset; if (m <= 0) return 0; @@ -138,7 +138,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, #ifndef LOWER GEMM_KERNEL(mm, nn, k, alpha_r, ZERO, - a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc); + a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc); #endif GEMM_BETA(nn, nn, 0, ZERO, ZERO, @@ -146,8 +146,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, GEMM_KERNEL(nn, nn, k, alpha_r, ZERO, - a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn); - + a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn); + cc = c + (loop + loop * ldc) * COMPSIZE; ss = subbuffer; @@ -158,7 +158,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, cc[i * 2 + 0] += ss[i * 2 + 0]; cc[i * 2 + 1] += ss[i * 2 + 1]; } - + cc[j * 2 + 0] += ss[i * 2 + 0]; cc[j * 2 + 1] = ZERO; @@ -184,8 +184,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, #ifdef LOWER GEMM_KERNEL(m - mm - nn, nn, k, alpha_r, ZERO, - a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE, - c + (mm + nn + loop * ldc) * COMPSIZE, ldc); + a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE, + c + (mm + nn + loop * ldc) * COMPSIZE, ldc); #endif } diff --git a/driver/level3/zsyrk_beta.c b/driver/level3/zsyrk_beta.c index eb0972975..3787e31b5 100644 --- a/driver/level3/zsyrk_beta.c +++ b/driver/level3/zsyrk_beta.c @@ -42,7 +42,7 @@ int CNAME(BLASLONG dummy1, BLASLONG n, BLASLONG dummy2, FLOAT alpha_r, FLOAT alpha_i, FLOAT *dummy3, BLASLONG dummy4, FLOAT *dummy5, BLASLONG dummy6, - FLOAT *c, BLASLONG ldc, + FLOAT *c, BLASLONG ldc, FLOAT *dummy7, FLOAT *dummy8, BLASLONG from, BLASLONG to){ BLASLONG i; diff --git a/driver/mapper/mapper.c b/driver/mapper/mapper.c index 83805fb1e..bbf499fce 100644 --- a/driver/mapper/mapper.c +++ b/driver/mapper/mapper.c @@ -92,7 +92,7 @@ static int mapper_release(struct inode *inode, struct file *fp){ #ifdef CONFIG_BIGPHYS_AREA bigphysarea_free_pages(buffer[pos].address); #else - + for (addr = buffer[pos].address; addr < buffer[pos].address + buffer[pos].size; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); } @@ -121,7 +121,7 @@ int mapper_mapper(struct file *fp, struct vm_area_struct *vma){ all_length = vma->vm_end - vma->vm_start; current_addr = vma -> vm_start; - + spin_lock(&lock); while (all_length > 0) { @@ -133,56 +133,56 @@ int mapper_mapper(struct file *fp, struct vm_area_struct *vma){ pos = 0; while ((pos < MAX_BUFF_SIZE) && (buffer[pos].address != 0)) pos ++; - + if (pos >= MAX_BUFF_SIZE) { - + printk("Memory Allocator : too much memory allocation requested.\n"); spin_unlock(&lock); - + return -EIO; } - + #ifdef CONFIG_BIGPHYS_AREA alloc_addr = (caddr_t)bigphysarea_alloc_pages(length >> PAGE_SHIFT, 1, GFP_KERNEL); #else alloc_addr = (caddr_t)kmalloc(length, GFP_KERNEL); #endif - + if (alloc_addr == (caddr_t)NULL) { - + spin_unlock(&lock); - + return -EIO; } - + #ifndef CONFIG_BIGPHYS_AREA for (addr = alloc_addr; addr < alloc_addr + length; addr += PAGE_SIZE) { clear_page(addr); SetPageReserved(virt_to_page(addr)); } #endif - + if ((ret = remap_pfn_range(vma, current_addr, virt_to_phys((void *)alloc_addr) >> PAGE_SHIFT, length, PAGE_SHARED)) < 0) { - + #ifdef CONFIG_BIGPHYS_AREA bigphysarea_free_pages((caddr_t)alloc_addr); #else - + for (addr = alloc_addr; addr < alloc_addr + length; addr += PAGE_SIZE) ClearPageReserved(virt_to_page(addr)); - + kfree((caddr_t)alloc_addr); #endif - + spin_unlock(&lock); - + return ret; } - + buffer[pos].pid = current -> tgid; buffer[pos].address = alloc_addr; #ifndef CONFIG_BIGPHYS_AREA @@ -209,11 +209,11 @@ static int __init mapper_init(void){ int ret, i; ret = alloc_chrdev_region(&mapper_dev, 0, 1, "mapper"); - + cdev_init(&mapper_cdev, &mapper_fops); ret = cdev_add(&mapper_cdev, mapper_dev, 1); - + spin_lock_init(&lock); for (i = 0; i < MAX_BUFF_SIZE; i++) { @@ -240,7 +240,7 @@ static void __exit mapper_exit(void){ #endif } } - + cdev_del(&mapper_cdev); unregister_chrdev_region(mapper_dev, 1); diff --git a/driver/others/Makefile b/driver/others/Makefile index ca05c5129..fc73871cc 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -1,14 +1,14 @@ TOPDIR = ../.. include ../../Makefile.system -COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX) +COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX) #COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) ifdef SMP -COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) +COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) ifndef NO_AFFINITY -COMMONOBJS += init.$(SUFFIX) +COMMONOBJS += init.$(SUFFIX) endif endif @@ -56,13 +56,13 @@ ifeq ($(USE_OPENMP), 1) BLAS_SERVER = blas_server_omp.c else ifeq ($(OSNAME), WINNT) -BLAS_SERVER = blas_server_win32.c +BLAS_SERVER = blas_server_win32.c endif ifeq ($(OSNAME), CYGWIN_NT) -BLAS_SERVER = blas_server_win32.c +BLAS_SERVER = blas_server_win32.c endif ifeq ($(OSNAME), Interix) -BLAS_SERVER = blas_server_win32.c +BLAS_SERVER = blas_server_win32.c endif endif diff --git a/driver/others/blas_l1_thread.c b/driver/others/blas_l1_thread.c index 851135b10..83fc26884 100644 --- a/driver/others/blas_l1_thread.c +++ b/driver/others/blas_l1_thread.c @@ -42,9 +42,9 @@ int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, - void *b, BLASLONG ldb, + void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads){ - + blas_queue_t queue[MAX_CPU_NUMBER]; blas_arg_t args [MAX_CPU_NUMBER]; @@ -52,23 +52,23 @@ int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha int num_cpu, calc_type; calc_type = (mode & BLAS_PREC) + ((mode & BLAS_COMPLEX) != 0) + 2; - + mode |= BLAS_LEGACY; for (i = 0; i < nthreads; i++) blas_queue_init(&queue[i]); num_cpu = 0; i = m; - + while (i > 0){ - + /* Adjust Parameters */ width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); i -= width; if (i < 0) width = width + i; - + astride = width * lda; if (!(mode & BLAS_TRANSB_T)) { @@ -95,10 +95,10 @@ int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha queue[num_cpu].routine = function; queue[num_cpu].args = &args[num_cpu]; queue[num_cpu].next = &queue[num_cpu + 1]; - + a = (void *)((BLASULONG)a + astride); b = (void *)((BLASULONG)b + bstride); - + num_cpu ++; } diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 1735ee931..95b5965e1 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -13,19 +13,19 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ @@ -178,8 +178,8 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ #ifdef EXPRECISION if (mode & BLAS_XDOUBLE){ /* REAL / Extended Double */ - void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, - xdouble *, BLASLONG, xdouble *, BLASLONG, + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *) = func; afunc(args -> m, args -> n, args -> k, @@ -187,14 +187,14 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); - } else + } else #endif if (mode & BLAS_DOUBLE){ /* REAL / Double */ - void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, - double *, BLASLONG, double *, BLASLONG, + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *) = func; - + afunc(args -> m, args -> n, args -> k, ((double *)args -> alpha)[0], args -> a, args -> lda, @@ -202,10 +202,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> c, args -> ldc, sb); } else { /* REAL / Single */ - void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, - float *, BLASLONG, float *, BLASLONG, + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *) = func; - + afunc(args -> m, args -> n, args -> k, ((float *)args -> alpha)[0], args -> a, args -> lda, @@ -217,7 +217,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ if (mode & BLAS_XDOUBLE){ /* COMPLEX / Extended Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, - xdouble *, BLASLONG, xdouble *, BLASLONG, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *) = func; afunc(args -> m, args -> n, args -> k, @@ -231,7 +231,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ if (mode & BLAS_DOUBLE){ /* COMPLEX / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, - double *, BLASLONG, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *) = func; afunc(args -> m, args -> n, args -> k, @@ -243,7 +243,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ } else { /* COMPLEX / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, - float *, BLASLONG, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *) = func; afunc(args -> m, args -> n, args -> k, @@ -274,11 +274,11 @@ static int blas_thread_server(void *arg){ #ifdef TIMING_DEBUG unsigned long start, stop; #endif - + #if defined(OS_LINUX) && !defined(NO_AFFINITY) if (!increased_threads) thread_status[cpu].node = gotoblas_set_affinity(cpu + 1); - else + else thread_status[cpu].node = gotoblas_set_affinity(-1); #endif @@ -291,7 +291,7 @@ static int blas_thread_server(void *arg){ #ifdef SMP_DEBUG fprintf(STDERR, "Server[%2ld] Thread has just been spawned!\n", cpu); #endif - + while (1){ #ifdef MONITOR @@ -303,34 +303,34 @@ static int blas_thread_server(void *arg){ #endif last_tick = (unsigned int)rpcc(); - + while (!thread_status[cpu].queue) { - + YIELDING; if ((unsigned int)rpcc() - last_tick > thread_timeout) { - + pthread_mutex_lock (&thread_status[cpu].lock); - + if (!thread_status[cpu].queue) { thread_status[cpu].status = THREAD_STATUS_SLEEP; while (thread_status[cpu].status == THREAD_STATUS_SLEEP) { - + #ifdef MONITOR main_status[cpu] = MAIN_SLEEPING; #endif - + pthread_cond_wait(&thread_status[cpu].wakeup, &thread_status[cpu].lock); } } - + pthread_mutex_unlock(&thread_status[cpu].lock); - + last_tick = (unsigned int)rpcc(); } - + } - + queue = thread_status[cpu].queue; if ((long)queue == -1) break; @@ -345,19 +345,19 @@ static int blas_thread_server(void *arg){ if (queue) { int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; - + thread_status[cpu].queue = (blas_queue_t *)1; sa = queue -> sa; sb = queue -> sb; - + #ifdef SMP_DEBUG if (queue -> args) { fprintf(STDERR, "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k); } #endif - + #ifdef CONSISTENT_FPCSR __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); @@ -366,21 +366,21 @@ static int blas_thread_server(void *arg){ #ifdef MONITOR main_status[cpu] = MAIN_RUNNING1; #endif - + if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); if (sb == NULL) { if (!(queue -> mode & BLAS_COMPLEX)){ #ifdef EXPRECISION if (queue -> mode & BLAS_XDOUBLE){ - sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble) + sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - } else + } else #endif if (queue -> mode & BLAS_DOUBLE){ sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - + } else { sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); @@ -402,7 +402,7 @@ static int blas_thread_server(void *arg){ } queue->sb=sb; } - + #ifdef MONITOR main_status[cpu] = MAIN_RUNNING2; #endif @@ -423,24 +423,24 @@ static int blas_thread_server(void *arg){ #ifdef MONITOR main_status[cpu] = MAIN_FINISH; #endif - + thread_status[cpu].queue = (blas_queue_t * volatile) ((long)thread_status[cpu].queue & 0); /* Need a trick */ WMB; } - + #ifdef MONITOR main_status[cpu] = MAIN_DONE; #endif #ifdef TIMING_DEBUG stop = rpcc(); - + fprintf(STDERR, "Thread[%ld] : %16lu %16lu (%8lu cycles)\n", cpu + 1, start, stop, stop - start); #endif - + } /* Shutdown procedure */ @@ -508,7 +508,7 @@ static int blas_monitor(void *arg){ } sleep(1); } - + return 0; } #endif @@ -522,13 +522,13 @@ int blas_thread_init(void){ #endif if (blas_server_avail) return 0; - + #ifdef NEED_STACKATTR pthread_attr_init(&attr); pthread_attr_setguardsize(&attr, 0x1000U); pthread_attr_setstacksize( &attr, 0x1000U); #endif - + LOCK_COMMAND(&server_lock); if (!blas_server_avail){ @@ -551,21 +551,21 @@ int blas_thread_init(void){ thread_timeout = (1 << thread_timeout); } } - + for(i = 0; i < blas_num_threads - 1; i++){ thread_status[i].queue = (blas_queue_t *)NULL; thread_status[i].status = THREAD_STATUS_WAKEUP; - + pthread_mutex_init(&thread_status[i].lock, NULL); pthread_cond_init (&thread_status[i].wakeup, NULL); - + #ifdef NEED_STACKATTR - ret=pthread_create(&blas_threads[i], &attr, + ret=pthread_create(&blas_threads[i], &attr, (void *)&blas_thread_server, (void *)i); #else - ret=pthread_create(&blas_threads[i], NULL, + ret=pthread_create(&blas_threads[i], NULL, (void *)&blas_thread_server, (void *)i); #endif if(ret!=0){ @@ -575,7 +575,7 @@ int blas_thread_init(void){ } #ifdef MONITOR - pthread_create(&monitor_thread, NULL, + pthread_create(&monitor_thread, NULL, (void *)&blas_monitor, (void *)NULL); #endif @@ -587,7 +587,7 @@ int blas_thread_init(void){ return 0; } -/* +/* User can call one of two routines. exec_blas_async ... immediately returns after jobs are queued. @@ -613,13 +613,13 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ #ifdef SMP_DEBUG int exec_count = 0; fprintf(STDERR, "Exec_blas_async is called. Position = %d\n", pos); -#endif - +#endif + blas_lock(&exec_queue_lock); while (queue) { queue -> position = pos; - + #ifdef CONSISTENT_FPCSR __asm__ __volatile__ ("fnstcw %0" : "=m" (queue -> x87_mode)); __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue -> sse_mode)); @@ -633,7 +633,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ do { while((thread_status[i].node != node || thread_status[i].queue) && (i < blas_num_threads - 1)) i ++; - + if (i < blas_num_threads - 1) break; i ++; @@ -657,40 +657,40 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ if (i >= blas_num_threads - 1) i = 0; } #endif - + queue -> assigned = i; WMB; thread_status[i].queue = queue; WMB; - + queue = queue -> next; pos ++; #ifdef SMP_DEBUG exec_count ++; #endif - + } blas_unlock(&exec_queue_lock); #ifdef SMP_DEBUG fprintf(STDERR, "Done(Number of threads = %2ld).\n", exec_count); -#endif - +#endif + while (current) { - + pos = current -> assigned; - + if ((BLASULONG)thread_status[pos].queue > 1) { - + if (thread_status[pos].status == THREAD_STATUS_SLEEP) { - + pthread_mutex_lock (&thread_status[pos].lock); - + #ifdef MONITOR num_suspend ++; #endif - + if (thread_status[pos].status == THREAD_STATUS_SLEEP) { thread_status[pos].status = THREAD_STATUS_WAKEUP; pthread_cond_signal(&thread_status[pos].wakeup); @@ -698,7 +698,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ pthread_mutex_unlock(&thread_status[pos].lock); } } - + current = current -> next; } @@ -708,11 +708,11 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ while ((num > 0) && queue) { - + while(thread_status[queue -> assigned].queue) { YIELDING; }; - + queue = queue -> next; num --; } @@ -720,7 +720,7 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ #ifdef SMP_DEBUG fprintf(STDERR, "Done.\n\n"); #endif - + return 0; } @@ -738,31 +738,31 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ #endif if ((num <= 0) || (queue == NULL)) return 0; - + #ifdef SMP_DEBUG fprintf(STDERR, "Exec_blas is called. Number of executing threads : %ld\n", num); -#endif +#endif #ifdef __ELF__ if (omp_in_parallel && (num > 1)) { if (omp_in_parallel() > 0) { - fprintf(stderr, + fprintf(stderr, "OpenBLAS Warning : Detect OpenMP Loop and this application may hang. " "Please rebuild the library with USE_OPENMP=1 option.\n"); } } #endif - + if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next); #ifdef TIMING_DEBUG start = rpcc(); - + fprintf(STDERR, "\n"); #endif - + routine = queue -> routine; - + if (queue -> mode & BLAS_LEGACY) { legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); } else @@ -772,19 +772,19 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ } else (routine)(queue -> args, queue -> range_m, queue -> range_n, queue -> sa, queue -> sb, 0); - + #ifdef TIMING_DEBUG stop = rpcc(); #endif - + if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next); - + #ifdef TIMING_DEBUG - fprintf(STDERR, "Thread[0] : %16lu %16lu (%8lu cycles)\n", + fprintf(STDERR, "Thread[0] : %16lu %16lu (%8lu cycles)\n", start, stop, stop - start); #endif - + return 0; } @@ -798,7 +798,7 @@ void goto_set_num_threads(int num_threads) { if (num_threads == 1) { if (blas_cpu_number == 1){ //OpenBLAS is already single thread. - return; + return; }else{ //From multi-threads to single thread //Restore the original affinity mask @@ -812,26 +812,26 @@ void goto_set_num_threads(int num_threads) { if (num_threads > blas_num_threads) { LOCK_COMMAND(&server_lock); - + increased_threads = 1; for(i = blas_num_threads - 1; i < num_threads - 1; i++){ - + thread_status[i].queue = (blas_queue_t *)NULL; thread_status[i].status = THREAD_STATUS_WAKEUP; - + pthread_mutex_init(&thread_status[i].lock, NULL); pthread_cond_init (&thread_status[i].wakeup, NULL); - + #ifdef NEED_STACKATTR - pthread_create(&blas_threads[i], &attr, + pthread_create(&blas_threads[i], &attr, (void *)&blas_thread_server, (void *)i); #else - pthread_create(&blas_threads[i], NULL, + pthread_create(&blas_threads[i], NULL, (void *)&blas_thread_server, (void *)i); #endif } - + blas_num_threads = num_threads; UNLOCK_COMMAND(&server_lock); @@ -846,7 +846,7 @@ void goto_set_num_threads(int num_threads) { blas_cpu_number = num_threads; -#if defined(ARCH_MIPS64) +#if defined(ARCH_MIPS64) //set parameters for different number of threads. blas_set_parameter(); #endif @@ -855,7 +855,7 @@ void goto_set_num_threads(int num_threads) { void openblas_set_num_threads(int num_threads) { goto_set_num_threads(num_threads); - + } /* Compatible function with pthread_create / join */ @@ -887,11 +887,11 @@ int gotoblas_pthread(int numthreads, void *function, void *args, int stride) { args += stride; } - + queue[numthreads - 1].next = NULL; - + exec_blas(numthreads, queue); - + return 0; } @@ -903,17 +903,17 @@ int BLASFUNC(blas_thread_shutdown)(void){ int i; if (!blas_server_avail) return 0; - + LOCK_COMMAND(&server_lock); for (i = 0; i < blas_num_threads - 1; i++) { blas_lock(&exec_queue_lock); - + thread_status[i].queue = (blas_queue_t *)-1; blas_unlock(&exec_queue_lock); - + pthread_mutex_lock (&thread_status[i].lock); thread_status[i].status = THREAD_STATUS_WAKEUP; @@ -931,16 +931,16 @@ int BLASFUNC(blas_thread_shutdown)(void){ for(i = 0; i < blas_num_threads - 1; i++){ pthread_mutex_destroy(&thread_status[i].lock); pthread_cond_destroy (&thread_status[i].wakeup); - } + } #ifdef NEED_STACKATTR pthread_attr_destory(&attr); #endif blas_server_avail = 0; - + UNLOCK_COMMAND(&server_lock); - + return 0; } diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 0a484f3e4..8d62a8125 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -79,7 +79,7 @@ void goto_set_num_threads(int num_threads) { blas_thread_buffer[i]=NULL; } } -#if defined(ARCH_MIPS64) +#if defined(ARCH_MIPS64) //set parameters for different number of threads. blas_set_parameter(); #endif @@ -128,8 +128,8 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ #ifdef EXPRECISION if (mode & BLAS_XDOUBLE){ /* REAL / Extended Double */ - void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, - xdouble *, BLASLONG, xdouble *, BLASLONG, + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *) = func; afunc(args -> m, args -> n, args -> k, @@ -137,14 +137,14 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); - } else + } else #endif if (mode & BLAS_DOUBLE){ /* REAL / Double */ - void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, - double *, BLASLONG, double *, BLASLONG, + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *) = func; - + afunc(args -> m, args -> n, args -> k, ((double *)args -> alpha)[0], args -> a, args -> lda, @@ -152,10 +152,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> c, args -> ldc, sb); } else { /* REAL / Single */ - void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, - float *, BLASLONG, float *, BLASLONG, + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *) = func; - + afunc(args -> m, args -> n, args -> k, ((float *)args -> alpha)[0], args -> a, args -> lda, @@ -167,7 +167,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ if (mode & BLAS_XDOUBLE){ /* COMPLEX / Extended Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, - xdouble *, BLASLONG, xdouble *, BLASLONG, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *) = func; afunc(args -> m, args -> n, args -> k, @@ -181,7 +181,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ if (mode & BLAS_DOUBLE){ /* COMPLEX / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, - double *, BLASLONG, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *) = func; afunc(args -> m, args -> n, args -> k, @@ -193,7 +193,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ } else { /* COMPLEX / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, - float *, BLASLONG, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *) = func; afunc(args -> m, args -> n, args -> k, @@ -210,7 +210,7 @@ static void exec_threads(blas_queue_t *queue){ void *buffer, *sa, *sb; int pos=0, release_flag=0; - + buffer = NULL; sa = queue -> sa; sb = queue -> sb; @@ -235,19 +235,19 @@ static void exec_threads(blas_queue_t *queue){ sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); queue->sa=sa; } - + if (sb == NULL) { if (!(queue -> mode & BLAS_COMPLEX)){ #ifdef EXPRECISION if (queue -> mode & BLAS_XDOUBLE){ - sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble) + sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - } else + } else #endif if (queue -> mode & BLAS_DOUBLE){ sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - + } else { sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 100ca34f7..081bdd7d4 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -71,8 +71,8 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ #ifdef EXPRECISION if (mode & BLAS_XDOUBLE){ /* REAL / Extended Double */ - void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, - xdouble *, BLASLONG, xdouble *, BLASLONG, + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *) = func; afunc(args -> m, args -> n, args -> k, @@ -80,14 +80,14 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); - } else + } else #endif if (mode & BLAS_DOUBLE){ /* REAL / Double */ - void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, - double *, BLASLONG, double *, BLASLONG, + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *) = func; - + afunc(args -> m, args -> n, args -> k, ((double *)args -> alpha)[0], args -> a, args -> lda, @@ -95,10 +95,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> c, args -> ldc, sb); } else { /* REAL / Single */ - void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, - float *, BLASLONG, float *, BLASLONG, + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *) = func; - + afunc(args -> m, args -> n, args -> k, ((float *)args -> alpha)[0], args -> a, args -> lda, @@ -110,7 +110,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ if (mode & BLAS_XDOUBLE){ /* COMPLEX / Extended Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, - xdouble *, BLASLONG, xdouble *, BLASLONG, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *) = func; afunc(args -> m, args -> n, args -> k, @@ -124,7 +124,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ if (mode & BLAS_DOUBLE){ /* COMPLEX / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, - double *, BLASLONG, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *) = func; afunc(args -> m, args -> n, args -> k, @@ -136,7 +136,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ } else { /* COMPLEX / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, - float *, BLASLONG, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *) = func; afunc(args -> m, args -> n, args -> k, @@ -163,47 +163,47 @@ static DWORD WINAPI blas_thread_server(void *arg){ blas_queue_t *queue; DWORD action; HANDLE handles[] = {pool.filled, pool.killed}; - + /* Each server needs each buffer */ buffer = blas_memory_alloc(2); - + #ifdef SMP_DEBUG fprintf(STDERR, "Server[%2ld] Thread is started!\n", cpu); #endif - + while (1){ - + /* Waiting for Queue */ - + #ifdef SMP_DEBUG fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu); #endif - + do { action = WaitForMultipleObjects(2, handles, FALSE, INFINITE); } while ((action != WAIT_OBJECT_0) && (action != WAIT_OBJECT_0 + 1)); - + if (action == WAIT_OBJECT_0 + 1) break; #ifdef SMP_DEBUG fprintf(STDERR, "Server[%2ld] Got it.\n", cpu); #endif - + EnterCriticalSection(&pool.lock); - + queue = pool.queue; if (queue) pool.queue = queue->next; - + LeaveCriticalSection(&pool.lock); - + if (queue) { int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; - + if (pool.queue) SetEvent(pool.filled); - + sa = queue -> sa; sb = queue -> sb; - + #ifdef CONSISTENT_FPCSR __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); @@ -213,27 +213,27 @@ static DWORD WINAPI blas_thread_server(void *arg){ fprintf(STDERR, "Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k); #endif - + // fprintf(stderr, "queue start[%ld]!!!\n", cpu); - + #ifdef MONITOR main_status[cpu] = MAIN_RUNNING1; #endif - + if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); - + if (sb == NULL) { if (!(queue -> mode & BLAS_COMPLEX)){ #ifdef EXPRECISION if (queue -> mode & BLAS_XDOUBLE){ - sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble) + sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - } else + } else #endif if (queue -> mode & BLAS_DOUBLE){ sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - + } else { sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); @@ -255,11 +255,11 @@ static DWORD WINAPI blas_thread_server(void *arg){ } queue->sb=sb; } - + #ifdef MONITOR main_status[cpu] = MAIN_RUNNING2; #endif - + if (!(queue -> mode & BLAS_LEGACY)) { (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); @@ -269,28 +269,28 @@ static DWORD WINAPI blas_thread_server(void *arg){ }else{ continue; //if queue == NULL } - + #ifdef SMP_DEBUG fprintf(STDERR, "Server[%2ld] Finished!\n", cpu); #endif - + EnterCriticalSection(&queue->lock); - + queue -> status = BLAS_STATUS_FINISHED; - + LeaveCriticalSection(&queue->lock); - + SetEvent(queue->finish); } - + /* Shutdown procedure */ - + #ifdef SMP_DEBUG fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu); #endif - + blas_memory_free(buffer); - + return 0; } @@ -299,11 +299,11 @@ int blas_thread_init(void){ BLASLONG i; if (blas_server_avail || (blas_cpu_number <= 1)) return 0; - + LOCK_COMMAND(&server_lock); #ifdef SMP_DEBUG - fprintf(STDERR, "Initializing Thread(Num. threads = %d)\n", + fprintf(STDERR, "Initializing Thread(Num. threads = %d)\n", blas_cpu_number); #endif @@ -317,11 +317,11 @@ int blas_thread_init(void){ pool.queue = NULL; for(i = 0; i < blas_cpu_number - 1; i++){ - blas_threads[i] = CreateThread(NULL, 0, + blas_threads[i] = CreateThread(NULL, 0, blas_thread_server, (void *)i, 0, &blas_threads_id[i]); } - + blas_server_avail = 1; } @@ -330,7 +330,7 @@ int blas_thread_init(void){ return 0; } -/* +/* User can call one of two routines. exec_blas_async ... immediately returns after jobs are queued. @@ -387,7 +387,7 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ #endif WaitForSingleObject(queue->finish, INFINITE); - + CloseHandle(queue->finish); DeleteCriticalSection(&queue -> lock); @@ -414,7 +414,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next); routine = queue -> routine; - + if (!(queue -> mode & BLAS_LEGACY)) { (routine)(queue -> args, queue -> range_m, queue -> range_n, queue -> sa, queue -> sb, 0); @@ -435,7 +435,7 @@ int BLASFUNC(blas_thread_shutdown)(void){ int i; if (!blas_server_avail) return 0; - + LOCK_COMMAND(&server_lock); if (blas_server_avail){ @@ -446,12 +446,12 @@ int BLASFUNC(blas_thread_shutdown)(void){ WaitForSingleObject(blas_threads[i], 5); //INFINITE); TerminateThread(blas_threads[i],0); } - + blas_server_avail = 0; } - + UNLOCK_COMMAND(&server_lock); - + return 0; } @@ -466,7 +466,7 @@ void goto_set_num_threads(int num_threads) if (num_threads > blas_num_threads) { LOCK_COMMAND(&server_lock); - + //increased_threads = 1; if (!blas_server_avail){ @@ -478,14 +478,14 @@ void goto_set_num_threads(int num_threads) pool.queue = NULL; blas_server_avail = 1; } - - for(i = blas_num_threads - 1; i < num_threads - 1; i++){ - - blas_threads[i] = CreateThread(NULL, 0, + + for(i = blas_num_threads - 1; i < num_threads - 1; i++){ + + blas_threads[i] = CreateThread(NULL, 0, blas_thread_server, (void *)i, 0, &blas_threads_id[i]); } - + blas_num_threads = num_threads; UNLOCK_COMMAND(&server_lock); diff --git a/driver/others/divtable.c b/driver/others/divtable.c index d50b091df..d801afb9b 100644 --- a/driver/others/divtable.c +++ b/driver/others/divtable.c @@ -41,23 +41,23 @@ #ifdef SMP #if !defined(USE64BITINT) || defined(ARCH_X86) unsigned int blas_quick_divide_table[] = { - 0x00000000, 0x00000001, 0x80000001, 0x55555556, - 0x40000001, 0x33333334, 0x2aaaaaab, 0x24924925, - 0x20000001, 0x1c71c71d, 0x1999999a, 0x1745d175, - 0x15555556, 0x13b13b14, 0x12492493, 0x11111112, - 0x10000001, 0x0f0f0f10, 0x0e38e38f, 0x0d79435f, - 0x0ccccccd, 0x0c30c30d, 0x0ba2e8bb, 0x0b21642d, - 0x0aaaaaab, 0x0a3d70a4, 0x09d89d8a, 0x097b425f, - 0x0924924a, 0x08d3dcb1, 0x08888889, 0x08421085, - 0x08000001, 0x07c1f07d, 0x07878788, 0x07507508, - 0x071c71c8, 0x06eb3e46, 0x06bca1b0, 0x06906907, - 0x06666667, 0x063e7064, 0x06186187, 0x05f417d1, - 0x05d1745e, 0x05b05b06, 0x0590b217, 0x0572620b, - 0x05555556, 0x0539782a, 0x051eb852, 0x05050506, - 0x04ec4ec5, 0x04d4873f, 0x04bda130, 0x04a7904b, - 0x04924925, 0x047dc120, 0x0469ee59, 0x0456c798, - 0x04444445, 0x04325c54, 0x04210843, 0x04104105, - 0x04000001, + 0x00000000, 0x00000001, 0x80000001, 0x55555556, + 0x40000001, 0x33333334, 0x2aaaaaab, 0x24924925, + 0x20000001, 0x1c71c71d, 0x1999999a, 0x1745d175, + 0x15555556, 0x13b13b14, 0x12492493, 0x11111112, + 0x10000001, 0x0f0f0f10, 0x0e38e38f, 0x0d79435f, + 0x0ccccccd, 0x0c30c30d, 0x0ba2e8bb, 0x0b21642d, + 0x0aaaaaab, 0x0a3d70a4, 0x09d89d8a, 0x097b425f, + 0x0924924a, 0x08d3dcb1, 0x08888889, 0x08421085, + 0x08000001, 0x07c1f07d, 0x07878788, 0x07507508, + 0x071c71c8, 0x06eb3e46, 0x06bca1b0, 0x06906907, + 0x06666667, 0x063e7064, 0x06186187, 0x05f417d1, + 0x05d1745e, 0x05b05b06, 0x0590b217, 0x0572620b, + 0x05555556, 0x0539782a, 0x051eb852, 0x05050506, + 0x04ec4ec5, 0x04d4873f, 0x04bda130, 0x04a7904b, + 0x04924925, 0x047dc120, 0x0469ee59, 0x0456c798, + 0x04444445, 0x04325c54, 0x04210843, 0x04104105, + 0x04000001, }; #else BLASULONG blas_quick_divide_table[] = { diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index ec421d6de..a044343e5 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -95,7 +95,7 @@ int support_avx(){ #ifndef NO_AVX int eax, ebx, ecx, edx; int ret=0; - + cpuid(1, &eax, &ebx, &ecx, &edx); if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){ xgetbv(0, &eax, &edx); @@ -179,7 +179,7 @@ static gotoblas_t *get_coretype(void){ // Pentium (Clarkdale) / Pentium Mobile (Arrandale) // Xeon (Clarkdale), 32nm if (model == 5) return &gotoblas_NEHALEM; - + //Intel Xeon Processor 5600 (Westmere-EP) //Xeon Processor E7 (Westmere-EX) //Xeon E7540 @@ -250,7 +250,7 @@ static gotoblas_t *get_coretype(void){ } if (family == 0xf){ if ((exfamily == 0) || (exfamily == 2)) { - if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3; + if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3; else return &gotoblas_OPTERON; } else if (exfamily == 5) { return &gotoblas_BOBCAT; @@ -285,7 +285,7 @@ static gotoblas_t *get_coretype(void){ break; } } - + return NULL; } @@ -326,7 +326,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_DUNNINGTON) return corename[ 9]; if (gotoblas == &gotoblas_NEHALEM) return corename[10]; if (gotoblas == &gotoblas_ATHLON) return corename[11]; - if (gotoblas == &gotoblas_OPTERON_SSE3) return corename[12]; + if (gotoblas == &gotoblas_OPTERON_SSE3) return corename[12]; if (gotoblas == &gotoblas_OPTERON) return corename[13]; if (gotoblas == &gotoblas_BARCELONA) return corename[14]; if (gotoblas == &gotoblas_NANO) return corename[15]; @@ -359,7 +359,7 @@ static gotoblas_t *force_coretype(char *coretype){ { strncpy(mname,coretype,20); sprintf(message, "Core not found: %s\n",mname); - openblas_warning(1, message); + openblas_warning(1, message); return(NULL); } @@ -390,16 +390,16 @@ static gotoblas_t *force_coretype(char *coretype){ return(NULL); } - - - + + + void gotoblas_dynamic_init(void) { - + char coremsg[128]; char coren[22]; char *p; - + if (gotoblas) return; @@ -412,7 +412,7 @@ void gotoblas_dynamic_init(void) { { gotoblas = get_coretype(); } - + #ifdef ARCH_X86 if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI; #else @@ -427,21 +427,21 @@ void gotoblas_dynamic_init(void) { gotoblas = &gotoblas_PRESCOTT; } #endif - + if (gotoblas && gotoblas -> init) { strncpy(coren,gotoblas_corename(),20); sprintf(coremsg, "Core: %s\n",coren); - openblas_warning(2, coremsg); + openblas_warning(2, coremsg); gotoblas -> init(); } else { openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); exit(1); } - + } void gotoblas_dynamic_quit(void) { - + gotoblas = NULL; } diff --git a/driver/others/init.c b/driver/others/init.c index cbcf229fa..9c7524909 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -13,19 +13,19 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ @@ -181,8 +181,8 @@ static inline int rcount(unsigned long number) { } /*** - Known issue: The number of CPUs/cores should less - than sizeof(unsigned long). On 64 bits, the limit + Known issue: The number of CPUs/cores should less + than sizeof(unsigned long). On 64 bits, the limit is 64. On 32 bits, it is 32. ***/ static inline void get_cpumap(int node, unsigned long * node_info) { @@ -197,14 +197,14 @@ static inline void get_cpumap(int node, unsigned long * node_info) { int k=0; sprintf(name, CPUMAP_NAME, node); - + infile = open(name, O_RDONLY); for(i=0; i<32; i++){ affinity[i] = 0; } if (infile != -1) { - + read(infile, cpumap, sizeof(cpumap)); for(i=0; i<160; i++){ @@ -212,7 +212,7 @@ static inline void get_cpumap(int node, unsigned long * node_info) { break; if(cpumap[i] != ','){ name[k++]=cpumap[i]; - + //Enough data for Hex if(k >= NCPUBITS/4){ affinity[count++] = strtoul(name, &dummy, 16); @@ -249,7 +249,7 @@ static inline void get_share(int cpu, int level, unsigned long * share) { int bitmask_idx = 0; sprintf(name, SHARE_NAME, cpu, level); - + infile = open(name, O_RDONLY); // Init share @@ -260,7 +260,7 @@ static inline void get_share(int cpu, int level, unsigned long * share) { share[bitmask_idx] = CPUMASK(cpu); if (infile != -1) { - + read(infile, cpumap, sizeof(cpumap)); for(i=0; i<160; i++){ @@ -268,8 +268,8 @@ static inline void get_share(int cpu, int level, unsigned long * share) { break; if(cpumap[i] != ','){ name[k++]=cpumap[i]; - - //Enough data + + //Enough data if(k >= NCPUBITS/4){ affinity[count++] = strtoul(name, &dummy, 16); k=0; @@ -287,8 +287,8 @@ static inline void get_share(int cpu, int level, unsigned long * share) { for(i=0; i cpu_info[cpu]); #endif @@ -406,7 +406,7 @@ static void numa_mapping(void) { #ifdef DEBUG fprintf(stderr, "\nSorting ...\n\n"); - for (cpu = 0; cpu < count; cpu++) + for (cpu = 0; cpu < count; cpu++) fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]); #endif @@ -453,12 +453,12 @@ static void disable_hyperthread(void) { share[i] &= common->avail[i]; if (popcount(share[i]) > 1) { - + #ifdef DEBUG fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n", cpu, share[i] & ~(CPUMASK(cpu))); #endif - + common -> avail[i] &= ~((share[i] & ~ CPUMASK(cpu))); } } @@ -514,7 +514,7 @@ static void setup_mempolicy(void) { for (cpu = 0; cpu < numprocs; cpu ++) { mynode = READ_NODE(common -> cpu_info[cpu_sub_mapping[cpu]]); - + lnodemask |= (1UL << mynode); node_cpu[mynode] ++; @@ -527,11 +527,11 @@ static void setup_mempolicy(void) { for (cpu = 0; cpu < MAX_NODES; cpu ++) if ((node_cpu[cpu] != 0) && (node_cpu[cpu] != maxcpu)) node_equal = 0; if (lnodemask) { - + #ifdef DEBUG fprintf(stderr, "Node mask = %lx\n", lnodemask); #endif - + my_set_mempolicy(MPOL_INTERLEAVE, &lnodemask, sizeof(lnodemask) * 8); numnodes = popcount(lnodemask); @@ -551,11 +551,11 @@ static void open_shmem(void) { do { shmid = shmget(SH_MAGIC, 4096, 0666); - + if (shmid == -1) { shmid = shmget(SH_MAGIC, 4096, IPC_CREAT | 0666); } - + try ++; } while ((try < 10) && (shmid == -1)); @@ -599,7 +599,7 @@ static void local_cpu_map(void) { if (id > 0) { if (is_dead(id)) common -> cpu_use[cpu] = 0; } - + bitmask_idx = CPUELT(cpu); if ((common -> cpu_use[cpu] == 0) && (lprocmask[bitmask_idx] & CPUMASK(cpu))) { @@ -611,9 +611,9 @@ static void local_cpu_map(void) { } cpu ++; - + } while ((mapping < numprocs) && (cpu < common -> final_num_procs)); - + disable_mapping = 0; if ((mapping < numprocs) || (numprocs == 1)) { @@ -622,7 +622,7 @@ static void local_cpu_map(void) { } disable_mapping = 1; } - + #ifdef DEBUG for (cpu = 0; cpu < numprocs; cpu ++) { fprintf(stderr, "Local Mapping : %2d --> %2d (%2d)\n", cpu, cpu_mapping[cpu], cpu_sub_mapping[cpu]); @@ -634,14 +634,14 @@ static void local_cpu_map(void) { int get_num_procs(void) { return numprocs; } int get_num_nodes(void) { return numnodes; } -int get_node_equal(void) { +int get_node_equal(void) { return (((blas_cpu_number % numnodes) == 0) && node_equal); - + } int gotoblas_set_affinity(int pos) { - + cpu_set_t cpu_mask; int mynode = 1; @@ -662,7 +662,7 @@ int gotoblas_set_affinity(int pos) { CPU_ZERO(&cpu_mask); CPU_SET (cpu_mapping[pos], &cpu_mask); - + sched_setaffinity(0, sizeof(cpu_mask), &cpu_mask); node_mapping[WhereAmI()] = mynode; @@ -672,7 +672,7 @@ int gotoblas_set_affinity(int pos) { return mynode; } -int get_node(void) { +int get_node(void) { if (!disable_mapping) return node_mapping[WhereAmI()]; @@ -694,7 +694,7 @@ void gotoblas_affinity_init(void) { initialized = 1; sched_getaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]); - + #ifdef USE_OPENMP numprocs = 0; #else @@ -746,9 +746,9 @@ void gotoblas_affinity_init(void) { } for (cpu = 0; cpu < common -> num_procs; cpu++) common -> cpu_info[cpu] = cpu; - + numa_check(); - + disable_hyperthread(); if (common -> num_nodes > 1) numa_mapping(); @@ -786,7 +786,7 @@ void gotoblas_affinity_init(void) { CPU_ZERO(&cpu_mask); CPU_SET (cpu_mapping[0], &cpu_mask); - + sched_setaffinity(0, sizeof(cpu_mask), &cpu_mask); node_mapping[WhereAmI()] = READ_NODE(common -> cpu_info[cpu_sub_mapping[0]]); @@ -817,13 +817,13 @@ void gotoblas_affinity_quit(void) { if ((numprocs == 1) || (initialized == 0)) return; if (!disable_mapping) { - + blas_lock(&common -> lock); - + for (i = 0; i < numprocs; i ++) common -> cpu_use[cpu_mapping[i]] = -1; - + blas_unlock(&common -> lock); - + } shmctl(shmid, IPC_STAT, &ds); diff --git a/driver/others/lamc3.c b/driver/others/lamc3.c index 439ef6e3d..acc4b505d 100644 --- a/driver/others/lamc3.c +++ b/driver/others/lamc3.c @@ -44,7 +44,7 @@ double FLOAT #endif NAME(FLOAT *a, FLOAT *b){ - + return *a + *b; } diff --git a/driver/others/lamch.c b/driver/others/lamch.c index b04450024..cdbc0eef5 100644 --- a/driver/others/lamch.c +++ b/driver/others/lamch.c @@ -152,7 +152,7 @@ double FLOAT #endif NAME(char *P){ - + char p = *P; int pos; FLOAT *hdata = (FLOAT *)idata; diff --git a/driver/others/memory.c b/driver/others/memory.c index 24a92034d..ba806b3a3 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -13,19 +13,19 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ @@ -136,8 +136,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) -#define CONSTRUCTOR __attribute__ ((constructor)) -#define DESTRUCTOR __attribute__ ((destructor)) +#define CONSTRUCTOR __attribute__ ((constructor)) +#define DESTRUCTOR __attribute__ ((destructor)) #ifdef DYNAMIC_ARCH gotoblas_t *gotoblas = NULL; @@ -171,32 +171,32 @@ int get_num_procs(void) { #ifdef OS_WINDOWS int get_num_procs(void) { - + static int nums = 0; if (nums == 0) { SYSTEM_INFO sysinfo; - + GetSystemInfo(&sysinfo); nums = sysinfo.dwNumberOfProcessors; } - + return nums; } #endif -#if defined(OS_FREEBSD) +#if defined(OS_FREEBSD) int get_num_procs(void) { - + static int nums = 0; int m[2]; size_t len; - + if (nums == 0) { m[0] = CTL_HW; m[1] = HW_NCPU; @@ -232,7 +232,7 @@ void set_stack_limit(int limitMB){ rl.rlim_cur=StackSize; result=setrlimit(RLIMIT_STACK, &rl); if(result !=0){ - fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result); + fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result); } } } @@ -241,12 +241,12 @@ void set_stack_limit(int limitMB){ #endif /* -OpenBLAS uses the numbers of CPU cores in multithreading. +OpenBLAS uses the numbers of CPU cores in multithreading. It can be set by openblas_set_num_threads(int num_threads); */ int blas_cpu_number = 0; /* -The numbers of threads in the thread pool. +The numbers of threads in the thread pool. This value is equal or large than blas_cpu_number. This means some threads are sleep. */ int blas_num_threads = 0; @@ -297,7 +297,7 @@ int blas_get_cpu_number(void){ if (p) blas_goto_num = atoi(p); if (blas_goto_num < 0) blas_goto_num = 0; } - + #endif blas_omp_num = 0; @@ -318,8 +318,8 @@ int blas_get_cpu_number(void){ #ifdef DEBUG printf( "Adjusted number of threads : %3d\n", blas_num_threads); #endif - - blas_cpu_number = blas_num_threads; + + blas_cpu_number = blas_num_threads; return blas_num_threads; } @@ -355,12 +355,12 @@ static void *alloc_mmap(void *address){ void *map_address; if (address){ - map_address = mmap(address, - BUFFER_SIZE, + map_address = mmap(address, + BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); } else { - map_address = mmap(address, - BUFFER_SIZE, + map_address = mmap(address, + BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0); } @@ -387,7 +387,7 @@ static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) { BLASULONG original, *p; BLASULONG start, stop, min; int iter, i, count; - + min = (BLASULONG)-1; original = *(BLASULONG *)(address + size - PAGESIZE); @@ -397,20 +397,20 @@ static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) { for (iter = 0; iter < BENCH_ITERATION; iter ++ ) { p = (BLASULONG *)address; - + count = size / PAGESIZE; - + start = rpcc(); - + for (i = 0; i < count; i ++) { p = (BLASULONG *)(*p); } - + stop = rpcc(); - + if (min > stop - start) min = stop - start; } - + *(BLASULONG *)(address + size - PAGESIZE + 0) = original; *(BLASULONG *)(address + size - PAGESIZE + 8) = (BLASULONG)p; @@ -442,11 +442,11 @@ static void *alloc_mmap(void *address){ } else { #endif - map_address = mmap(NULL, BUFFER_SIZE * SCALING, + map_address = mmap(NULL, BUFFER_SIZE * SCALING, MMAP_ACCESS, MMAP_POLICY, -1, 0); - + if (map_address != (void *)-1) { - + #ifdef OS_LINUX #ifdef DEBUG int ret=0; @@ -462,45 +462,45 @@ static void *alloc_mmap(void *address){ #endif #endif - + allocsize = DGEMM_P * DGEMM_Q * sizeof(double); - + start = (BLASULONG)map_address; current = (SCALING - 1) * BUFFER_SIZE; - + while(current > 0) { *(BLASLONG *)start = (BLASLONG)start + PAGESIZE; start += PAGESIZE; current -= PAGESIZE; } - + *(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address; - + start = (BLASULONG)map_address; - + best = (BLASULONG)-1; best_address = map_address; - + while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) { - + current = run_bench(start, allocsize); - + if (best > current) { best = current; best_address = (void *)start; } - + start += PAGESIZE; - + } - + if ((BLASULONG)best_address > (BLASULONG)map_address) munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address); - + munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address); - + map_address = best_address; - + #if defined(OS_LINUX) && !defined(NO_WARMUP) hot_alloc = 2; #endif @@ -632,7 +632,7 @@ static void alloc_devicedirver_free(struct release_t *release){ } static void *alloc_devicedirver(void *address){ - + int fd; void *map_address; @@ -646,7 +646,7 @@ static void *alloc_devicedirver(void *address){ PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd, 0); - + if (map_address != (void *)-1) { release_info[release_pos].address = map_address; release_info[release_pos].attr = fd; @@ -671,9 +671,9 @@ static void alloc_shm_free(struct release_t *release){ static void *alloc_shm(void *address){ void *map_address; int shmid; - + shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600); - + map_address = (void *)shmat(shmid, address, 0); if (map_address != (void *)-1){ @@ -725,7 +725,7 @@ static void *alloc_hugetlb(void *address){ #if defined(OS_LINUX) || defined(OS_AIX) int shmid; - + shmid = shmget(IPC_PRIVATE, BUFFER_SIZE, #ifdef OS_LINUX SHM_HUGETLB | @@ -734,10 +734,10 @@ static void *alloc_hugetlb(void *address){ SHM_LGPAGE | SHM_PIN | #endif IPC_CREAT | SHM_R | SHM_W); - + if (shmid != -1) { map_address = (void *)shmat(shmid, address, SHM_RND); - + #ifdef OS_LINUX my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); #endif @@ -750,7 +750,7 @@ static void *alloc_hugetlb(void *address){ #ifdef __sun__ struct memcntl_mha mha; - + mha.mha_cmd = MHA_MAPSIZE_BSSBRK; mha.mha_flags = 0; mha.mha_pagesize = HUGE_PAGESIZE; @@ -768,7 +768,7 @@ static void *alloc_hugetlb(void *address){ tp.PrivilegeCount = 1; tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; - + if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) return (void *) -1; if (AdjustTokenPrivileges(hToken, FALSE, (PTOKEN_PRIVILEGES)&tp, 0, NULL, NULL) != TRUE) return (void *) -1; @@ -781,7 +781,7 @@ static void *alloc_hugetlb(void *address){ AdjustTokenPrivileges(hToken, TRUE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, NULL); if (map_address == (void *)NULL) map_address = (void *)-1; - + #endif if (map_address != (void *)-1){ @@ -829,7 +829,7 @@ static void *alloc_hugetlbfile(void *address){ PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - + if (map_address != (void *)-1) { release_info[release_pos].address = map_address; release_info[release_pos].attr = fd; @@ -882,7 +882,7 @@ static void gotoblas_memory_init(void); /* 2 : Thread */ void *blas_memory_alloc(int procpos){ - + int position; #if defined(WHEREAMI) && !defined(USE_OPENMP) int mypos; @@ -917,11 +917,11 @@ void *blas_memory_alloc(int procpos){ void *(**func)(void *address); if (!memory_initialized) { - + LOCK_COMMAND(&alloc_lock); - + if (!memory_initialized) { - + #if defined(WHEREAMI) && !defined(USE_OPENMP) for (position = 0; position < NUM_BUFFERS; position ++){ memory[position].addr = (void *)0; @@ -930,7 +930,7 @@ void *blas_memory_alloc(int procpos){ memory[position].lock = 0; } #endif - + #ifdef DYNAMIC_ARCH gotoblas_dynamic_init(); #endif @@ -938,11 +938,11 @@ void *blas_memory_alloc(int procpos){ #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY) gotoblas_affinity_init(); #endif - + #ifdef SMP if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); #endif - + #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) #ifndef DYNAMIC_ARCH blas_set_parameter(); @@ -968,16 +968,16 @@ void *blas_memory_alloc(int procpos){ do { if (!memory[position].used && (memory[position].pos == mypos)) { - + blas_lock(&memory[position].lock); - + if (!memory[position].used) goto allocation; - + blas_unlock(&memory[position].lock); } - + position ++; - + } while (position < NUM_BUFFERS); @@ -987,18 +987,18 @@ void *blas_memory_alloc(int procpos){ do { if (!memory[position].used) { - + blas_lock(&memory[position].lock); if (!memory[position].used) goto allocation; - + blas_unlock(&memory[position].lock); } - + position ++; - + } while (position < NUM_BUFFERS); - + goto error; allocation : @@ -1055,13 +1055,13 @@ void *blas_memory_alloc(int procpos){ } while ((BLASLONG)map_address == -1); - memory[position].addr = map_address; + memory[position].addr = map_address; #ifdef DEBUG printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position); #endif } - + #if defined(WHEREAMI) && !defined(USE_OPENMP) if (memory[position].pos == -1) memory[position].pos = mypos; @@ -1071,18 +1071,18 @@ void *blas_memory_alloc(int procpos){ #ifdef DYNAMIC_ARCH if (memory_initialized == 1) { - + LOCK_COMMAND(&alloc_lock); - + if (memory_initialized == 1) { - + if (!gotoblas) gotoblas_dynamic_init(); - + memory_initialized = 2; } - + UNLOCK_COMMAND(&alloc_lock); - + } #endif @@ -1090,8 +1090,8 @@ void *blas_memory_alloc(int procpos){ #ifdef DEBUG printf("Mapped : %p %3d\n\n", (void *)memory[position].addr, position); -#endif - +#endif + return (void *)memory[position].addr; error: @@ -1106,8 +1106,8 @@ void blas_memory_free(void *free_area){ #ifdef DEBUG printf("Unmapped Start : %p ...\n", free_area); -#endif - +#endif + position = 0; while ((memory[position].addr != free_area) @@ -1117,21 +1117,21 @@ void blas_memory_free(void *free_area){ #ifdef DEBUG printf(" Position : %d\n", position); -#endif +#endif memory[position].used = 0; #ifdef DEBUG printf("Unmap Succeeded.\n\n"); -#endif +#endif return; - + error: printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area); #ifdef DEBUG - for (position = 0; position < NUM_BUFFERS; position++) + for (position = 0; position < NUM_BUFFERS; position++) printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); #endif @@ -1151,7 +1151,7 @@ void blas_shutdown(void){ for (pos = 0; pos < release_pos; pos ++) { release_info[pos].func(&release_info[pos]); } - + #ifdef SEEK_ADDRESS base_address = 0UL; #else @@ -1173,7 +1173,7 @@ void blas_shutdown(void){ } #if defined(OS_LINUX) && !defined(NO_WARMUP) - + #ifdef SMP #if defined(USE_PTHREAD_LOCK) static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER; @@ -1184,7 +1184,7 @@ static BLASULONG init_lock = 0UL; #endif #endif -static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, +static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, void *sa, void *sb, BLASLONG pos) { #if !defined(ARCH_POWER) && !defined(ARCH_SPARC) @@ -1247,7 +1247,7 @@ static void _init_thread_memory(void *buffer) { queue[num_cpu - 1].next = NULL; queue[0].sa = buffer; - + exec_blas(num_cpu, queue); } @@ -1266,15 +1266,15 @@ static void gotoblas_memory_init(void) { #ifdef SMP_SERVER if (blas_server_avail == 0) blas_thread_init(); #endif - + _init_thread_memory((void *)((BLASULONG)buffer + GEMM_OFFSET_A)); - + #else - + _touch_memory(NULL, NULL, NULL, (void *)((BLASULONG)buffer + GEMM_OFFSET_A), NULL, 0); - + #endif - + blas_memory_free(buffer); } #endif diff --git a/driver/others/memory_qalloc.c b/driver/others/memory_qalloc.c index 10b35aa31..17b7f5d60 100644 --- a/driver/others/memory_qalloc.c +++ b/driver/others/memory_qalloc.c @@ -58,12 +58,12 @@ void *sb = NULL; static double static_buffer[BUFFER_SIZE/sizeof(double)]; void *blas_memory_alloc(int numproc){ - + if (sa == NULL){ #if 1 - sa = (void *)qalloc(QFAST, BUFFER_SIZE); + sa = (void *)qalloc(QFAST, BUFFER_SIZE); #else - sa = (void *)malloc(BUFFER_SIZE); + sa = (void *)malloc(BUFFER_SIZE); #endif sb = (void *)&static_buffer[0]; } diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c index 581ab1a43..d8da2e398 100644 --- a/driver/others/openblas_get_config.c +++ b/driver/others/openblas_get_config.c @@ -13,19 +13,19 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ diff --git a/driver/others/openblas_get_parallel.c b/driver/others/openblas_get_parallel.c index 68fe57449..ea2e4d986 100644 --- a/driver/others/openblas_get_parallel.c +++ b/driver/others/openblas_get_parallel.c @@ -13,19 +13,19 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ @@ -33,12 +33,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(USE_OPENMP) -static int parallel = 2 ; -#elif defined(SMP_SERVER) -static int parallel = 1; -#else -static int parallel = 0; -#endif +static int parallel = 2 ; +#elif defined(SMP_SERVER) +static int parallel = 1; +#else +static int parallel = 0; +#endif int CNAME() { return parallel; diff --git a/driver/others/openblas_set_num_threads.c b/driver/others/openblas_set_num_threads.c index 5e24cfcc7..ea0c70a91 100644 --- a/driver/others/openblas_set_num_threads.c +++ b/driver/others/openblas_set_num_threads.c @@ -13,19 +13,19 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 58e5fb11d..0d8d1e11a 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -177,26 +177,26 @@ int get_L2_size(void){ int i; cpuid(2, &eax, &ebx, &ecx, &edx); - + info[ 0] = BITMASK(eax, 8, 0xff); info[ 1] = BITMASK(eax, 16, 0xff); info[ 2] = BITMASK(eax, 24, 0xff); - + info[ 3] = BITMASK(ebx, 0, 0xff); info[ 4] = BITMASK(ebx, 8, 0xff); info[ 5] = BITMASK(ebx, 16, 0xff); info[ 6] = BITMASK(ebx, 24, 0xff); - + info[ 7] = BITMASK(ecx, 0, 0xff); info[ 8] = BITMASK(ecx, 8, 0xff); info[ 9] = BITMASK(ecx, 16, 0xff); info[10] = BITMASK(ecx, 24, 0xff); - + info[11] = BITMASK(edx, 0, 0xff); info[12] = BITMASK(edx, 8, 0xff); info[13] = BITMASK(edx, 16, 0xff); info[14] = BITMASK(edx, 24, 0xff); - + for (i = 0; i < 15; i++){ switch (info[i]){ @@ -284,7 +284,7 @@ void blas_set_parameter(void){ #endif #endif -#if defined(CORE_NORTHWOOD) +#if defined(CORE_NORTHWOOD) size >>= 7; #ifdef ALLOC_HUGETLB @@ -414,7 +414,7 @@ void blas_set_parameter(void){ #endif #endif -#if defined(CORE_OPTERON) +#if defined(CORE_OPTERON) sgemm_p = 224 + 14 * (size >> 5); dgemm_p = 112 + 14 * (size >> 6); cgemm_p = 116 + 14 * (size >> 6); @@ -469,7 +469,7 @@ void blas_set_parameter(void){ factor = atoi(p); if (factor < 10) factor = 10; if (factor > 200) factor = 200; - + sgemm_p = ((long)((double)sgemm_p * (double)factor * 1.e-2)) & ~7L; dgemm_p = ((long)((double)dgemm_p * (double)factor * 1.e-2)) & ~7L; cgemm_p = ((long)((double)cgemm_p * (double)factor * 1.e-2)) & ~7L; @@ -479,7 +479,7 @@ void blas_set_parameter(void){ xgemm_p = ((long)((double)xgemm_p * (double)factor * 1.e-2)) & ~7L; #endif } - + if (sgemm_p == 0) sgemm_p = 64; if (dgemm_p == 0) dgemm_p = 64; if (cgemm_p == 0) cgemm_p = 64; @@ -572,7 +572,7 @@ int get_current_cpu_info(void){ #if defined(ARCH_IA64) -static inline BLASULONG cpuid(BLASULONG regnum){ +static inline BLASULONG cpuid(BLASULONG regnum){ BLASULONG value; #ifndef __ECC @@ -587,11 +587,11 @@ static inline BLASULONG cpuid(BLASULONG regnum){ #if 1 void blas_set_parameter(void){ - + BLASULONG cpuid3, size; cpuid3 = cpuid(3); - + size = BITMASK(cpuid3, 16, 0xff); sgemm_p = 192 * (size + 1); @@ -625,7 +625,7 @@ void blas_set_parameter(void){ #define IA64_PROC_NAME "/proc/pal/cpu0/cache_info" void blas_set_parameter(void){ - + BLASULONG cpuid3; int size = 0; @@ -643,17 +643,17 @@ void blas_set_parameter(void){ if (size <= 0) { if ((infile = fopen(IA64_PROC_NAME, "r")) != NULL) { - + while(fgets(buffer, sizeof(buffer), infile) != NULL) { if ((!strncmp("Data/Instruction Cache level 3", buffer, 30))) break; } - + fgets(buffer, sizeof(buffer), infile); - + fclose(infile); - + *strstr(buffer, "bytes") = (char)NULL; - + size = atoi(strchr(buffer, ':') + 1) / 1572864; } } @@ -663,7 +663,7 @@ void blas_set_parameter(void){ if (size <= 0) { cpuid3 = cpuid(3); - + size = BITMASK(cpuid3, 16, 0xff) + 1; } @@ -692,7 +692,7 @@ void blas_set_parameter(void){ #endif -#if defined(ARCH_MIPS64) +#if defined(ARCH_MIPS64) void blas_set_parameter(void){ #if defined(LOONGSON3A) #ifdef SMP @@ -720,7 +720,7 @@ void blas_set_parameter(void){ dgemm_r = 160; } #endif -#endif +#endif } #endif diff --git a/driver/others/profile.c b/driver/others/profile.c index f464c0b6a..9fca09f06 100644 --- a/driver/others/profile.c +++ b/driver/others/profile.c @@ -75,13 +75,13 @@ void gotoblas_profile_quit(void) { fprintf(stderr, "\n\t====== BLAS Profiling Result =======\n\n"); fprintf(stderr, " Function No. of Calls Time Consumption Efficiency Bytes/cycle Wall Time(Cycles)\n"); - + for (i = 0; i < MAX_PROF_TABLE; i ++) { if (function_profile_table[i].calls) { #ifndef OS_WINDOWS - fprintf(stderr, "%-12s : %10Ld %8.2f%% %10.3f%% %8.2f %Ld\n", + fprintf(stderr, "%-12s : %10Ld %8.2f%% %10.3f%% %8.2f %Ld\n", #else - fprintf(stderr, "%-12s : %10lld %8.2f%% %10.3f%% %8.2f %lld\n", + fprintf(stderr, "%-12s : %10lld %8.2f%% %10.3f%% %8.2f %lld\n", #endif func_table[i], function_profile_table[i].calls, @@ -94,11 +94,11 @@ void gotoblas_profile_quit(void) { } fprintf(stderr, " --------------------------------------------------------------------\n"); - + #ifndef OS_WINDOWS - fprintf(stderr, "%-12s : %10Ld %10.3f%% %8.2f\n", + fprintf(stderr, "%-12s : %10Ld %10.3f%% %8.2f\n", #else - fprintf(stderr, "%-12s : %10lld %10.3f%% %8.2f\n", + fprintf(stderr, "%-12s : %10lld %10.3f%% %8.2f\n", #endif "Total", calls, diff --git a/driver/others/xerbla.c b/driver/others/xerbla.c index 6f5170ef1..7427b51c4 100644 --- a/driver/others/xerbla.c +++ b/driver/others/xerbla.c @@ -48,7 +48,7 @@ #ifdef __ELF__ int __xerbla(char *message, blasint *info, blasint length){ - + printf(" ** On entry to %6s parameter number %2d had an illegal value\n", message, *info); @@ -60,7 +60,7 @@ int BLASFUNC(xerbla)(char *, blasint *, blasint) __attribute__ ((weak, alias ("_ #else int BLASFUNC(xerbla)(char *message, blasint *info, blasint length){ - + printf(" ** On entry to %6s parameter number %2d had an illegal value\n", message, *info); diff --git a/exports/Makefile b/exports/Makefile index 70fde4f45..d563cd30d 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -163,10 +163,10 @@ goto64.$(SUFFIX) : ../$(LIBNAME) aix.def ld -b64 -o $(@F) ../$(LIBNAME) -bE:aix.def -bM:SRE -bnoexpall -bnoentry -L$(HOME)/misc/lib/ppc64 -lxlf90 -lc -lm -lpthread else goto32.$(SUFFIX) : ../$(LIBNAME) aix.def - ld -o $(@F) ../$(LIBNAME) -bE:aix.def -bM:SRE -bnoexpall -bnoentry -L$(HOME)/misc/lib -lg2c -lc -lm + ld -o $(@F) ../$(LIBNAME) -bE:aix.def -bM:SRE -bnoexpall -bnoentry -L$(HOME)/misc/lib -lg2c -lc -lm goto64.$(SUFFIX) : ../$(LIBNAME) aix.def - ld -b64 -o $(@F) ../$(LIBNAME) -bE:aix.def -bM:SRE -bnoexpall -bnoentry -L$(HOME)/misc/lib/ppc64 -lg2c -lc -lm + ld -b64 -o $(@F) ../$(LIBNAME) -bE:aix.def -bM:SRE -bnoexpall -bnoentry -L$(HOME)/misc/lib/ppc64 -lg2c -lc -lm endif endif diff --git a/exports/dllinit.c b/exports/dllinit.c index 0f25824f1..02ff092e9 100644 --- a/exports/dllinit.c +++ b/exports/dllinit.c @@ -50,6 +50,6 @@ BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) { if (reason == DLL_PROCESS_DETACH) { gotoblas_quit(); } - + return TRUE; } diff --git a/exports/gensymbol b/exports/gensymbol index 45566352f..6c21de455 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -92,17 +92,17 @@ @lapackobjs = ( # These routines are provided by OpenBLAS. - sgesv, dgesv, cgesv, zgesv, - sgetf2, dgetf2, cgetf2, zgetf2, - sgetrf, dgetrf, cgetrf, zgetrf, - slaswp, dlaswp, claswp, zlaswp, - sgetrs, dgetrs, cgetrs, zgetrs, - slauu2, dlauu2, clauu2, zlauu2, - slauum, dlauum, clauum, zlauum, - spotf2, dpotf2, cpotf2, zpotf2, - spotrf, dpotrf, cpotrf, zpotrf, - strti2, dtrti2, ctrti2, ztrti2, - strtri, dtrtri, ctrtri, ztrtri, + sgesv, dgesv, cgesv, zgesv, + sgetf2, dgetf2, cgetf2, zgetf2, + sgetrf, dgetrf, cgetrf, zgetrf, + slaswp, dlaswp, claswp, zlaswp, + sgetrs, dgetrs, cgetrs, zgetrs, + slauu2, dlauu2, clauu2, zlauu2, + slauum, dlauum, clauum, zlauum, + spotf2, dpotf2, cpotf2, zpotf2, + spotrf, dpotrf, cpotrf, zpotrf, + strti2, dtrti2, ctrti2, ztrti2, + strtri, dtrtri, ctrtri, ztrtri, spotri, dpotri, cpotri, zpotri, ); @@ -119,7 +119,7 @@ # ALLAUX -- Auxiliary routines called from all precisions # already provided by @blasobjs: xerbla, lsame ilaenv, ieeeck, lsamen, iparmq, - ilaprec, ilatrans, ilauplo, iladiag, + ilaprec, ilatrans, ilauplo, iladiag, ilaver, slamch, slamc3, # SCLAUX -- Auxiliary routines called from both REAL and COMPLEX. @@ -185,7 +185,7 @@ slaqtr, slar1v, slar2v, ilaslr, ilaslc, slarf, slarfb, slarfg, slarfgp, slarft, slarfx, slargv, slarrv, slartv, - slarz, slarzb, slarzt, slasy2, slasyf, + slarz, slarzb, slarzt, slasy2, slasyf, slatbs, slatdf, slatps, slatrd, slatrs, slatrz, slatzm, sopgtr, sopmtr, sorg2l, sorg2r, sorgbr, sorghr, sorgl2, sorglq, sorgql, sorgqr, sorgr2, @@ -366,8 +366,8 @@ dtfttr, dtpttf, dtpttr, dtrttf, dtrttp, dgejsv, dgesvj, dgsvj0, dgsvj1, dgeequb, dsyequb, dpoequb, dgbequb, - dbbcsd, dlapmr, dorbdb, dorbdb1, dorbdb2, dorbdb3, dorbdb4, - dorbdb5, dorbdb6, dorcsd, dorcsd2by1, + dbbcsd, dlapmr, dorbdb, dorbdb1, dorbdb2, dorbdb3, dorbdb4, + dorbdb5, dorbdb6, dorcsd, dorcsd2by1, dgeqrt, dgeqrt2, dgeqrt3, dgemqrt, dtpqrt, dtpqrt2, dtpmqrt, dtprfb, @@ -440,8 +440,8 @@ zhfrk, ztfttp, zlanhf, zpftrf, zpftri, zpftrs, ztfsm, ztftri, ztfttr, ztpttf, ztpttr, ztrttf, ztrttp, zgeequb, zgbequb, zsyequb, zpoequb, zheequb, - zbbcsd, zlapmr, zunbdb, zunbdb1, zunbdb2, zunbdb3, zunbdb4, - zunbdb5, zunbdb6, zuncsd, zuncsd2by1, + zbbcsd, zlapmr, zunbdb, zunbdb1, zunbdb2, zunbdb3, zunbdb4, + zunbdb5, zunbdb6, zuncsd, zuncsd2by1, zgeqrt, zgeqrt2, zgeqrt3, zgemqrt, ztpqrt, ztpqrt2, ztpmqrt, ztprfb, ); @@ -2705,20 +2705,20 @@ @lapack_embeded_underscore_objs=(xerbla_array, chla_transtype, slasyf_rook, ssytf2_rook, ssytrf_rook, ssytrs_rook, ssytri_rook, ssycon_rook, ssysv_rook, - chetf2_rook, chetrf_rook, chetri_rook, + chetf2_rook, chetrf_rook, chetri_rook, chetrs_rook, checon_rook, chesv_rook, - clahef_rook, clasyf_rook, - csytf2_rook, csytrf_rook, csytrs_rook, - csytri_rook, csycon_rook, csysv_rook, - dlasyf_rook, - dsytf2_rook, dsytrf_rook, dsytrs_rook, - dsytri_rook, dsycon_rook, dsysv_rook, - zhetf2_rook, zhetrf_rook, zhetri_rook, + clahef_rook, clasyf_rook, + csytf2_rook, csytrf_rook, csytrs_rook, + csytri_rook, csycon_rook, csysv_rook, + dlasyf_rook, + dsytf2_rook, dsytrf_rook, dsytrs_rook, + dsytri_rook, dsycon_rook, dsysv_rook, + zhetf2_rook, zhetrf_rook, zhetri_rook, zhetrs_rook, zhecon_rook, zhesv_rook, zlahef_rook, zlasyf_rook, zsytf2_rook, zsytrf_rook, zsytrs_rook, zsytri_rook, zsycon_rook, zsysv_rook, - + ); @@ -2775,7 +2775,7 @@ if ($ARGV[6] == 1) { } @hplobjs = (daxpy, dcopy, dscal, idamax, dgemv, dtrsv, dger, dgemm, dtrsm); -@hplobjs2 = (HPL_dlaswp00N, HPL_dlaswp01N, HPL_dlaswp01T); +@hplobjs2 = (HPL_dlaswp00N, HPL_dlaswp01N, HPL_dlaswp01T); $bu = $ARGV[2]; @@ -2800,7 +2800,7 @@ if ($ARGV[0] eq "osx"){ } # } exit(0); -} +} if ($ARGV[0] eq "aix"){ @@ -2821,11 +2821,11 @@ if ($ARGV[0] eq "aix"){ } # } exit(0); -} +} if ($ARGV[0] eq "win2k"){ print "EXPORTS\n"; - $count = 1; + $count = 1; @no_underscore_objs = (@no_underscore_objs, @misc_common_objs); @@ -2850,7 +2850,7 @@ if ($ARGV[0] eq "win2k"){ print "\t$uppercase=$objs", "__ \@", $count, "\n"; $count ++; } - + #for misc_common_objs foreach $objs (@misc_common_objs) { @@ -2861,19 +2861,19 @@ if ($ARGV[0] eq "win2k"){ print "\t$uppercase=$objs", "_ \@", $count, "\n"; $count ++; } - - + + foreach $objs (@no_underscore_objs) { print "\t",$objs,"=$objs"," \@", $count, "\n"; $count ++; } - + exit(0); } if ($ARGV[0] eq "win2khpl"){ print "EXPORTS\n"; - $count = 1; + $count = 1; foreach $objs (@hplobjs) { $uppercase = $objs; $uppercase =~ tr/[a-z]/[A-Z]/; @@ -2898,7 +2898,7 @@ if ($ARGV[0] eq "microsoft"){ @underscore_objs = (@underscore_objs, @misc_common_objs); print "EXPORTS\n"; - $count = 1; + $count = 1; foreach $objs (@underscore_objs) { $uppercase = $objs; $uppercase =~ tr/[a-z]/[A-Z]/; diff --git a/f_check b/f_check index 86f1fa689..90ae2fe97 100644 --- a/f_check +++ b/f_check @@ -26,12 +26,12 @@ if ($compiler eq "") { @lists = ("g77", "g95", "gfortran", "frt", "fort", "openf90", "openf95", "sunf77", "sunf90", "sunf95", - "xlf95", "xlf90", "xlf", - "ppuf77", "ppuf95", "ppuf90", "ppuxlf", - "pathf90", "pathf95", + "xlf95", "xlf90", "xlf", + "ppuf77", "ppuf95", "ppuf90", "ppuxlf", + "pathf90", "pathf95", "pgf95", "pgf90", "pgf77", "ifort"); - + OUTER: foreach $lists (@lists) { foreach $path (@path) { @@ -129,43 +129,43 @@ if ($compiler eq "") { $bu = "_"; $openmp = ""; } - + if ($compiler =~ /g95/) { $vendor = G95; $bu = "_"; $openmp = ""; } - + if ($compiler =~ /gfortran/) { $vendor = GFORTRAN; $bu = "_"; $openmp = "-fopenmp"; } - + if ($compiler =~ /ifort/) { $vendor = INTEL; $bu = "_"; $openmp = "-openmp"; } - + if ($compiler =~ /pathf/) { $vendor = PATHSCALE; $bu = "_"; $openmp = "-mp"; } - + if ($compiler =~ /pgf/) { $vendor = PGI; $bu = "_"; $openmp = "-mp"; } - + if ($compiler =~ /ftn/) { $vendor = PGI; $bu = "_"; $openmp = "-openmp"; } - + if ($compiler =~ /frt/) { $vendor = FUJITSU; $bu = "_"; @@ -177,12 +177,12 @@ if ($compiler eq "") { $bu = "_"; $openmp = "-xopenmp=parallel"; } - + if ($compiler =~ /ppuf/) { $vendor = IBM; $openmp = "-openmp"; } - + if ($compiler =~ /xlf/) { $vendor = IBM; $openmp = "-openmp"; @@ -209,9 +209,9 @@ $data = `which $compiler > /dev/null 2> /dev/null`; if (!$?) { $binary = $ENV{"BINARY"}; - + $openmp = "" if $ENV{USE_OPENMP} != 1; - + if ($binary == 32) { $link = `$compiler $openmp -m32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; if ($?) { @@ -223,7 +223,7 @@ if (!$?) { } $binary = "" if ($?); } - + if ($binary == 64) { $link = `$compiler $openmp -m64 -v ftest2.f 2>&1 && rm -f a.out a.exe`; if ($?) { @@ -235,12 +235,12 @@ if (!$?) { } $binary = "" if ($?); } - + if ($binary eq "") { $link = `$compiler $openmp -v ftest2.f 2>&1 && rm -f a.out a.exe`; } } - + $linker_L = ""; $linker_l = ""; $linker_a = ""; @@ -268,11 +268,11 @@ if ($link ne "") { } $linker_L .= $flags . " "; } - + if ($flags =~ /^\-Y/) { $linker_L .= "-Wl,". $flags . " "; } - + if ($flags =~ /^\-rpath\@/) { $flags =~ s/\@/\,/g; if ($vendor eq "PGI") { @@ -288,9 +288,9 @@ if ($link ne "") { } $linker_L .= "-Wl,". $flags . " " ; } - + if ( - ($flags =~ /^\-l/) + ($flags =~ /^\-l/) && ($flags !~ /gfortranbegin/) && ($flags !~ /frtbegin/) && ($flags !~ /pathfstart/) diff --git a/ftest.f b/ftest.f index 94ba566f5..73909c417 100644 --- a/ftest.f +++ b/ftest.f @@ -2,5 +2,5 @@ zhoge = (0.0d0,0.0d0) - return + return end diff --git a/ftest3.f b/ftest3.f index 8f2cd332f..82cba5099 100644 --- a/ftest3.f +++ b/ftest3.f @@ -2,5 +2,5 @@ zho_ge = (0.0d0,0.0d0) - return + return end diff --git a/getarch.c b/getarch.c index b100eb52f..234f7e172 100644 --- a/getarch.c +++ b/getarch.c @@ -13,19 +13,19 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ @@ -384,7 +384,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "BARCELONA" #endif -#if defined(FORCE_BOBCAT) +#if defined(FORCE_BOBCAT) #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" @@ -818,11 +818,11 @@ static int get_num_cores(void) { int m[2], count; size_t len; #endif - + #ifdef linux //returns the number of processors which are currently online return sysconf(_SC_NPROCESSORS_ONLN); - + #elif defined(OS_WINDOWS) GetSystemInfo(&sysinfo); @@ -855,7 +855,7 @@ int main(int argc, char *argv[]){ #ifdef FORCE printf("CORE=%s\n", CORENAME); -#else +#else #if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) printf("CORE=%s\n", get_corename()); #endif diff --git a/getarch_2nd.c b/getarch_2nd.c index 0b140bba4..fad647fed 100644 --- a/getarch_2nd.c +++ b/getarch_2nd.c @@ -15,7 +15,7 @@ int main(int argc, char **argv) { printf("DGEMM_UNROLL_N=%d\n", DGEMM_DEFAULT_UNROLL_N); printf("QGEMM_UNROLL_M=%d\n", QGEMM_DEFAULT_UNROLL_M); printf("QGEMM_UNROLL_N=%d\n", QGEMM_DEFAULT_UNROLL_N); - + printf("CGEMM_UNROLL_M=%d\n", CGEMM_DEFAULT_UNROLL_M); printf("CGEMM_UNROLL_N=%d\n", CGEMM_DEFAULT_UNROLL_N); printf("ZGEMM_UNROLL_M=%d\n", ZGEMM_DEFAULT_UNROLL_M); @@ -60,8 +60,8 @@ int main(int argc, char **argv) { #endif - } - + } + if ((argc >= 2) && (*argv[1] == '1')) { printf("#define SLOCAL_BUFFER_SIZE\t%ld\n", (SGEMM_DEFAULT_Q * SGEMM_DEFAULT_UNROLL_N * 4 * 1 * sizeof(float))); diff --git a/interface/Makefile b/interface/Makefile index 51f9937b8..465d722b0 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -154,7 +154,7 @@ QBLAS2OBJS = \ QBLAS3OBJS = \ qgemm.$(SUFFIX) qsymm.$(SUFFIX) qtrmm.$(SUFFIX) \ - qtrsm.$(SUFFIX) qsyrk.$(SUFFIX) qsyr2k.$(SUFFIX) + qtrsm.$(SUFFIX) qsyrk.$(SUFFIX) qsyr2k.$(SUFFIX) XBLAS1OBJS = \ xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ @@ -211,7 +211,7 @@ QBLAS2OBJS = \ QBLAS3OBJS = \ qgemm.$(SUFFIX) qsymm.$(SUFFIX) qtrmm.$(SUFFIX) \ - qtrsm.$(SUFFIX) qsyrk.$(SUFFIX) qsyr2k.$(SUFFIX) + qtrsm.$(SUFFIX) qsyrk.$(SUFFIX) qsyr2k.$(SUFFIX) XBLAS1OBJS = \ xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ @@ -298,7 +298,7 @@ CCBLAS2OBJS = \ cblas_cher.$(SUFFIX) cblas_cher2.$(SUFFIX) cblas_chpmv.$(SUFFIX) \ cblas_chpr.$(SUFFIX) cblas_chpr2.$(SUFFIX) cblas_ctbmv.$(SUFFIX) \ cblas_ctbsv.$(SUFFIX) cblas_ctpmv.$(SUFFIX) cblas_ctpsv.$(SUFFIX) \ - cblas_ctrmv.$(SUFFIX) cblas_ctrsv.$(SUFFIX) + cblas_ctrmv.$(SUFFIX) cblas_ctrsv.$(SUFFIX) CCBLAS3OBJS = \ cblas_cgemm.$(SUFFIX) cblas_csymm.$(SUFFIX) cblas_ctrmm.$(SUFFIX) cblas_ctrsm.$(SUFFIX) \ @@ -320,7 +320,7 @@ CZBLAS2OBJS = \ cblas_zher.$(SUFFIX) cblas_zher2.$(SUFFIX) cblas_zhpmv.$(SUFFIX) \ cblas_zhpr.$(SUFFIX) cblas_zhpr2.$(SUFFIX) cblas_ztbmv.$(SUFFIX) \ cblas_ztbsv.$(SUFFIX) cblas_ztpmv.$(SUFFIX) cblas_ztpsv.$(SUFFIX) \ - cblas_ztrmv.$(SUFFIX) cblas_ztrsv.$(SUFFIX) + cblas_ztrmv.$(SUFFIX) cblas_ztrsv.$(SUFFIX) CZBLAS3OBJS = \ cblas_zgemm.$(SUFFIX) cblas_zsymm.$(SUFFIX) cblas_ztrmm.$(SUFFIX) cblas_ztrsm.$(SUFFIX) \ @@ -2029,28 +2029,28 @@ caxpby.$(SUFFIX) caxpby.$(PSUFFIX) : zaxpby.c cblas_caxpby.$(SUFFIX) cblas_caxpby.$(PSUFFIX) : zaxpby.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) -domatcopy.$(SUFFIX) domatcopy.$(PSUFFIX) : omatcopy.c +domatcopy.$(SUFFIX) domatcopy.$(PSUFFIX) : omatcopy.c $(CC) -c $(CFLAGS) $< -o $(@F) -somatcopy.$(SUFFIX) somatcopy.$(PSUFFIX) : omatcopy.c +somatcopy.$(SUFFIX) somatcopy.$(PSUFFIX) : omatcopy.c $(CC) -c $(CFLAGS) $< -o $(@F) -comatcopy.$(SUFFIX) comatcopy.$(PSUFFIX) : zomatcopy.c +comatcopy.$(SUFFIX) comatcopy.$(PSUFFIX) : zomatcopy.c $(CC) -c $(CFLAGS) $< -o $(@F) -zomatcopy.$(SUFFIX) zomatcopy.$(PSUFFIX) : zomatcopy.c +zomatcopy.$(SUFFIX) zomatcopy.$(PSUFFIX) : zomatcopy.c $(CC) -c $(CFLAGS) $< -o $(@F) -dimatcopy.$(SUFFIX) dimatcopy.$(PSUFFIX) : imatcopy.c +dimatcopy.$(SUFFIX) dimatcopy.$(PSUFFIX) : imatcopy.c $(CC) -c $(CFLAGS) $< -o $(@F) -simatcopy.$(SUFFIX) simatcopy.$(PSUFFIX) : imatcopy.c +simatcopy.$(SUFFIX) simatcopy.$(PSUFFIX) : imatcopy.c $(CC) -c $(CFLAGS) $< -o $(@F) -cimatcopy.$(SUFFIX) cimatcopy.$(PSUFFIX) : zimatcopy.c +cimatcopy.$(SUFFIX) cimatcopy.$(PSUFFIX) : zimatcopy.c $(CC) -c $(CFLAGS) $< -o $(@F) -zimatcopy.$(SUFFIX) zimatcopy.$(PSUFFIX) : zimatcopy.c +zimatcopy.$(SUFFIX) zimatcopy.$(PSUFFIX) : zimatcopy.c $(CC) -c $(CFLAGS) $< -o $(@F) diff --git a/interface/asum.c b/interface/asum.c index 634836e28..139398940 100644 --- a/interface/asum.c +++ b/interface/asum.c @@ -45,7 +45,7 @@ #ifndef CBLAS FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ - + BLASLONG n = *N; BLASLONG incx = *INCX; FLOATRET ret; @@ -70,7 +70,7 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ #else FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ - + FLOAT ret; PRINT_DEBUG_CNAME; diff --git a/interface/axpby.c b/interface/axpby.c index 7e6fcf4c1..63dba81a4 100644 --- a/interface/axpby.c +++ b/interface/axpby.c @@ -40,7 +40,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY) { - + BLASLONG n = *N; BLASLONG incx = *INCX; BLASLONG incy = *INCY; @@ -66,7 +66,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT beta, FLOAT *y, FUNCTION_PROFILE_END(1, 2 * n, 2 * n); return; - + } diff --git a/interface/axpy.c b/interface/axpy.c index 5e288e3b9..61b7b4d78 100644 --- a/interface/axpy.c +++ b/interface/axpy.c @@ -45,7 +45,7 @@ #ifndef CBLAS void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ - + BLASLONG n = *N; BLASLONG incx = *INCX; BLASLONG incy = *INCY; @@ -85,12 +85,12 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc //In that case, the threads would be dependent. if (incx == 0 || incy == 0) nthreads = 1; - + //Temporarily work-around the low performance issue with small imput size & //multithreads. if (n <= 10000) nthreads = 1; - + if (nthreads == 1) { #endif @@ -105,9 +105,9 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; -#endif - - blas_level1_thread(mode, n, 0, 0, &alpha, +#endif + + blas_level1_thread(mode, n, 0, 0, &alpha, x, incx, y, incy, NULL, 0, (void *)AXPYU_K, nthreads); } @@ -118,5 +118,5 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc IDEBUG_END; return; - + } diff --git a/interface/copy.c b/interface/copy.c index 6965682ec..3fb2182a9 100644 --- a/interface/copy.c +++ b/interface/copy.c @@ -45,7 +45,7 @@ #ifndef CBLAS void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ - + BLASLONG n = *N; BLASLONG incx = *INCX; BLASLONG incy = *INCY; @@ -70,11 +70,11 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ if (incy < 0) y -= (n - 1) * incy * COMPSIZE; COPY_K(n, x, incx, y, incy); - + FUNCTION_PROFILE_END(COMPSIZE, COMPSIZE * n, 0); IDEBUG_END; return; - + } diff --git a/interface/dot.c b/interface/dot.c index 3744db5ea..3a918405e 100644 --- a/interface/dot.c +++ b/interface/dot.c @@ -45,7 +45,7 @@ #ifndef CBLAS FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ - + BLASLONG n = *N; BLASLONG incx = *INCX; BLASLONG incy = *INCY; @@ -74,7 +74,7 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ #else FLOAT CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ - + FLOAT ret; PRINT_DEBUG_CNAME; diff --git a/interface/dsdot.c b/interface/dsdot.c index 94237e0c4..32e4b492f 100644 --- a/interface/dsdot.c +++ b/interface/dsdot.c @@ -45,7 +45,7 @@ #ifndef CBLAS double NAME(blasint *N, float *x, blasint *INCX, float *y, blasint *INCY){ - + BLASLONG n = *N; BLASLONG incx = *INCX; BLASLONG incy = *INCY; @@ -69,7 +69,7 @@ double NAME(blasint *N, float *x, blasint *INCX, float *y, blasint *INCY){ IDEBUG_END; return ret; - + } #else @@ -77,7 +77,7 @@ double NAME(blasint *N, float *x, blasint *INCX, float *y, blasint *INCY){ double CNAME(blasint n, float *x, blasint incx, float *y, blasint incy){ double ret = 0.0; - + PRINT_DEBUG_CNAME; if (n <= 0) return 0; @@ -96,7 +96,7 @@ double CNAME(blasint n, float *x, blasint incx, float *y, blasint incy){ IDEBUG_END; return ret; - + } #endif diff --git a/interface/gbmv.c b/interface/gbmv.c index a76c48d04..096c9f6f2 100644 --- a/interface/gbmv.c +++ b/interface/gbmv.c @@ -123,7 +123,7 @@ void NAME(char *TRANS, blasint *M, blasint *N, if (n < 0) info = 3; if (m < 0) info = 2; if (i < 0) info = 1; - + trans = i; if (info != 0){ @@ -160,7 +160,7 @@ void CNAME(enum CBLAS_ORDER order, if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 0; if (TransA == CblasConjTrans) trans = 1; - + info = -1; if (incy == 0) info = 13; @@ -214,9 +214,9 @@ void CNAME(enum CBLAS_ORDER order, if (trans) leny = n; if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); - + if (alpha == ZERO) return; - + IDEBUG_START; FUNCTION_PROFILE_START(); @@ -228,7 +228,7 @@ void CNAME(enum CBLAS_ORDER order, #ifdef SMP nthreads = num_cpu_avail(2); - + if (nthreads == 1) { #endif diff --git a/interface/gemm.c b/interface/gemm.c index 9ce7fe526..07fea153c 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -110,7 +110,7 @@ void NAME(char *TRANSA, char *TRANSB, FLOAT *b, blasint *ldB, FLOAT *beta, FLOAT *c, blasint *ldC){ - + blas_arg_t args; int transa, transb, nrowa, nrowb; @@ -128,7 +128,7 @@ void NAME(char *TRANSA, char *TRANSB, int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -136,7 +136,7 @@ void NAME(char *TRANSA, char *TRANSB, int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif #endif @@ -199,7 +199,7 @@ void NAME(char *TRANSA, char *TRANSB, if (args.ldc < args.m) info = 13; if (args.ldb < nrowb) info = 10; - if (args.lda < nrowa) info = 8; + if (args.lda < nrowa) info = 8; if (args.k < 0) info = 5; if (args.n < 0) info = 4; if (args.m < 0) info = 3; @@ -221,7 +221,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS FLOAT *alpha, #endif FLOAT *a, blasint lda, - FLOAT *b, blasint ldb, + FLOAT *b, blasint ldb, #ifndef COMPLEX FLOAT beta, #else @@ -244,7 +244,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -252,7 +252,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif #endif @@ -278,15 +278,15 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS args.m = m; args.n = n; args.k = k; - + args.a = (void *)a; args.b = (void *)b; args.c = (void *)c; - + args.lda = lda; args.ldb = ldb; args.ldc = ldc; - + if (TransA == CblasNoTrans) transa = 0; if (TransA == CblasTrans) transa = 1; #ifndef COMPLEX @@ -305,7 +305,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS if (TransB == CblasConjNoTrans) transb = 2; if (TransB == CblasConjTrans) transb = 3; #endif - + nrowa = args.m; if (transa & 1) nrowa = args.k; nrowb = args.k; @@ -315,7 +315,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS if (args.ldc < args.m) info = 13; if (args.ldb < nrowb) info = 10; - if (args.lda < nrowa) info = 8; + if (args.lda < nrowa) info = 8; if (args.k < 0) info = 5; if (args.n < 0) info = 4; if (args.m < 0) info = 3; @@ -327,11 +327,11 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS args.m = n; args.n = m; args.k = k; - + args.a = (void *)b; args.b = (void *)a; args.c = (void *)c; - + args.lda = ldb; args.ldb = lda; args.ldc = ldc; @@ -354,7 +354,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS if (TransA == CblasConjNoTrans) transb = 2; if (TransA == CblasConjTrans) transb = 3; #endif - + nrowa = args.m; if (transa & 1) nrowa = args.k; nrowb = args.k; @@ -364,7 +364,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS if (args.ldc < args.m) info = 13; if (args.ldb < nrowb) info = 10; - if (args.lda < nrowa) info = 8; + if (args.lda < nrowa) info = 8; if (args.k < 0) info = 5; if (args.n < 0) info = 4; if (args.m < 0) info = 3; @@ -392,10 +392,10 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS FUNCTION_PROFILE_START(); buffer = (XFLOAT *)blas_memory_alloc(0); - + sa = (XFLOAT *)((BLASLONG)buffer +GEMM_OFFSET_A); sb = (XFLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - + #ifdef SMP mode |= (transa << BLAS_TRANSA_SHIFT); mode |= (transb << BLAS_TRANSB_SHIFT); @@ -406,15 +406,15 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #ifndef COMPLEX double MNK = (double) args.m * (double) args.n * (double) args.k; if ( MNK <= (1024.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) - nthreads_max = 1; + nthreads_max = 1; else { if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) { - nthreads_max = 4; + nthreads_max = 4; if ( args.m < 16 * GEMM_MULTITHREAD_THRESHOLD ) { - nthreads_max = 2; + nthreads_max = 2; if ( args.m < 3 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1; if ( args.n < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1; if ( args.k < 3 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1; @@ -428,15 +428,15 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #else double MNK = (double) args.m * (double) args.n * (double) args.k; if ( MNK <= (256.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) - nthreads_max = 1; + nthreads_max = 1; else { if ( MNK <= (16384.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) { - nthreads_max = 4; + nthreads_max = 4; if ( args.m < 3 * GEMM_MULTITHREAD_THRESHOLD ) { - nthreads_max = 2; + nthreads_max = 2; if ( args.m <= 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1; if ( args.n < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1; if ( args.k < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1; @@ -459,24 +459,24 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS if (args.nthreads == 1) { #endif - + (gemm[(transb << 2) | transa])(&args, NULL, NULL, sa, sb, 0); - + #ifdef SMP - + } else { - + #ifndef USE_SIMPLE_THREADED_LEVEL3 #ifndef NO_AFFINITY nodes = get_num_nodes(); - + if ((nodes > 1) && get_node_equal()) { - + args.nthreads /= nodes; - + gemm_thread_mn(mode, &args, NULL, NULL, gemm[16 | (transb << 2) | transa], sa, sb, nodes); - + } else { #endif @@ -485,21 +485,21 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #else GEMM_THREAD(mode, &args, NULL, NULL, gemm[(transb << 2) | transa], sa, sb, args.nthreads); - + #endif - + #ifndef USE_SIMPLE_THREADED_LEVEL3 #ifndef NO_AFFINITY } #endif #endif - + #endif - + #ifdef SMP } #endif - + blas_memory_free(buffer); FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.k + args.k * args.n + args.m * args.n, 2 * args.m * args.n * args.k); diff --git a/interface/gemv.c b/interface/gemv.c index 9ea8aa895..562ceee9f 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -85,7 +85,7 @@ void NAME(char *TRANS, blasint *M, blasint *N, int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { GEMV_N, GEMV_T, }; - + blasint info; blasint lenx, leny; blasint i; @@ -109,7 +109,7 @@ void NAME(char *TRANS, blasint *M, blasint *N, if (n < 0) info = 3; if (m < 0) info = 2; if (i < 0) info = 1; - + trans = i; if (info != 0){ @@ -150,7 +150,7 @@ void CNAME(enum CBLAS_ORDER order, if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 0; if (TransA == CblasConjTrans) trans = 1; - + info = -1; if (incy == 0) info = 11; @@ -159,7 +159,7 @@ void CNAME(enum CBLAS_ORDER order, if (n < 0) info = 3; if (m < 0) info = 2; if (trans < 0) info = 1; - + } if (order == CblasRowMajor) { @@ -198,7 +198,7 @@ void CNAME(enum CBLAS_ORDER order, if (trans) leny = n; if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); - + if (alpha == ZERO) return; IDEBUG_START; @@ -215,17 +215,17 @@ void CNAME(enum CBLAS_ORDER order, if (nthreads == 1) { #endif - + (gemv[(int)trans])(m, n, 0, alpha, a, lda, x, incx, y, incy, buffer); - + #ifdef SMP } else { - + (gemv_thread[(int)trans])(m, n, alpha, a, lda, x, incx, y, incy, buffer, nthreads); - + } #endif - + blas_memory_free(buffer); FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); diff --git a/interface/ger.c b/interface/ger.c index a07866c74..7c9cd425f 100644 --- a/interface/ger.c +++ b/interface/ger.c @@ -157,7 +157,7 @@ void CNAME(enum CBLAS_ORDER order, /* Quick return if possible. */ if (m == 0 || n == 0) return; if (alpha == 0.) return; - + IDEBUG_START; FUNCTION_PROFILE_START(); @@ -178,9 +178,9 @@ void CNAME(enum CBLAS_ORDER order, #ifdef SMPBUG } else { - + GER_THREAD(m, n, alpha, x, incx, y, incy, a, lda, buffer, nthreads); - + } #endif diff --git a/interface/imatcopy.c b/interface/imatcopy.c index 4a86d83cd..3bc886f4f 100644 --- a/interface/imatcopy.c +++ b/interface/imatcopy.c @@ -54,7 +54,7 @@ void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, { char Order, Trans; - int order=-1,trans=-1; + int order=-1,trans=-1; blasint info = -1; FLOAT *b; size_t msize; diff --git a/interface/imax.c b/interface/imax.c index 37396c7f8..55ffa7c6e 100644 --- a/interface/imax.c +++ b/interface/imax.c @@ -121,7 +121,7 @@ #ifndef CBLAS blasint NAME(blasint *N, FLOAT *x, blasint *INCX){ - + BLASLONG n = *N; BLASLONG incx = *INCX; blasint ret; @@ -146,7 +146,7 @@ blasint NAME(blasint *N, FLOAT *x, blasint *INCX){ #else CBLAS_INDEX CNAME(blasint n, FLOAT *x, blasint incx){ - + CBLAS_INDEX ret; PRINT_DEBUG_CNAME; diff --git a/interface/lapack/gesv.c b/interface/lapack/gesv.c index ce6bcbd0b..721da970d 100644 --- a/interface/lapack/gesv.c +++ b/interface/lapack/gesv.c @@ -71,7 +71,7 @@ int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv, extern #endif FLOAT *sa, *sb; - + PRINT_DEBUG_NAME; args.m = *N; @@ -121,18 +121,18 @@ int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv, args.n = *N; info = GETRF_SINGLE(&args, NULL, NULL, sa, sb, 0); - + if (info == 0){ args.n = *NRHS; GETRS_N_SINGLE(&args, NULL, NULL, sa, sb, 0); } - + #ifdef SMP } else { args.n = *N; info = GETRF_PARALLEL(&args, NULL, NULL, sa, sb, 0); - + if (info == 0){ args.n = *NRHS; GETRS_N_PARALLEL(&args, NULL, NULL, sa, sb, 0); diff --git a/interface/lapack/getf2.c b/interface/lapack/getf2.c index cae15953b..3e66c0403 100644 --- a/interface/lapack/getf2.c +++ b/interface/lapack/getf2.c @@ -60,7 +60,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint extern #endif FLOAT *sa, *sb; - + PRINT_DEBUG_NAME; args.m = *M; @@ -81,7 +81,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint *Info = 0; if (args.m == 0 || args.n == 0) return 0; - + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/lapack/getrf.c b/interface/lapack/getrf.c index aa799e8d3..44a92ddc4 100644 --- a/interface/lapack/getrf.c +++ b/interface/lapack/getrf.c @@ -53,14 +53,14 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint *Info){ blas_arg_t args; - + blasint info; FLOAT *buffer; #ifdef PPC440 extern #endif FLOAT *sa, *sb; - + PRINT_DEBUG_NAME; args.m = *M; diff --git a/interface/lapack/getrs.c b/interface/lapack/getrs.c index 761a00160..1b8c83aca 100644 --- a/interface/lapack/getrs.c +++ b/interface/lapack/getrs.c @@ -105,7 +105,7 @@ int NAME(char *TRANS, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return 0; } - + args.alpha = NULL; args.beta = NULL; @@ -148,5 +148,5 @@ int NAME(char *TRANS, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, IDEBUG_END; return 0; - + } diff --git a/interface/lapack/larf.c.obsolete b/interface/lapack/larf.c.obsolete index 3b538c4a2..5e62c40cb 100644 --- a/interface/lapack/larf.c.obsolete +++ b/interface/lapack/larf.c.obsolete @@ -58,7 +58,7 @@ int NAME(char *SIDE, blasint *M, blasint *N, FLOAT *v, blasint *incV, FLOAT *tau char side_arg = *SIDE; int side; - + PRINT_DEBUG_NAME; TOUPPER(side_arg); @@ -77,7 +77,7 @@ int NAME(char *SIDE, blasint *M, blasint *N, FLOAT *v, blasint *incV, FLOAT *tau if (side_arg == 'R') side = 1; if (args.m == 0 || args.n == 0) return 0; - + #ifndef COMPLEX if (*tau == ZERO) return 0; #else diff --git a/interface/lapack/laswp.c b/interface/lapack/laswp.c index 026b5156f..ebeb103e7 100644 --- a/interface/lapack/laswp.c +++ b/interface/lapack/laswp.c @@ -53,7 +53,7 @@ static int (*laswp[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FL }; int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint *ipiv, blasint *INCX){ - + blasint n = *N; blasint lda = *LDA; blasint k1 = *K1; @@ -93,10 +93,10 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint * mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif - blas_level1_thread(mode, n, k1, k2, dummyalpha, - a, lda, NULL, 0, ipiv, incx, + blas_level1_thread(mode, n, k1, k2, dummyalpha, + a, lda, NULL, 0, ipiv, incx, laswp[flag], nthreads); } #endif @@ -106,5 +106,5 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint * IDEBUG_END; return 0; - + } diff --git a/interface/lapack/lauu2.c b/interface/lapack/lauu2.c index 14417e986..3599a4791 100644 --- a/interface/lapack/lauu2.c +++ b/interface/lapack/lauu2.c @@ -72,7 +72,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ extern #endif FLOAT *sa, *sb; - + PRINT_DEBUG_NAME; args.n = *N; @@ -98,7 +98,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ *Info = 0; if (args.n <= 0) return 0; - + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/lapack/lauum.c b/interface/lapack/lauum.c index e5b593f30..2c49eb0b0 100644 --- a/interface/lapack/lauum.c +++ b/interface/lapack/lauum.c @@ -78,7 +78,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ args.n = *N; args.a = (void *)a; args.lda = *ldA; - + TOUPPER(uplo_arg); uplo = -1; @@ -98,7 +98,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ *Info = 0; if (args.n == 0) return 0; - + IDEBUG_START; FUNCTION_PROFILE_START(); @@ -118,7 +118,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ #endif *Info = (lauum_single[uplo])(&args, NULL, NULL, sa, sb, 0); - + #ifdef SMP } else { *Info = (lauum_parallel[uplo])(&args, NULL, NULL, sa, sb, 0); diff --git a/interface/lapack/potf2.c b/interface/lapack/potf2.c index 76822a49c..837192265 100644 --- a/interface/lapack/potf2.c +++ b/interface/lapack/potf2.c @@ -72,7 +72,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ extern #endif FLOAT *sa, *sb; - + PRINT_DEBUG_NAME; args.n = *N; @@ -98,7 +98,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ *Info = 0; if (args.n <= 0) return 0; - + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/lapack/potrf.c b/interface/lapack/potrf.c index 9a15012d3..092272225 100644 --- a/interface/lapack/potrf.c +++ b/interface/lapack/potrf.c @@ -72,7 +72,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ extern #endif FLOAT *sa, *sb; - + PRINT_DEBUG_NAME; args.n = *N; @@ -98,7 +98,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ *Info = 0; if (args.n == 0) return 0; - + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/lapack/potri.c b/interface/lapack/potri.c index a4f33221a..d6230621f 100644 --- a/interface/lapack/potri.c +++ b/interface/lapack/potri.c @@ -80,7 +80,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ extern #endif FLOAT *sa, *sb; - + PRINT_DEBUG_NAME; args.n = *N; @@ -107,7 +107,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ *Info = 0; if (args.n == 0) return 0; - + IDEBUG_START; FUNCTION_PROFILE_START(); @@ -137,11 +137,11 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ #ifdef SMP } else { info = (trtri_parallel[uplo])(&args, NULL, NULL, sa, sb, 0); - + if (!info) { info = (lauum_parallel[uplo])(&args, NULL, NULL, sa, sb, 0); } - + *Info = info; } #endif diff --git a/interface/lapack/trti2.c b/interface/lapack/trti2.c index e119b45af..42c4c4815 100644 --- a/interface/lapack/trti2.c +++ b/interface/lapack/trti2.c @@ -73,13 +73,13 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In extern #endif FLOAT *sa, *sb; - + PRINT_DEBUG_NAME; args.n = *N; args.a = (void *)a; args.lda = *ldA; - + TOUPPER(uplo_arg); TOUPPER(diag_arg); @@ -92,7 +92,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In info = 0; if (args.lda < MAX(1,args.n)) info = 5; - if (args.n < 0) info = 3; + if (args.n < 0) info = 3; if (diag < 0) info = 2; if (uplo < 0) info = 1; if (info) { @@ -104,7 +104,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In *Info = 0; if (args.n <= 0) return 0; - + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/lapack/trtri.c b/interface/lapack/trtri.c index 5aa5e9b9b..6724a678a 100644 --- a/interface/lapack/trtri.c +++ b/interface/lapack/trtri.c @@ -74,7 +74,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In extern #endif FLOAT *sa, *sb; - + PRINT_DEBUG_NAME; args.n = *N; @@ -95,7 +95,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In info = 0; if (args.lda < MAX(1,args.n)) info = 5; - if (args.n < 0) info = 3; + if (args.n < 0) info = 3; if (diag < 0) info = 2; if (uplo < 0) info = 1; if (info) { @@ -107,7 +107,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In *Info = 0; if (args.n == 0) return 0; - + if (diag) { if (AMIN_K(args.n, args.a, args.lda + 1) == ZERO) { *Info = IAMIN_K(args.n, args.a, args.lda + 1); @@ -133,12 +133,12 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In #endif *Info = (trtri_single[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0); - + #ifdef SMP } else { *Info = (trtri_parallel[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0); - + } #endif diff --git a/interface/lapack/zgetf2.c b/interface/lapack/zgetf2.c index 950ef46e9..59ec4874e 100644 --- a/interface/lapack/zgetf2.c +++ b/interface/lapack/zgetf2.c @@ -60,7 +60,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint extern #endif FLOAT *sa, *sb; - + PRINT_DEBUG_NAME; args.m = *M; diff --git a/interface/lapack/zgetrf.c b/interface/lapack/zgetrf.c index 9f041d9bd..5031f587b 100644 --- a/interface/lapack/zgetrf.c +++ b/interface/lapack/zgetrf.c @@ -60,7 +60,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint extern #endif FLOAT *sa, *sb; - + PRINT_DEBUG_NAME; args.m = *M; diff --git a/interface/lapack/zgetrs.c b/interface/lapack/zgetrs.c index 81d50e34f..54d4b0905 100644 --- a/interface/lapack/zgetrs.c +++ b/interface/lapack/zgetrs.c @@ -105,7 +105,7 @@ int NAME(char *TRANS, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return 0; } - + args.alpha = NULL; args.beta = NULL; @@ -139,7 +139,7 @@ int NAME(char *TRANS, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, } #endif - + #ifndef PPC440 blas_memory_free(buffer); #endif @@ -149,5 +149,5 @@ int NAME(char *TRANS, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, IDEBUG_END; return 0; - + } diff --git a/interface/lapack/zlaswp.c b/interface/lapack/zlaswp.c index 85ead2c86..31e08451d 100644 --- a/interface/lapack/zlaswp.c +++ b/interface/lapack/zlaswp.c @@ -53,7 +53,7 @@ static int (*laswp[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASL }; int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint *ipiv, blasint *INCX){ - + blasint n = *N; blasint lda = *LDA; blasint k1 = *K1; @@ -94,7 +94,7 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint * mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, laswp[flag], nthreads); } diff --git a/interface/lapack/zlauu2.c b/interface/lapack/zlauu2.c index 05603fe1b..b0698ef2e 100644 --- a/interface/lapack/zlauu2.c +++ b/interface/lapack/zlauu2.c @@ -79,7 +79,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ args.n = *N; args.a = (void *)a; args.lda = *ldA; - + TOUPPER(uplo_arg); uplo = -1; @@ -99,7 +99,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ *Info = 0; if (args.n <= 0) return 0; - + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/lapack/zlauum.c b/interface/lapack/zlauum.c index 23990e8e4..4a36cc173 100644 --- a/interface/lapack/zlauum.c +++ b/interface/lapack/zlauum.c @@ -78,7 +78,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ args.n = *N; args.a = (void *)a; args.lda = *ldA; - + TOUPPER(uplo_arg); uplo = -1; @@ -98,7 +98,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ *Info = 0; if (args.n == 0) return 0; - + IDEBUG_START; FUNCTION_PROFILE_START(); @@ -118,7 +118,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ #endif *Info = (lauum_single[uplo])(&args, NULL, NULL, sa, sb, 0); - + #ifdef SMP } else { diff --git a/interface/lapack/zpotf2.c b/interface/lapack/zpotf2.c index f8f81e2c5..27ee0891a 100644 --- a/interface/lapack/zpotf2.c +++ b/interface/lapack/zpotf2.c @@ -79,7 +79,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ args.n = *N; args.a = (void *)a; args.lda = *ldA; - + TOUPPER(uplo_arg); uplo = -1; @@ -99,7 +99,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ *Info = 0; if (args.n <= 0) return 0; - + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/lapack/zpotrf.c b/interface/lapack/zpotrf.c index e2004d744..8cd3980d5 100644 --- a/interface/lapack/zpotrf.c +++ b/interface/lapack/zpotrf.c @@ -78,7 +78,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ args.n = *N; args.a = (void *)a; args.lda = *ldA; - + TOUPPER(uplo_arg); uplo = -1; @@ -98,7 +98,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ *Info = 0; if (args.n == 0) return 0; - + IDEBUG_START; FUNCTION_PROFILE_START(); @@ -115,7 +115,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { -#endif +#endif *Info = (potrf_single[uplo])(&args, NULL, NULL, sa, sb, 0); diff --git a/interface/lapack/zpotri.c b/interface/lapack/zpotri.c index b777c111f..7c72a7e62 100644 --- a/interface/lapack/zpotri.c +++ b/interface/lapack/zpotri.c @@ -80,7 +80,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ extern #endif FLOAT *sa, *sb; - + PRINT_DEBUG_NAME; args.n = *N; @@ -107,7 +107,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ *Info = 0; if (args.n == 0) return 0; - + IDEBUG_START; FUNCTION_PROFILE_START(); @@ -136,11 +136,11 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ #ifdef SMP } else { info = (trtri_parallel[uplo])(&args, NULL, NULL, sa, sb, 0); - + if (!info) { info = (lauum_parallel[uplo])(&args, NULL, NULL, sa, sb, 0); } - + *Info = info; } #endif diff --git a/interface/lapack/ztrti2.c b/interface/lapack/ztrti2.c index 017374c37..a25476677 100644 --- a/interface/lapack/ztrti2.c +++ b/interface/lapack/ztrti2.c @@ -73,13 +73,13 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In extern #endif FLOAT *sa, *sb; - + PRINT_DEBUG_NAME; args.n = *N; args.a = (void *)a; args.lda = *ldA; - + TOUPPER(uplo_arg); TOUPPER(diag_arg); @@ -92,7 +92,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In info = 0; if (args.lda < MAX(1,args.n)) info = 5; - if (args.n < 0) info = 3; + if (args.n < 0) info = 3; if (diag < 0) info = 2; if (uplo < 0) info = 1; if (info) { @@ -104,7 +104,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In *Info = 0; if (args.n <= 0) return 0; - + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/lapack/ztrtri.c b/interface/lapack/ztrtri.c index 89caf80d3..b3ce85b9f 100644 --- a/interface/lapack/ztrtri.c +++ b/interface/lapack/ztrtri.c @@ -73,13 +73,13 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In extern #endif FLOAT *sa, *sb; - + PRINT_DEBUG_NAME; args.n = *N; args.a = (void *)a; args.lda = *ldA; - + TOUPPER(uplo_arg); TOUPPER(diag_arg); @@ -92,7 +92,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In info = 0; if (args.lda < MAX(1,args.n)) info = 5; - if (args.n < 0) info = 3; + if (args.n < 0) info = 3; if (diag < 0) info = 2; if (uplo < 0) info = 1; if (info) { @@ -104,7 +104,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In *Info = 0; if (args.n == 0) return 0; - + if (diag) { if (AMIN_K(args.n, args.a, args.lda + 1) == ZERO) { *Info = IAMIN_K(args.n, args.a, args.lda + 1); @@ -131,12 +131,12 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In #endif *Info = (trtri_single[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0); - + #ifdef SMP } else { *Info = (trtri_parallel[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0); - + } #endif diff --git a/interface/max.c b/interface/max.c index 9bedaddd0..f05977448 100644 --- a/interface/max.c +++ b/interface/max.c @@ -121,7 +121,7 @@ #ifndef CBLAS FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ - + BLASLONG n = *N; BLASLONG incx = *INCX; FLOATRET ret; @@ -146,7 +146,7 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ #else FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ - + FLOAT ret; PRINT_DEBUG_CNAME; diff --git a/interface/nrm2.c b/interface/nrm2.c index ff8ef6d0d..cb4c8f6f4 100644 --- a/interface/nrm2.c +++ b/interface/nrm2.c @@ -45,7 +45,7 @@ #ifndef CBLAS FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ - + BLASLONG n = *N; BLASLONG incx = *INCX; FLOATRET ret; @@ -70,7 +70,7 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ #else FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ - + FLOAT ret; PRINT_DEBUG_CNAME; diff --git a/interface/omatcopy.c b/interface/omatcopy.c index 1727e04ef..0c418b3c9 100644 --- a/interface/omatcopy.c +++ b/interface/omatcopy.c @@ -51,7 +51,7 @@ void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, { char Order, Trans; - int order=-1,trans=-1; + int order=-1,trans=-1; blasint info = -1; Order = *ORDER; diff --git a/interface/rot.c b/interface/rot.c index 2e458b12d..125275a18 100644 --- a/interface/rot.c +++ b/interface/rot.c @@ -45,7 +45,7 @@ #ifndef CBLAS void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *C, FLOAT *S){ - + BLASLONG n = *N; BLASLONG incx = *INCX; BLASLONG incy = *INCY; @@ -78,5 +78,5 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT c, F IDEBUG_END; return; - + } diff --git a/interface/rotm.c b/interface/rotm.c index 4f026c75d..9dc08354a 100644 --- a/interface/rotm.c +++ b/interface/rotm.c @@ -18,13 +18,13 @@ void CNAME(blasint n, FLOAT *dx, blasint incx, FLOAT *dy, blasint incy, FLOAT *d #endif blasint i__1, i__2; - + blasint i__; FLOAT w, z__; blasint kx, ky; FLOAT dh11, dh12, dh22, dh21, dflag; blasint nsteps; - + #ifndef CBLAS PRINT_DEBUG_CNAME; #else @@ -34,7 +34,7 @@ void CNAME(blasint n, FLOAT *dx, blasint incx, FLOAT *dy, blasint incy, FLOAT *d --dparam; --dy; --dx; - + dflag = dparam[1]; if (n <= 0 || dflag == - 2.0) goto L140; diff --git a/interface/rotmg.c b/interface/rotmg.c index c1f5777c3..1c41e14ef 100644 --- a/interface/rotmg.c +++ b/interface/rotmg.c @@ -27,8 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /************************************************************************************** * 2014/05/02 Saar -* fixed two bugs as reported by Brendan Tracey -* Test with lapack-3.5.0 : OK +* fixed two bugs as reported by Brendan Tracey +* Test with lapack-3.5.0 : OK * **************************************************************************************/ @@ -108,7 +108,7 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ if(dq2 < ZERO) { dflag = -ONE; - + dh11 = ZERO; dh12 = ZERO; dh21 = ZERO; @@ -130,7 +130,7 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ *dd2 = *dd1 / du; *dd1 = dtemp; *dx1 = dy1 * du; - } + } } @@ -169,7 +169,7 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ } } } - + if(*dd2 != ZERO) { while( (ABS(*dd2) <= RGAMSQ) || (ABS(*dd2) >= GAMSQ) ) @@ -203,7 +203,7 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ } } } - + } if(dflag < ZERO) diff --git a/interface/sbmv.c b/interface/sbmv.c index c481d5609..0dac736cb 100644 --- a/interface/sbmv.c +++ b/interface/sbmv.c @@ -75,7 +75,7 @@ static int (*sbmv_thread[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT #ifndef CBLAS -void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *LDA, +void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ char uplo_arg = *UPLO; @@ -101,7 +101,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint * if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; - + info = 0; if (incy == 0) info = 11; @@ -115,7 +115,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint * BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } - + #else void CNAME(enum CBLAS_ORDER order, @@ -142,7 +142,7 @@ void CNAME(enum CBLAS_ORDER order, if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; - + info = -1; if (incy == 0) info = 11; diff --git a/interface/scal.c b/interface/scal.c index c3e03c742..3f468a2a3 100644 --- a/interface/scal.c +++ b/interface/scal.c @@ -45,7 +45,7 @@ #ifndef CBLAS void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX){ - + blasint n = *N; blasint incx = *INCX; FLOAT alpha = *ALPHA; @@ -53,7 +53,7 @@ void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX){ #else void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){ - + #endif #ifdef SMP @@ -80,7 +80,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){ if (n <= 1048576 ) nthreads = 1; - + if (nthreads == 1) { #endif @@ -93,11 +93,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){ mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; -#endif - +#endif + blas_level1_thread(mode, n, 0, 0, #ifndef CBLAS - ALPHA, + ALPHA, #else &alpha, #endif @@ -111,5 +111,5 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){ IDEBUG_END; return; - + } diff --git a/interface/sdsdot.c b/interface/sdsdot.c index 168468c3a..6c457fa6c 100644 --- a/interface/sdsdot.c +++ b/interface/sdsdot.c @@ -45,14 +45,14 @@ #ifndef CBLAS FLOATRET NAME(blasint *N, FLOAT *a, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ - + BLASLONG n = *N; BLASLONG incx = *INCX; BLASLONG incy = *INCY; FLOATRET ret; PRINT_DEBUG_NAME; - + if (n <= 0) return(*a) ; IDEBUG_START; @@ -69,13 +69,13 @@ FLOATRET NAME(blasint *N, FLOAT *a, FLOAT *x, blasint *INCX, FLOAT *y, blasint * IDEBUG_END; return ret; - + } #else FLOAT CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ - + FLOAT ret; PRINT_DEBUG_CNAME; diff --git a/interface/spmv.c b/interface/spmv.c index 3f853e56e..403458b06 100644 --- a/interface/spmv.c +++ b/interface/spmv.c @@ -99,7 +99,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; - + info = 0; if (incy == 0) info = 9; @@ -111,7 +111,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } - + #else void CNAME(enum CBLAS_ORDER order, @@ -138,7 +138,7 @@ void CNAME(enum CBLAS_ORDER order, if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; - + info = -1; if (incy == 0) info = 9; diff --git a/interface/spr.c b/interface/spr.c index aa2ff8f3f..1956986e9 100644 --- a/interface/spr.c +++ b/interface/spr.c @@ -75,7 +75,7 @@ static int (*spr_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, FLOAT *, #ifndef CBLAS -void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *a){ char uplo_arg = *UPLO; @@ -97,7 +97,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; - + info = 0; if (incx == 0) info = 5; @@ -108,7 +108,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } - + #else void CNAME(enum CBLAS_ORDER order, @@ -133,7 +133,7 @@ void CNAME(enum CBLAS_ORDER order, if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; - + info = -1; if (incx == 0) info = 5; diff --git a/interface/spr2.c b/interface/spr2.c index e556d3fa8..73a811c3e 100644 --- a/interface/spr2.c +++ b/interface/spr2.c @@ -75,7 +75,7 @@ static int (*spr2_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLON #ifndef CBLAS -void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a){ char uplo_arg = *UPLO; @@ -98,7 +98,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; - + info = 0; if (incy == 0) info = 7; @@ -110,7 +110,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } - + #else void CNAME(enum CBLAS_ORDER order, @@ -136,7 +136,7 @@ void CNAME(enum CBLAS_ORDER order, if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; - + info = -1; if (incy == 0) info = 7; diff --git a/interface/swap.c b/interface/swap.c index 271fa083a..3baeb27c4 100644 --- a/interface/swap.c +++ b/interface/swap.c @@ -45,7 +45,7 @@ #ifndef CBLAS void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ - + blasint n = *N; blasint incx = *INCX; blasint incy = *INCY; @@ -78,12 +78,12 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ #ifdef SMP nthreads = num_cpu_avail(1); - + //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. if (incx == 0 || incy == 0) nthreads = 1; - + if (nthreads == 1) { #endif @@ -91,15 +91,15 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ #ifdef SMP } else { - + #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; -#endif - +#endif + blas_level1_thread(mode, n, 0, 0, dummyalpha, x, incx, y, incy, NULL, 0, (void *)SWAP_K, nthreads); } @@ -111,5 +111,5 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ IDEBUG_END; return; - + } diff --git a/interface/symm.c b/interface/symm.c index b447f13e8..959a4ebbc 100644 --- a/interface/symm.c +++ b/interface/symm.c @@ -121,12 +121,12 @@ static int (*symm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLA #ifndef CBLAS -void NAME(char *SIDE, char *UPLO, - blasint *M, blasint *N, - FLOAT *alpha, FLOAT *a, blasint *ldA, - FLOAT *b, blasint *ldB, +void NAME(char *SIDE, char *UPLO, + blasint *M, blasint *N, + FLOAT *alpha, FLOAT *a, blasint *ldA, + FLOAT *b, blasint *ldB, FLOAT *beta, FLOAT *c, blasint *ldC){ - + char side_arg = *SIDE; char uplo_arg = *UPLO; @@ -143,7 +143,7 @@ void NAME(char *SIDE, char *UPLO, int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -179,13 +179,13 @@ void NAME(char *SIDE, char *UPLO, if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; - + args.m = *M; args.n = *N; args.c = (void *)c; args.ldc = *ldC; - + info = 0; if (args.ldc < MAX(1, args.m)) info = 12; @@ -193,17 +193,17 @@ void NAME(char *SIDE, char *UPLO, if (!side) { args.a = (void *)a; args.b = (void *)b; - + args.lda = *ldA; args.ldb = *ldB; - + if (args.ldb < MAX(1, args.m)) info = 9; if (args.lda < MAX(1, args.m)) info = 7; } else { args.a = (void *)b; args.b = (void *)a; - + args.lda = *ldB; args.ldb = *ldA; @@ -220,7 +220,7 @@ void NAME(char *SIDE, char *UPLO, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } - + #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, @@ -254,7 +254,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -262,7 +262,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif #endif @@ -304,24 +304,24 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, if (!side) { args.a = (void *)a; args.b = (void *)b; - + args.lda = lda; args.ldb = ldb; - + if (args.ldb < MAX(1, args.m)) info = 9; if (args.lda < MAX(1, args.m)) info = 7; - + } else { args.a = (void *)b; args.b = (void *)a; - + args.lda = ldb; args.ldb = lda; - + if (args.lda < MAX(1, args.m)) info = 9; if (args.ldb < MAX(1, args.n)) info = 7; } - + if (args.n < 0) info = 4; if (args.m < 0) info = 3; if (uplo < 0) info = 2; @@ -345,24 +345,24 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, if (!side) { args.a = (void *)a; args.b = (void *)b; - + args.lda = lda; args.ldb = ldb; - + if (args.ldb < MAX(1, args.m)) info = 9; if (args.lda < MAX(1, args.m)) info = 7; - + } else { args.a = (void *)b; args.b = (void *)a; - + args.lda = ldb; args.ldb = lda; - + if (args.lda < MAX(1, args.m)) info = 9; if (args.ldb < MAX(1, args.n)) info = 7; } - + if (args.n < 0) info = 4; if (args.m < 0) info = 3; if (uplo < 0) info = 2; @@ -383,10 +383,10 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, FUNCTION_PROFILE_START(); buffer = (FLOAT *)blas_memory_alloc(0); - + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - + #ifdef SMP args.common = NULL; args.nthreads = num_cpu_avail(3); @@ -402,25 +402,25 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, #ifndef NO_AFFINITY nodes = get_num_nodes(); - + if (nodes > 1) { - + args.nthreads /= nodes; - - gemm_thread_mn(mode, &args, NULL, NULL, + + gemm_thread_mn(mode, &args, NULL, NULL, symm[4 | (side << 1) | uplo ], sa, sb, nodes); - + } else { #endif #ifndef USE_SIMPLE_THREADED_LEVEL3 - + (symm[4 | (side << 1) | uplo ])(&args, NULL, NULL, sa, sb, 0); - + #else - + GEMM_THREAD(mode, &args, NULL, NULL, symm[(side << 1) | uplo ], sa, sb, args.nthreads); - + #endif #ifndef NO_AFFINITY diff --git a/interface/symv.c b/interface/symv.c index e8c24df66..e4e300e20 100644 --- a/interface/symv.c +++ b/interface/symv.c @@ -53,7 +53,7 @@ #ifndef CBLAS -void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ char uplo_arg = *UPLO; @@ -67,7 +67,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, int (*symv[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { SYMV_U, SYMV_L, }; - + #ifdef SMP int (*symv_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { SYMV_THREAD_U, SYMV_THREAD_L, @@ -88,7 +88,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; - + info = 0; if (incy == 0) info = 10; @@ -101,10 +101,10 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } - + #else -void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *a, blasint lda, FLOAT *x, blasint incx, FLOAT beta, FLOAT *y, blasint incy) { FLOAT *buffer; diff --git a/interface/syr.c b/interface/syr.c index 2b2d3d1e2..b29a81ec6 100644 --- a/interface/syr.c +++ b/interface/syr.c @@ -75,7 +75,7 @@ static int (*syr_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG #ifndef CBLAS -void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *a, blasint *LDA){ char uplo_arg = *UPLO; @@ -98,7 +98,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; - + info = 0; if (lda < MAX(1, n)) info = 7; @@ -139,7 +139,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; - + } if (order == CblasRowMajor) { @@ -161,7 +161,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, } #endif - + if (n == 0) return; if (alpha == ZERO) return; diff --git a/interface/syr2.c b/interface/syr2.c index 15dbae4bd..006567c82 100644 --- a/interface/syr2.c +++ b/interface/syr2.c @@ -75,7 +75,7 @@ static int (*syr2_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLON #ifndef CBLAS -void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a, blasint *LDA){ char uplo_arg = *UPLO; @@ -99,7 +99,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; - + info = 0; if (lda < MAX(1, n)) info = 9; @@ -164,7 +164,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, } #endif - + if (n == 0) return; if (alpha == ZERO) return; @@ -188,7 +188,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, #ifdef SMP } else { - + (syr2_thread[uplo])(n, alpha, x, incx, y, incy, a, lda, buffer, nthreads); } diff --git a/interface/syr2k.c b/interface/syr2k.c index 381e088a6..bfa5d8be4 100644 --- a/interface/syr2k.c +++ b/interface/syr2k.c @@ -82,11 +82,11 @@ static int (*syr2k[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BL #ifndef CBLAS void NAME(char *UPLO, char *TRANS, - blasint *N, blasint *K, - FLOAT *alpha, FLOAT *a, blasint *ldA, + blasint *N, blasint *K, + FLOAT *alpha, FLOAT *a, blasint *ldA, FLOAT *b, blasint *ldB, FLOAT *beta, FLOAT *c, blasint *ldC){ - + char uplo_arg = *UPLO; char trans_arg = *TRANS; @@ -103,7 +103,7 @@ void NAME(char *UPLO, char *TRANS, int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -111,7 +111,7 @@ void NAME(char *UPLO, char *TRANS, int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif #endif @@ -138,7 +138,7 @@ void NAME(char *UPLO, char *TRANS, TOUPPER(uplo_arg); TOUPPER(trans_arg); - + uplo = -1; trans = -1; @@ -150,7 +150,7 @@ void NAME(char *UPLO, char *TRANS, if (trans_arg == 'T') trans = 1; if (trans_arg == 'C') trans = 1; #else -#ifdef HEMM +#ifdef HEMM if (trans_arg == 'N') trans = 0; if (trans_arg == 'C') trans = 1; #else @@ -160,7 +160,7 @@ void NAME(char *UPLO, char *TRANS, #endif - + nrowa = args.n; if (trans & 1) nrowa = args.k; @@ -216,7 +216,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -224,7 +224,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif #endif @@ -273,10 +273,10 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr #endif info = -1; - + nrowa = args.n; if (trans & 1) nrowa = args.k; - + if (args.ldc < MAX(1,args.n)) info = 12; if (args.ldb < MAX(1,nrowa)) info = 9; if (args.lda < MAX(1,nrowa)) info = 7; @@ -291,7 +291,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr #ifdef HEMM CAlpha[0] = alpha[0]; CAlpha[1] = -alpha[1]; - + args.alpha = (void *)CAlpha; #endif @@ -310,10 +310,10 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr #endif info = -1; - + nrowa = args.n; if (trans & 1) nrowa = args.k; - + if (args.ldc < MAX(1,args.n)) info = 12; if (args.ldb < MAX(1,nrowa)) info = 9; if (args.lda < MAX(1,nrowa)) info = 7; @@ -331,16 +331,16 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr #endif if (args.n == 0) return; - + IDEBUG_START; FUNCTION_PROFILE_START(); buffer = (FLOAT *)blas_memory_alloc(0); - + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - + #ifdef SMP if (!trans){ mode |= (BLAS_TRANSA_N | BLAS_TRANSB_T); @@ -357,18 +357,18 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr #endif (syr2k[(uplo << 1) | trans ])(&args, NULL, NULL, sa, sb, 0); - + #ifdef SMP } else { syrk_thread(mode, &args, NULL, NULL, syr2k[(uplo << 1) | trans ], sa, sb, args.nthreads); - + } #endif - + blas_memory_free(buffer); - + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, 2 * args.n * args.k + args.n * args.n, 2 * args.n * args.n * args.k); IDEBUG_END; diff --git a/interface/syrk.c b/interface/syrk.c index 072cc86f5..f8c697033 100644 --- a/interface/syrk.c +++ b/interface/syrk.c @@ -88,10 +88,10 @@ static int (*syrk[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLA #ifndef CBLAS void NAME(char *UPLO, char *TRANS, - blasint *N, blasint *K, - FLOAT *alpha, FLOAT *a, blasint *ldA, + blasint *N, blasint *K, + FLOAT *alpha, FLOAT *a, blasint *ldA, FLOAT *beta, FLOAT *c, blasint *ldC){ - + char uplo_arg = *UPLO; char trans_arg = *TRANS; @@ -108,7 +108,7 @@ void NAME(char *UPLO, char *TRANS, int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -116,7 +116,7 @@ void NAME(char *UPLO, char *TRANS, int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif #endif @@ -154,7 +154,7 @@ void NAME(char *UPLO, char *TRANS, if (trans_arg == 'T') trans = 1; if (trans_arg == 'C') trans = 1; #else -#ifdef HEMM +#ifdef HEMM if (trans_arg == 'N') trans = 0; if (trans_arg == 'C') trans = 1; #else @@ -163,7 +163,7 @@ void NAME(char *UPLO, char *TRANS, #endif #endif - + nrowa = args.n; if (trans & 1) nrowa = args.k; @@ -180,7 +180,7 @@ void NAME(char *UPLO, char *TRANS, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } - + #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, @@ -192,9 +192,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr #endif FLOAT *a, blasint lda, #if !defined(COMPLEX) || defined(HEMM) - FLOAT beta, + FLOAT beta, #else - FLOAT *beta, + FLOAT *beta, #endif FLOAT *c, blasint ldc) { @@ -213,7 +213,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -221,7 +221,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif #endif @@ -264,10 +264,10 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr #endif info = -1; - + nrowa = args.n; if (trans & 1) nrowa = args.k; - + if (args.ldc < MAX(1,args.n)) info = 10; if (args.lda < MAX(1,nrowa)) info = 7; if (args.k < 0) info = 4; @@ -292,10 +292,10 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr #endif info = -1; - + nrowa = args.n; if (trans & 1) nrowa = args.k; - + if (args.ldc < MAX(1,args.n)) info = 10; if (args.lda < MAX(1,nrowa)) info = 7; if (args.k < 0) info = 4; @@ -312,25 +312,25 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr #endif if (args.n == 0) return; - + IDEBUG_START; FUNCTION_PROFILE_START(); buffer = (FLOAT *)blas_memory_alloc(0); - + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - + #ifdef SMP if (!trans){ mode |= (BLAS_TRANSA_N | BLAS_TRANSB_T); } else { mode |= (BLAS_TRANSA_T | BLAS_TRANSB_N); } - + mode |= (uplo << BLAS_UPLO_SHIFT); - + args.common = NULL; args.nthreads = num_cpu_avail(3); @@ -344,13 +344,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr } else { #ifndef USE_SIMPLE_THREADED_LEVEL3 - + (syrk[4 | (uplo << 1) | trans ])(&args, NULL, NULL, sa, sb, 0); - + #else - + syrk_thread(mode, &args, NULL, NULL, syrk[(uplo << 1) | trans ], sa, sb, args.nthreads); - + #endif } diff --git a/interface/tbmv.c b/interface/tbmv.c index cec2be465..b5f3ab740 100644 --- a/interface/tbmv.c +++ b/interface/tbmv.c @@ -82,13 +82,13 @@ static int (*tbmv_thread[])(BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLAS #ifndef CBLAS void NAME(char *UPLO, char *TRANS, char *DIAG, - blasint *N, blasint *K, + blasint *N, blasint *K, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ - + char uplo_arg = *UPLO; char trans_arg = *TRANS; char diag_arg = *DIAG; - + blasint n = *N; blasint k = *K; blasint lda = *LDA; @@ -167,7 +167,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 0; if (TransA == CblasConjTrans) trans = 1; - + if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; @@ -211,9 +211,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, } #endif - + if (n == 0) return; - + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/tbsv.c b/interface/tbsv.c index a07c4c584..12a1eb003 100644 --- a/interface/tbsv.c +++ b/interface/tbsv.c @@ -67,13 +67,13 @@ static int (*tbsv[])(BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, v #ifndef CBLAS void NAME(char *UPLO, char *TRANS, char *DIAG, - blasint *N, blasint *K, + blasint *N, blasint *K, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ - + char uplo_arg = *UPLO; char trans_arg = *TRANS; char diag_arg = *DIAG; - + blasint n = *N; blasint k = *K; blasint lda = *LDA; @@ -146,7 +146,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 0; if (TransA == CblasConjTrans) trans = 1; - + if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; @@ -190,9 +190,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, } #endif - + if (n == 0) return; - + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/tpmv.c b/interface/tpmv.c index f0fc4f71c..edf010492 100644 --- a/interface/tpmv.c +++ b/interface/tpmv.c @@ -83,11 +83,11 @@ static int (*tpmv_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, int) void NAME(char *UPLO, char *TRANS, char *DIAG, blasint *N, FLOAT *a, FLOAT *x, blasint *INCX){ - + char uplo_arg = *UPLO; char trans_arg = *TRANS; char diag_arg = *DIAG; - + blasint n = *N; blasint incx = *INCX; @@ -133,7 +133,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } - + #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, @@ -162,7 +162,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 0; if (TransA == CblasConjTrans) trans = 1; - + if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; @@ -202,9 +202,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, } #endif - + if (n == 0) return; - + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/tpsv.c b/interface/tpsv.c index 9dafd0b68..58be77cd3 100644 --- a/interface/tpsv.c +++ b/interface/tpsv.c @@ -68,11 +68,11 @@ static int (*tpsv[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, void *) = { void NAME(char *UPLO, char *TRANS, char *DIAG, blasint *N, FLOAT *a, FLOAT *x, blasint *INCX){ - + char uplo_arg = *UPLO; char trans_arg = *TRANS; char diag_arg = *DIAG; - + blasint n = *N; blasint incx = *INCX; @@ -115,7 +115,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } - + #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, @@ -141,7 +141,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 0; if (TransA == CblasConjTrans) trans = 1; - + if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; @@ -183,7 +183,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif if (n == 0) return; - + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/trmv.c b/interface/trmv.c index ed23cedc6..2e52527a3 100644 --- a/interface/trmv.c +++ b/interface/trmv.c @@ -83,11 +83,11 @@ static int (*trmv_thread[])(BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOA void NAME(char *UPLO, char *TRANS, char *DIAG, blasint *N, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ - + char uplo_arg = *UPLO; char trans_arg = *TRANS; char diag_arg = *DIAG; - + blasint n = *N; blasint lda = *LDA; blasint incx = *INCX; @@ -135,7 +135,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } - + #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, @@ -164,7 +164,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 0; if (TransA == CblasConjTrans) trans = 1; - + if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; @@ -208,7 +208,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif if (n == 0) return; - + IDEBUG_START; FUNCTION_PROFILE_START(); @@ -222,12 +222,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, if (nthreads == 1) { #endif - + (trmv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer); - + #ifdef SMP } else { - + (trmv_thread[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer, nthreads); } diff --git a/interface/trsm.c b/interface/trsm.c index 5836ce2f0..266372988 100644 --- a/interface/trsm.c +++ b/interface/trsm.c @@ -87,18 +87,18 @@ static int (*trsm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLA TRSM_LTUU, TRSM_LTUN, TRSM_LTLU, TRSM_LTLN, TRSM_LRUU, TRSM_LRUN, TRSM_LRLU, TRSM_LRLN, TRSM_LCUU, TRSM_LCUN, TRSM_LCLU, TRSM_LCLN, - TRSM_RNUU, TRSM_RNUN, TRSM_RNLU, TRSM_RNLN, + TRSM_RNUU, TRSM_RNUN, TRSM_RNLU, TRSM_RNLN, TRSM_RTUU, TRSM_RTUN, TRSM_RTLU, TRSM_RTLN, - TRSM_RRUU, TRSM_RRUN, TRSM_RRLU, TRSM_RRLN, + TRSM_RRUU, TRSM_RRUN, TRSM_RRLU, TRSM_RRLN, TRSM_RCUU, TRSM_RCUN, TRSM_RCLU, TRSM_RCLN, #else TRMM_LNUU, TRMM_LNUN, TRMM_LNLU, TRMM_LNLN, TRMM_LTUU, TRMM_LTUN, TRMM_LTLU, TRMM_LTLN, TRMM_LRUU, TRMM_LRUN, TRMM_LRLU, TRMM_LRLN, TRMM_LCUU, TRMM_LCUN, TRMM_LCLU, TRMM_LCLN, - TRMM_RNUU, TRMM_RNUN, TRMM_RNLU, TRMM_RNLN, + TRMM_RNUU, TRMM_RNUN, TRMM_RNLU, TRMM_RNLN, TRMM_RTUU, TRMM_RTUN, TRMM_RTLU, TRMM_RTLN, - TRMM_RRUU, TRMM_RRUN, TRMM_RRLU, TRMM_RRLN, + TRMM_RRUU, TRMM_RRUN, TRMM_RRLU, TRMM_RRLN, TRMM_RCUU, TRMM_RCUN, TRMM_RCLU, TRMM_RCLN, #endif }; @@ -108,7 +108,7 @@ static int (*trsm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLA void NAME(char *SIDE, char *UPLO, char *TRANS, char *DIAG, blasint *M, blasint *N, FLOAT *alpha, FLOAT *a, blasint *ldA, FLOAT *b, blasint *ldB){ - + char side_arg = *SIDE; char uplo_arg = *UPLO; char trans_arg = *TRANS; @@ -127,7 +127,7 @@ void NAME(char *SIDE, char *UPLO, char *TRANS, char *DIAG, int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -135,7 +135,7 @@ void NAME(char *SIDE, char *UPLO, char *TRANS, char *DIAG, int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif #endif @@ -182,7 +182,7 @@ void NAME(char *SIDE, char *UPLO, char *TRANS, char *DIAG, if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; - + nrowa = args.m; if (side & 1) nrowa = args.n; @@ -201,10 +201,10 @@ void NAME(char *SIDE, char *UPLO, char *TRANS, char *DIAG, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } - + #else -void CNAME(enum CBLAS_ORDER order, +void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, enum CBLAS_DIAG Diag, blasint m, blasint n, @@ -231,7 +231,7 @@ void CNAME(enum CBLAS_ORDER order, int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -239,7 +239,7 @@ void CNAME(enum CBLAS_ORDER order, int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif #endif @@ -269,7 +269,7 @@ void CNAME(enum CBLAS_ORDER order, if (Side == CblasLeft) side = 0; if (Side == CblasRight) side = 1; - + if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; @@ -282,15 +282,15 @@ void CNAME(enum CBLAS_ORDER order, if (Trans == CblasConjNoTrans) trans = 2; if (Trans == CblasConjTrans) trans = 3; #endif - + if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; info = -1; - + nrowa = args.m; if (side & 1) nrowa = args.n; - + if (args.ldb < MAX(1,args.m)) info = 11; if (args.lda < MAX(1,nrowa)) info = 9; if (args.n < 0) info = 6; @@ -307,7 +307,7 @@ void CNAME(enum CBLAS_ORDER order, if (Side == CblasLeft) side = 1; if (Side == CblasRight) side = 0; - + if (Uplo == CblasUpper) uplo = 1; if (Uplo == CblasLower) uplo = 0; @@ -320,15 +320,15 @@ void CNAME(enum CBLAS_ORDER order, if (Trans == CblasConjNoTrans) trans = 2; if (Trans == CblasConjTrans) trans = 3; #endif - + if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; info = -1; - + nrowa = args.m; if (side & 1) nrowa = args.n; - + if (args.ldb < MAX(1,args.m)) info = 11; if (args.lda < MAX(1,nrowa)) info = 9; if (args.n < 0) info = 6; @@ -353,10 +353,10 @@ void CNAME(enum CBLAS_ORDER order, FUNCTION_PROFILE_START(); buffer = (FLOAT *)blas_memory_alloc(0); - + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - + #ifdef SMP mode |= (trans << BLAS_TRANSA_SHIFT); mode |= (side << BLAS_RSIDE_SHIFT); @@ -367,7 +367,7 @@ void CNAME(enum CBLAS_ORDER order, #endif (trsm[(side<<4) | (trans<<2) | (uplo<<1) | unit])(&args, NULL, NULL, sa, sb, 0); - + #ifdef SMP } else { if (!side) { @@ -377,10 +377,10 @@ void CNAME(enum CBLAS_ORDER order, } } #endif - + blas_memory_free(buffer); - FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, (!side) ? args.m * (args.m + args.n) : args.n * (args.m + args.n), (!side) ? args.m * args.m * args.n : args.m * args.n * args.n); diff --git a/interface/trsv.c b/interface/trsv.c index 8ef6998db..a054d8eeb 100644 --- a/interface/trsv.c +++ b/interface/trsv.c @@ -68,11 +68,11 @@ static int (*trsv[])(BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { void NAME(char *UPLO, char *TRANS, char *DIAG, blasint *N, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ - + char uplo_arg = *UPLO; char trans_arg = *TRANS; char diag_arg = *DIAG; - + blasint n = *N; blasint lda = *LDA; blasint incx = *INCX; @@ -103,7 +103,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG, if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; - + info = 0; if (incx == 0) info = 8; @@ -143,7 +143,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 0; if (TransA == CblasConjTrans) trans = 1; - + if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; @@ -187,7 +187,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif if (n == 0) return; - + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/zaxpby.c b/interface/zaxpby.c index ff7510749..9e8324432 100644 --- a/interface/zaxpby.c +++ b/interface/zaxpby.c @@ -41,7 +41,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY) { - + blasint n = *N; blasint incx = *INCX; blasint incy = *INCY; @@ -66,9 +66,9 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *BETA, FLOAT * if (incy < 0) y -= (n - 1) * incy * 2; AXPBY_K (n, alpha_r, alpha_i, x, incx, beta_r, beta_i, y, incy); - + FUNCTION_PROFILE_END(4, 2 * n, 2 * n); return; - + } diff --git a/interface/zaxpy.c b/interface/zaxpy.c index 9ed72efb9..daa12bafb 100644 --- a/interface/zaxpy.c +++ b/interface/zaxpy.c @@ -45,7 +45,7 @@ #ifndef CBLAS void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ - + blasint n = *N; blasint incx = *INCX; blasint incy = *INCY; @@ -96,18 +96,18 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in #else AXPYC_K(n, 0, 0, alpha_r, alpha_i, x, incx, y, incy, NULL, 0); #endif - + #ifdef SMP } else { - + #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif - +#endif + blas_level1_thread(mode, n, 0, 0, ALPHA, x, incx, y, incy, NULL, 0, #ifndef CONJ (void *)AXPYU_K, @@ -117,11 +117,11 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in nthreads); } #endif - + FUNCTION_PROFILE_END(4, 2 * n, 2 * n); IDEBUG_END; return; - + } diff --git a/interface/zgbmv.c b/interface/zgbmv.c index ae1fd24bf..a18cede1c 100644 --- a/interface/zgbmv.c +++ b/interface/zgbmv.c @@ -135,7 +135,7 @@ void NAME(char *TRANS, blasint *M, blasint *N, if (n < 0) info = 3; if (m < 0) info = 2; if (i < 0) info = 1; - + trans = i; if (info != 0){ @@ -178,7 +178,7 @@ void CNAME(enum CBLAS_ORDER order, if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 2; if (TransA == CblasConjTrans) trans = 3; - + info = -1; if (incy == 0) info = 13; @@ -234,7 +234,7 @@ void CNAME(enum CBLAS_ORDER order, if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); if (alpha_r == ZERO && alpha_i == ZERO) return; - + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/zgemv.c b/interface/zgemv.c index fb4784202..fcc2fda54 100644 --- a/interface/zgemv.c +++ b/interface/zgemv.c @@ -53,11 +53,11 @@ #ifdef SMP static int (*gemv_thread[])(BLASLONG, BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { #ifdef XDOUBLE - xgemv_thread_n, xgemv_thread_t, xgemv_thread_r, xgemv_thread_c, xgemv_thread_o, xgemv_thread_u, xgemv_thread_s, xgemv_thread_d, + xgemv_thread_n, xgemv_thread_t, xgemv_thread_r, xgemv_thread_c, xgemv_thread_o, xgemv_thread_u, xgemv_thread_s, xgemv_thread_d, #elif defined DOUBLE - zgemv_thread_n, zgemv_thread_t, zgemv_thread_r, zgemv_thread_c, zgemv_thread_o, zgemv_thread_u, zgemv_thread_s, zgemv_thread_d, + zgemv_thread_n, zgemv_thread_t, zgemv_thread_r, zgemv_thread_c, zgemv_thread_o, zgemv_thread_u, zgemv_thread_s, zgemv_thread_d, #else - cgemv_thread_n, cgemv_thread_t, cgemv_thread_r, cgemv_thread_c, cgemv_thread_o, cgemv_thread_u, cgemv_thread_s, cgemv_thread_d, + cgemv_thread_n, cgemv_thread_t, cgemv_thread_r, cgemv_thread_c, cgemv_thread_o, cgemv_thread_u, cgemv_thread_s, cgemv_thread_d, #endif }; #endif @@ -68,7 +68,7 @@ void NAME(char *TRANS, blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ - + char trans = *TRANS; blasint m = *M; blasint n = *N; @@ -86,7 +86,7 @@ void NAME(char *TRANS, blasint *M, blasint *N, GEMV_N, GEMV_T, GEMV_R, GEMV_C, GEMV_O, GEMV_U, GEMV_S, GEMV_D, }; - + blasint info; blasint lenx, leny; blasint i; @@ -169,7 +169,7 @@ void CNAME(enum CBLAS_ORDER order, if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 2; if (TransA == CblasConjTrans) trans = 3; - + info = -1; if (incy == 0) info = 11; @@ -178,7 +178,7 @@ void CNAME(enum CBLAS_ORDER order, if (n < 0) info = 3; if (m < 0) info = 2; if (trans < 0) info = 1; - + } if (order == CblasRowMajor) { @@ -208,7 +208,7 @@ void CNAME(enum CBLAS_ORDER order, } #endif - + /* Quick return if possible. */ if (m == 0 || n == 0) return; @@ -237,13 +237,13 @@ void CNAME(enum CBLAS_ORDER order, if (nthreads == 1) { #endif - + (gemv[(int)trans])(m, n, 0, alpha_r, alpha_i, a, lda, x, incx, y, incy, buffer); #ifdef SMP - + } else { - + (gemv_thread[(int)trans])(m, n, ALPHA, a, lda, x, incx, y, incy, buffer, nthreads); } diff --git a/interface/zger.c b/interface/zger.c index 1865fb9d9..cefc839c2 100644 --- a/interface/zger.c +++ b/interface/zger.c @@ -129,7 +129,7 @@ void NAME(blasint *M, blasint *N, FLOAT *Alpha, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } - + #else void CNAME(enum CBLAS_ORDER order, @@ -195,7 +195,7 @@ void CNAME(enum CBLAS_ORDER order, if (m == 0 || n == 0) return; if ((alpha_r == 0.) && (alpha_i == 0.)) return; - + IDEBUG_START; FUNCTION_PROFILE_START(); @@ -245,5 +245,5 @@ void CNAME(enum CBLAS_ORDER order, IDEBUG_END; return; - + } diff --git a/interface/zhbmv.c b/interface/zhbmv.c index 00ba915db..8a16bbe28 100644 --- a/interface/zhbmv.c +++ b/interface/zhbmv.c @@ -75,7 +75,7 @@ static int (*hbmv_thread[])(BLASLONG, BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLO #ifndef CBLAS -void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *LDA, +void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ char uplo_arg = *UPLO; @@ -105,7 +105,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint * if (uplo_arg == 'L') uplo = 1; if (uplo_arg == 'V') uplo = 2; if (uplo_arg == 'M') uplo = 3; - + info = 0; if (incy == 0) info = 11; @@ -119,7 +119,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint * BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } - + #else void CNAME(enum CBLAS_ORDER order, @@ -150,7 +150,7 @@ void CNAME(enum CBLAS_ORDER order, if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; - + info = -1; if (incy == 0) info = 11; diff --git a/interface/zhemv.c b/interface/zhemv.c index 3cba445c2..c60eedc57 100644 --- a/interface/zhemv.c +++ b/interface/zhemv.c @@ -53,7 +53,7 @@ #ifndef CBLAS -void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ char uplo_arg = *UPLO; @@ -92,7 +92,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, if (uplo_arg == 'L') uplo = 1; if (uplo_arg == 'V') uplo = 2; if (uplo_arg == 'M') uplo = 3; - + info = 0; if (incy == 0) info = 10; @@ -108,7 +108,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, #else -void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA, +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA, FLOAT *a, blasint lda, FLOAT *x, blasint incx, FLOAT *BETA, FLOAT *y, blasint incy) { FLOAT alpha_r = ALPHA[0]; diff --git a/interface/zher.c b/interface/zher.c index ad982dd68..9bedb0131 100644 --- a/interface/zher.c +++ b/interface/zher.c @@ -75,7 +75,7 @@ static int (*her_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG #ifndef CBLAS -void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *a, blasint *LDA){ char uplo_arg = *UPLO; @@ -98,7 +98,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; - + info = 0; if (lda < MAX(1, n)) info = 7; @@ -139,7 +139,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; - + } if (order == CblasRowMajor) { @@ -161,7 +161,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, } #endif - + if (n == 0) return; if (alpha == ZERO) return; @@ -173,7 +173,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, if (incx < 0 ) x -= (n - 1) * incx * 2; buffer = (FLOAT *)blas_memory_alloc(1); - + #ifdef SMP nthreads = num_cpu_avail(2); diff --git a/interface/zher2.c b/interface/zher2.c index 88fececf7..b342457a0 100644 --- a/interface/zher2.c +++ b/interface/zher2.c @@ -75,7 +75,7 @@ static int (*her2_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASL #ifndef CBLAS -void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a, blasint *LDA){ char uplo_arg = *UPLO; @@ -100,7 +100,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; - + info = 0; if (lda < MAX(1, n)) info = 9; @@ -113,7 +113,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } - + #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT *a, blasint lda) { @@ -193,7 +193,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA } else { (her2_thread[uplo])(n, ALPHA, x, incx, y, incy, a, lda, buffer, nthreads); - + } #endif diff --git a/interface/zhpmv.c b/interface/zhpmv.c index d7013e668..bab6e5531 100644 --- a/interface/zhpmv.c +++ b/interface/zhpmv.c @@ -101,7 +101,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; - + info = 0; if (incy == 0) info = 9; @@ -144,7 +144,7 @@ void CNAME(enum CBLAS_ORDER order, if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; - + info = -1; if (incy == 0) info = 9; diff --git a/interface/zhpr.c b/interface/zhpr.c index c48e35238..5159ba7e1 100644 --- a/interface/zhpr.c +++ b/interface/zhpr.c @@ -75,7 +75,7 @@ static int (*hpr_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, FLOAT *, #ifndef CBLAS -void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *a){ char uplo_arg = *UPLO; @@ -97,7 +97,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; - + info = 0; if (incx == 0) info = 5; @@ -133,7 +133,7 @@ void CNAME(enum CBLAS_ORDER order, if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; - + info = -1; if (incx == 0) info = 5; @@ -158,7 +158,7 @@ void CNAME(enum CBLAS_ORDER order, } #endif - + if (n == 0) return; if (alpha == ZERO) return; @@ -170,7 +170,7 @@ void CNAME(enum CBLAS_ORDER order, if (incx < 0 ) x -= (n - 1) * incx * 2; buffer = (FLOAT *)blas_memory_alloc(1); - + #ifdef SMP nthreads = num_cpu_avail(2); diff --git a/interface/zhpr2.c b/interface/zhpr2.c index cf1d5f9fc..1712e5d52 100644 --- a/interface/zhpr2.c +++ b/interface/zhpr2.c @@ -75,7 +75,7 @@ static int (*hpr2_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASL #ifndef CBLAS -void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a){ char uplo_arg = *UPLO; @@ -99,7 +99,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; - + info = 0; if (incy == 0) info = 7; @@ -111,7 +111,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } - + #else void CNAME(enum CBLAS_ORDER order, @@ -139,7 +139,7 @@ void CNAME(enum CBLAS_ORDER order, if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; - + info = -1; if (incy == 0) info = 7; @@ -188,7 +188,7 @@ void CNAME(enum CBLAS_ORDER order, #endif (hpr2[uplo])(n, alpha_r, alpha_i, x, incx, y, incy, a, buffer); - + #ifdef SMP } else { diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c index 90402d3c4..79af6d760 100644 --- a/interface/zimatcopy.c +++ b/interface/zimatcopy.c @@ -53,7 +53,7 @@ void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, { char Order, Trans; - int order=-1,trans=-1; + int order=-1,trans=-1; blasint info = -1; FLOAT *b; size_t msize; diff --git a/interface/zomatcopy.c b/interface/zomatcopy.c index 819e57bab..eec4d3c1c 100644 --- a/interface/zomatcopy.c +++ b/interface/zomatcopy.c @@ -53,7 +53,7 @@ void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, { char Order, Trans; - int order=-1,trans=-1; + int order=-1,trans=-1; blasint info = -1; Order = *ORDER; diff --git a/interface/zrot.c b/interface/zrot.c index f18bbc6d1..1c45f685b 100644 --- a/interface/zrot.c +++ b/interface/zrot.c @@ -43,7 +43,7 @@ #endif void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *C, FLOAT *S){ - + BLASLONG n = *N; BLASLONG incx = *INCX; BLASLONG incy = *INCY; @@ -68,5 +68,5 @@ void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *C IDEBUG_END; return; - + } diff --git a/interface/zsbmv.c b/interface/zsbmv.c index 6d445d7ee..2efe85ba9 100644 --- a/interface/zsbmv.c +++ b/interface/zsbmv.c @@ -73,7 +73,7 @@ static int (*sbmv_thread[])(BLASLONG, BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLO }; #endif -void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *LDA, +void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *LDA, FLOAT *b, blasint *INCX, FLOAT *BETA, FLOAT *c, blasint *INCY){ char uplo_arg = *UPLO; @@ -101,7 +101,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint * if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; - + info = 0; if (incy == 0) info = 11; @@ -115,7 +115,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint * BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } - + if (n == 0) return; if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0); @@ -123,7 +123,7 @@ void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint * if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; IDEBUG_START; - + FUNCTION_PROFILE_START(); if (incx < 0 ) b -= (n - 1) * incx * COMPSIZE; diff --git a/interface/zscal.c b/interface/zscal.c index 1e6fdecdb..507d649bf 100644 --- a/interface/zscal.c +++ b/interface/zscal.c @@ -45,7 +45,7 @@ #ifndef CBLAS void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX){ - + blasint n = *N; blasint incx = *INCX; @@ -104,8 +104,8 @@ void CNAME(blasint n, FLOAT alpha_r, FLOAT *x, blasint incx){ mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif - +#endif + blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0, (void *)SCAL_K, nthreads); } @@ -116,5 +116,5 @@ void CNAME(blasint n, FLOAT alpha_r, FLOAT *x, blasint incx){ IDEBUG_END; return; - + } diff --git a/interface/zspmv.c b/interface/zspmv.c index 65550872d..be11463c0 100644 --- a/interface/zspmv.c +++ b/interface/zspmv.c @@ -99,7 +99,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; - + info = 0; if (incy == 0) info = 9; @@ -111,7 +111,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } - + if (n == 0) return; if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0); diff --git a/interface/zspr.c b/interface/zspr.c index 0021bcda4..574b59aa2 100644 --- a/interface/zspr.c +++ b/interface/zspr.c @@ -73,7 +73,7 @@ static int (*spr_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, FLOAT }; #endif -void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *a){ char uplo_arg = *UPLO; @@ -96,7 +96,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; - + info = 0; if (incx == 0) info = 5; @@ -107,7 +107,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } - + if (n == 0) return; if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; @@ -125,9 +125,9 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, if (nthreads == 1) { #endif - + (spr[uplo])(n, alpha_r, alpha_i, x, incx, a, buffer); - + #ifdef SMP } else { diff --git a/interface/zspr2.c b/interface/zspr2.c index b54e1651a..44c36d553 100644 --- a/interface/zspr2.c +++ b/interface/zspr2.c @@ -73,7 +73,7 @@ static int (*spr2_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASL }; #endif -void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a){ char uplo_arg = *UPLO; @@ -97,7 +97,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; - + info = 0; if (incy == 0) info = 7; @@ -109,7 +109,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } - + if (n == 0) return; if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; @@ -128,12 +128,12 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, if (nthreads == 1) { #endif - + (spr2[uplo])(n, alpha_r, alpha_i, x, incx, y, incy, a, buffer); - + #ifdef SMP } else { - + (spr2_thread[uplo])(n, ALPHA, x, incx, y, incy, a, buffer, nthreads); } diff --git a/interface/zswap.c b/interface/zswap.c index 06a889204..fc62f7363 100644 --- a/interface/zswap.c +++ b/interface/zswap.c @@ -45,7 +45,7 @@ #ifndef CBLAS void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ - + blasint n = *N; blasint incx = *INCX; blasint incy = *INCY; @@ -99,8 +99,8 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif - +#endif + blas_level1_thread(mode, n, 0, 0, dummyalpha, x, incx, y, incy, NULL, 0, (void *)SWAP_K, nthreads); @@ -112,5 +112,5 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ IDEBUG_END; return; - + } diff --git a/interface/zsymv.c b/interface/zsymv.c index afb2c1734..1d6ff1f34 100644 --- a/interface/zsymv.c +++ b/interface/zsymv.c @@ -51,7 +51,7 @@ #define ERROR_NAME "CSYMV " #endif -void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, FLOAT *b, blasint *INCX, FLOAT *BETA, FLOAT *c, blasint *INCY){ char uplo_arg = *UPLO; @@ -88,7 +88,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; - + info = 0; if (incy == 0) info = 10; @@ -101,7 +101,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } - + if (n == 0) return; if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0); @@ -127,12 +127,12 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, #ifdef SMP } else { - + (symv_thread[uplo])(n, ALPHA, a, lda, b, incx, c, incy, buffer, nthreads); - + } #endif - + blas_memory_free(buffer); FUNCTION_PROFILE_END(4, n * n / 2 + 2 * n, 2 * n * n); diff --git a/interface/zsyr.c b/interface/zsyr.c index b6b5202ec..5d62e8797 100644 --- a/interface/zsyr.c +++ b/interface/zsyr.c @@ -76,7 +76,7 @@ static int (*syr_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLO #ifndef CBLAS -void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *a, blasint *LDA){ char uplo_arg = *UPLO; @@ -100,7 +100,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; - + info = 0; if (lda < MAX(1, n)) info = 7; @@ -112,7 +112,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } - + #else @@ -142,7 +142,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; - + } if (order == CblasRowMajor) { diff --git a/interface/zsyr2.c b/interface/zsyr2.c index 0c705cb12..7c81c2093 100644 --- a/interface/zsyr2.c +++ b/interface/zsyr2.c @@ -73,7 +73,7 @@ static int (*syr2_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASL }; #endif -void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a, blasint *LDA){ char uplo_arg = *UPLO; @@ -98,7 +98,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; - + info = 0; if (lda < MAX(1, n)) info = 9; @@ -111,7 +111,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } - + if (n == 0) return; if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; diff --git a/interface/ztbmv.c b/interface/ztbmv.c index 85f53c4be..0b6243125 100644 --- a/interface/ztbmv.c +++ b/interface/ztbmv.c @@ -94,13 +94,13 @@ static int (*tbmv_thread[])(BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLAS #ifndef CBLAS void NAME(char *UPLO, char *TRANS, char *DIAG, - blasint *N, blasint *K, + blasint *N, blasint *K, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ - + char uplo_arg = *UPLO; char trans_arg = *TRANS; char diag_arg = *DIAG; - + blasint n = *N; blasint k = *K; blasint lda = *LDA; @@ -150,7 +150,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } - + #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, @@ -179,7 +179,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 2; if (TransA == CblasConjTrans) trans = 3; - + if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; @@ -225,7 +225,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif if (n == 0) return; - + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/ztbsv.c b/interface/ztbsv.c index 3846a4b3d..8afd2afe7 100644 --- a/interface/ztbsv.c +++ b/interface/ztbsv.c @@ -73,13 +73,13 @@ static int (*tbsv[])(BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, v #ifndef CBLAS void NAME(char *UPLO, char *TRANS, char *DIAG, - blasint *N, blasint *K, + blasint *N, blasint *K, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ - + char uplo_arg = *UPLO; char trans_arg = *TRANS; char diag_arg = *DIAG; - + blasint n = *N; blasint k = *K; blasint lda = *LDA; @@ -126,7 +126,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } - + #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, @@ -152,7 +152,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 2; if (TransA == CblasConjTrans) trans = 3; - + if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; @@ -198,7 +198,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif if (n == 0) return; - + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/ztpmv.c b/interface/ztpmv.c index 2f9c48f5a..f9dfa75fb 100644 --- a/interface/ztpmv.c +++ b/interface/ztpmv.c @@ -95,7 +95,7 @@ static int (*tpmv_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, int) void NAME(char *UPLO, char *TRANS, char *DIAG, blasint *N, FLOAT *a, FLOAT *x, blasint *INCX){ - + char uplo_arg = *UPLO; char trans_arg = *TRANS; char diag_arg = *DIAG; @@ -132,7 +132,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG, if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; - + info = 0; if (incx == 0) info = 7; @@ -145,7 +145,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } - + #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, @@ -174,7 +174,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 2; if (TransA == CblasConjTrans) trans = 3; - + if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; @@ -216,7 +216,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif if (n == 0) return; - + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/ztpsv.c b/interface/ztpsv.c index fde500e37..c63e4d033 100644 --- a/interface/ztpsv.c +++ b/interface/ztpsv.c @@ -74,11 +74,11 @@ static int (*tpsv[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, void *) = { void NAME(char *UPLO, char *TRANS, char *DIAG, blasint *N, FLOAT *a, FLOAT *x, blasint *INCX){ - + char uplo_arg = *UPLO; char trans_arg = *TRANS; char diag_arg = *DIAG; - + blasint n = *N; blasint incx = *INCX; @@ -121,7 +121,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } - + #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, @@ -147,7 +147,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 2; if (TransA == CblasConjTrans) trans = 3; - + if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; @@ -189,7 +189,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif if (n == 0) return; - + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/ztrmv.c b/interface/ztrmv.c index 5a18a85b1..1abaac920 100644 --- a/interface/ztrmv.c +++ b/interface/ztrmv.c @@ -95,7 +95,7 @@ static int (*trmv_thread[])(BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOA void NAME(char *UPLO, char *TRANS, char *DIAG, blasint *N, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ - + char uplo_arg = *UPLO; char trans_arg = *TRANS; char diag_arg = *DIAG; @@ -147,7 +147,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } - + #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, @@ -176,7 +176,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 2; if (TransA == CblasConjTrans) trans = 3; - + if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; @@ -220,7 +220,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif if (n == 0) return; - + IDEBUG_START; FUNCTION_PROFILE_START(); @@ -239,9 +239,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #ifdef SMP } else { - + (trmv_thread[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer, nthreads); - + } #endif diff --git a/interface/ztrsv.c b/interface/ztrsv.c index 08f7dc68c..ceac1727f 100644 --- a/interface/ztrsv.c +++ b/interface/ztrsv.c @@ -74,11 +74,11 @@ static int (*trsv[])(BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { void NAME(char *UPLO, char *TRANS, char *DIAG, blasint *N, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ - + char uplo_arg = *UPLO; char trans_arg = *TRANS; char diag_arg = *DIAG; - + blasint n = *N; blasint lda = *LDA; blasint incx = *INCX; @@ -109,7 +109,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG, if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; - + info = 0; @@ -124,7 +124,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } - + #else @@ -151,7 +151,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 2; if (TransA == CblasConjTrans) trans = 3; - + if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; @@ -195,7 +195,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif if (n == 0) return; - + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/kernel/Makefile b/kernel/Makefile index 55edcd287..a0a8fcd21 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -53,9 +53,9 @@ SBLASOBJS += setparam$(TSUFFIX).$(SUFFIX) CCOMMON_OPT += -DTS=$(TSUFFIX) endif -KERNEL_INTERFACE = ../common_level1.h ../common_level2.h ../common_level3.h +KERNEL_INTERFACE = ../common_level1.h ../common_level2.h ../common_level3.h ifneq ($(NO_LAPACK), 1) -KERNEL_INTERFACE += ../common_lapack.h +KERNEL_INTERFACE += ../common_lapack.h endif ifeq ($(ARCH), x86) @@ -93,7 +93,7 @@ setparam$(TSUFFIX).$(SUFFIX): setparam$(TSUFFIX).c kernel$(TSUFFIX).h setparam$(TSUFFIX).c : setparam-ref.c sed 's/TS/$(TSUFFIX)/g' $< > $(@F) -kernel$(TSUFFIX).h : $(KERNEL_INTERFACE) +kernel$(TSUFFIX).h : $(KERNEL_INTERFACE) sed 's/\ *(/$(TSUFFIX)(/g' $^ > $(@F) diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1 index 1153443c2..bd31503a4 100644 --- a/kernel/Makefile.L1 +++ b/kernel/Makefile.L1 @@ -435,19 +435,19 @@ endif ### AXPBY ### ifndef SAXPBYKERNEL -SAXPBYKERNEL = ../arm/axpby.c +SAXPBYKERNEL = ../arm/axpby.c endif ifndef DAXPBYKERNEL -DAXPBYKERNEL = ../arm/axpby.c +DAXPBYKERNEL = ../arm/axpby.c endif ifndef CAXPBYKERNEL -CAXPBYKERNEL = ../arm/zaxpby.c +CAXPBYKERNEL = ../arm/zaxpby.c endif ifndef ZAXPBYKERNEL -ZAXPBYKERNEL = ../arm/zaxpby.c +ZAXPBYKERNEL = ../arm/zaxpby.c endif SBLASOBJS += \ @@ -494,308 +494,308 @@ XBLASOBJS += \ -$(KDIR)samax_k$(TSUFFIX).$(SUFFIX) $(KDIR)samax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAMAXKERNEL) +$(KDIR)samax_k$(TSUFFIX).$(SUFFIX) $(KDIR)samax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAMAXKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ -$(KDIR)damax_k$(TSUFFIX).$(SUFFIX) $(KDIR)damax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAMAXKERNEL) +$(KDIR)damax_k$(TSUFFIX).$(SUFFIX) $(KDIR)damax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAMAXKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ -$(KDIR)qamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)qamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAMAXKERNEL) +$(KDIR)qamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)qamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAMAXKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ -$(KDIR)camax_k$(TSUFFIX).$(SUFFIX) $(KDIR)camax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAMAXKERNEL) +$(KDIR)camax_k$(TSUFFIX).$(SUFFIX) $(KDIR)camax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAMAXKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ -$(KDIR)zamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)zamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAMAXKERNEL) +$(KDIR)zamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)zamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAMAXKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ -$(KDIR)xamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)xamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAMAXKERNEL) +$(KDIR)xamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)xamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAMAXKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ ### AMIN ### -$(KDIR)samin_k$(TSUFFIX).$(SUFFIX) $(KDIR)samin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAMINKERNEL) +$(KDIR)samin_k$(TSUFFIX).$(SUFFIX) $(KDIR)samin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAMINKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ -$(KDIR)damin_k$(TSUFFIX).$(SUFFIX) $(KDIR)damin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAMINKERNEL) +$(KDIR)damin_k$(TSUFFIX).$(SUFFIX) $(KDIR)damin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAMINKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ -$(KDIR)qamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)qamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAMINKERNEL) +$(KDIR)qamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)qamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAMINKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ -$(KDIR)camin_k$(TSUFFIX).$(SUFFIX) $(KDIR)camin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAMINKERNEL) +$(KDIR)camin_k$(TSUFFIX).$(SUFFIX) $(KDIR)camin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAMINKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ -$(KDIR)zamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)zamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAMINKERNEL) +$(KDIR)zamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)zamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAMINKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ -$(KDIR)xamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)xamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAMINKERNEL) +$(KDIR)xamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)xamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAMINKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ ### MAX ### -$(KDIR)smax_k$(TSUFFIX).$(SUFFIX) $(KDIR)smax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SMAXKERNEL) +$(KDIR)smax_k$(TSUFFIX).$(SUFFIX) $(KDIR)smax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SMAXKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUSE_ABS -UUSE_MIN $< -o $@ -$(KDIR)dmax_k$(TSUFFIX).$(SUFFIX) $(KDIR)dmax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DMAXKERNEL) +$(KDIR)dmax_k$(TSUFFIX).$(SUFFIX) $(KDIR)dmax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DMAXKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUSE_ABS -UUSE_MIN $< -o $@ -$(KDIR)qmax_k$(TSUFFIX).$(SUFFIX) $(KDIR)qmax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QMAXKERNEL) +$(KDIR)qmax_k$(TSUFFIX).$(SUFFIX) $(KDIR)qmax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QMAXKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -UUSE_MIN $< -o $@ ### MIN ### -$(KDIR)smin_k$(TSUFFIX).$(SUFFIX) $(KDIR)smin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SMINKERNEL) +$(KDIR)smin_k$(TSUFFIX).$(SUFFIX) $(KDIR)smin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SMINKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ -$(KDIR)dmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)dmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DMINKERNEL) +$(KDIR)dmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)dmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DMINKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ -$(KDIR)qmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)qmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QMINKERNEL) +$(KDIR)qmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)qmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QMINKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ ### IAMAX ### -$(KDIR)isamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)isamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ISAMAXKERNEL) +$(KDIR)isamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)isamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ISAMAXKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ -$(KDIR)idamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)idamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IDAMAXKERNEL) +$(KDIR)idamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)idamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IDAMAXKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ -$(KDIR)iqamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQAMAXKERNEL) +$(KDIR)iqamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQAMAXKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ -$(KDIR)icamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)icamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ICAMAXKERNEL) +$(KDIR)icamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)icamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ICAMAXKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ -$(KDIR)izamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)izamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IZAMAXKERNEL) +$(KDIR)izamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)izamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IZAMAXKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ -$(KDIR)ixamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)ixamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IXAMAXKERNEL) +$(KDIR)ixamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)ixamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IXAMAXKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ ### IAMIN ### -$(KDIR)isamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)isamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ISAMINKERNEL) +$(KDIR)isamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)isamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ISAMINKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ -$(KDIR)idamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)idamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IDAMINKERNEL) +$(KDIR)idamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)idamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IDAMINKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ -$(KDIR)iqamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQAMINKERNEL) +$(KDIR)iqamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQAMINKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ -$(KDIR)icamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)icamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ICAMINKERNEL) +$(KDIR)icamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)icamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ICAMINKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ -$(KDIR)izamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)izamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IZAMINKERNEL) +$(KDIR)izamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)izamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IZAMINKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ -$(KDIR)ixamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)ixamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IXAMINKERNEL) +$(KDIR)ixamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)ixamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IXAMINKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ ### IMAX ### -$(KDIR)ismax_k$(TSUFFIX).$(SUFFIX) $(KDIR)ismax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ISMAXKERNEL) +$(KDIR)ismax_k$(TSUFFIX).$(SUFFIX) $(KDIR)ismax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ISMAXKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUSE_ABS -UUSE_MIN $< -o $@ -$(KDIR)idmax_k$(TSUFFIX).$(SUFFIX) $(KDIR)idmax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IDMAXKERNEL) +$(KDIR)idmax_k$(TSUFFIX).$(SUFFIX) $(KDIR)idmax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IDMAXKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUSE_ABS -UUSE_MIN $< -o $@ -$(KDIR)iqmax_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQMAXKERNEL) +$(KDIR)iqmax_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQMAXKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -UUSE_MIN $< -o $@ ### IMIN ### -$(KDIR)ismin_k$(TSUFFIX).$(SUFFIX) $(KDIR)ismin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ISMINKERNEL) +$(KDIR)ismin_k$(TSUFFIX).$(SUFFIX) $(KDIR)ismin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ISMINKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ -$(KDIR)idmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)idmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IDMINKERNEL) +$(KDIR)idmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)idmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IDMINKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ -$(KDIR)iqmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQMINKERNEL) +$(KDIR)iqmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQMINKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ -$(KDIR)sasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)sasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SASUMKERNEL) +$(KDIR)sasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)sasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SASUMKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ -$(KDIR)dasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)dasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DASUMKERNEL) +$(KDIR)dasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)dasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DASUMKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ -$(KDIR)qasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)qasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QASUMKERNEL) +$(KDIR)qasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)qasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QASUMKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ -$(KDIR)casum_k$(TSUFFIX).$(SUFFIX) $(KDIR)casum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CASUMKERNEL) +$(KDIR)casum_k$(TSUFFIX).$(SUFFIX) $(KDIR)casum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CASUMKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@ -$(KDIR)zasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZASUMKERNEL) +$(KDIR)zasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZASUMKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@ -$(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XASUMKERNEL) +$(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XASUMKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ -$(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL) +$(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ -$(KDIR)daxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAXPYKERNEL) +$(KDIR)daxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAXPYKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ -$(KDIR)qaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAXPYKERNEL) +$(KDIR)qaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAXPYKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ -$(KDIR)caxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL) +$(KDIR)caxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -UDOUBLE $< -o $@ -$(KDIR)zaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL) +$(KDIR)zaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -DDOUBLE $< -o $@ -$(KDIR)xaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL) +$(KDIR)xaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -DXDOUBLE $< -o $@ -$(KDIR)caxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL) +$(KDIR)caxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -UDOUBLE $< -o $@ -$(KDIR)zaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL) +$(KDIR)zaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -DDOUBLE $< -o $@ -$(KDIR)xaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL) +$(KDIR)xaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -DXDOUBLE $< -o $@ -$(KDIR)scopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)scopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SCOPYKERNEL) +$(KDIR)scopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)scopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SCOPYKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@ -$(KDIR)dcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)dcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DCOPYKERNEL) +$(KDIR)dcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)dcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DCOPYKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@ -$(KDIR)qcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QCOPYKERNEL) +$(KDIR)qcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QCOPYKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@ -$(KDIR)ccopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)ccopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOPYKERNEL) +$(KDIR)ccopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)ccopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOPYKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@ -$(KDIR)zcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOPYKERNEL) +$(KDIR)zcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOPYKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@ -$(KDIR)xcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XCOPYKERNEL) +$(KDIR)xcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XCOPYKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@ -$(KDIR)ddot_k$(TSUFFIX).$(SUFFIX) $(KDIR)ddot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DDOTKERNEL) +$(KDIR)ddot_k$(TSUFFIX).$(SUFFIX) $(KDIR)ddot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DDOTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ -$(KDIR)qdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QDOTKERNEL) +$(KDIR)qdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QDOTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ -$(KDIR)dsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL) +$(KDIR)dsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@ -$(KDIR)sdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL) +$(KDIR)sdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ -$(KDIR)sdsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL) +$(KDIR)sdsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@ -$(KDIR)zdotu_k$(TSUFFIX).$(SUFFIX) $(KDIR)zdotu_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZDOTKERNEL) +$(KDIR)zdotu_k$(TSUFFIX).$(SUFFIX) $(KDIR)zdotu_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZDOTKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UCONJ $< -o $@ -$(KDIR)zdotc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zdotc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZDOTKERNEL) +$(KDIR)zdotc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zdotc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZDOTKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DCONJ $< -o $@ -$(KDIR)xdotu_k$(TSUFFIX).$(SUFFIX) $(KDIR)xdotu_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XDOTKERNEL) +$(KDIR)xdotu_k$(TSUFFIX).$(SUFFIX) $(KDIR)xdotu_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XDOTKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UCONJ $< -o $@ -$(KDIR)xdotc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xdotc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XDOTKERNEL) +$(KDIR)xdotc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xdotc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XDOTKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DCONJ $< -o $@ -$(KDIR)cdotu_k$(TSUFFIX).$(SUFFIX) $(KDIR)cdotu_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CDOTKERNEL) +$(KDIR)cdotu_k$(TSUFFIX).$(SUFFIX) $(KDIR)cdotu_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CDOTKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UCONJ $< -o $@ -$(KDIR)cdotc_k$(TSUFFIX).$(SUFFIX) $(KDIR)cdotc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CDOTKERNEL) +$(KDIR)cdotc_k$(TSUFFIX).$(SUFFIX) $(KDIR)cdotc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CDOTKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DCONJ $< -o $@ -$(KDIR)snrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)snrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SNRM2KERNEL) +$(KDIR)snrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)snrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SNRM2KERNEL) $(CC) $(CFLAGS) -UCOMPLEX -c -UDOUBLE $< -o $@ -$(KDIR)dnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)dnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DNRM2KERNEL) +$(KDIR)dnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)dnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DNRM2KERNEL) $(CC) $(CFLAGS) -UCOMPLEX -c -DDOUBLE $< -o $@ -$(KDIR)qnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)qnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QNRM2KERNEL) +$(KDIR)qnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)qnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QNRM2KERNEL) $(CC) $(CFLAGS) -UCOMPLEX -c -DXDOUBLE $< -o $@ -$(KDIR)cnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)cnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CNRM2KERNEL) +$(KDIR)cnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)cnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CNRM2KERNEL) $(CC) $(CFLAGS) -DCOMPLEX -c -UDOUBLE $< -o $@ -$(KDIR)znrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)znrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZNRM2KERNEL) +$(KDIR)znrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)znrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZNRM2KERNEL) $(CC) $(CFLAGS) -DCOMPLEX -c -DDOUBLE $< -o $@ -$(KDIR)xnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)xnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XNRM2KERNEL) +$(KDIR)xnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)xnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XNRM2KERNEL) $(CC) $(CFLAGS) -DCOMPLEX -c -DXDOUBLE $< -o $@ -$(KDIR)srot_k$(TSUFFIX).$(SUFFIX) $(KDIR)srot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SROTKERNEL) +$(KDIR)srot_k$(TSUFFIX).$(SUFFIX) $(KDIR)srot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SROTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ -$(KDIR)drot_k$(TSUFFIX).$(SUFFIX) $(KDIR)drot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTKERNEL) +$(KDIR)drot_k$(TSUFFIX).$(SUFFIX) $(KDIR)drot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ -$(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL) +$(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ -$(KDIR)csrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)csrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CROTKERNEL) +$(KDIR)csrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)csrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CROTKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UDOUBLE $< -o $@ -$(KDIR)zdrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)zdrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZROTKERNEL) +$(KDIR)zdrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)zdrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZROTKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DDOUBLE $< -o $@ -$(KDIR)xqrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)xqrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XROTKERNEL) +$(KDIR)xqrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)xqrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XROTKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DXDOUBLE $< -o $@ -$(KDIR)sscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)sscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSCALKERNEL) +$(KDIR)sscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)sscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSCALKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ -$(KDIR)dscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)dscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSCALKERNEL) +$(KDIR)dscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)dscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSCALKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ -$(KDIR)qscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)qscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSCALKERNEL) +$(KDIR)qscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)qscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSCALKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ -$(KDIR)cscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)cscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSCALKERNEL) +$(KDIR)cscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)cscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSCALKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@ -$(KDIR)zscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)zscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSCALKERNEL) +$(KDIR)zscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)zscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSCALKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@ -$(KDIR)xscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)xscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSCALKERNEL) +$(KDIR)xscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)xscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSCALKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ -$(KDIR)sswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)sswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSWAPKERNEL) +$(KDIR)sswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)sswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSWAPKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ -$(KDIR)dswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)dswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSWAPKERNEL) +$(KDIR)dswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)dswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSWAPKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ -$(KDIR)qswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)qswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSWAPKERNEL) +$(KDIR)qswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)qswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSWAPKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ -$(KDIR)cswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)cswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSWAPKERNEL) +$(KDIR)cswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)cswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSWAPKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@ -$(KDIR)zswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)zswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSWAPKERNEL) +$(KDIR)zswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)zswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSWAPKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@ -$(KDIR)xswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)xswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSWAPKERNEL) +$(KDIR)xswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)xswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSWAPKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ -$(KDIR)saxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPBYKERNEL) +$(KDIR)saxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPBYKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ -$(KDIR)daxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAXPBYKERNEL) +$(KDIR)daxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAXPBYKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ -$(KDIR)caxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPBYKERNEL) +$(KDIR)caxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPBYKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -UDOUBLE $< -o $@ -$(KDIR)zaxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPBYKERNEL) +$(KDIR)zaxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPBYKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -DDOUBLE $< -o $@ diff --git a/kernel/Makefile.L2 b/kernel/Makefile.L2 index ae4641132..2aeb8f041 100644 --- a/kernel/Makefile.L2 +++ b/kernel/Makefile.L2 @@ -219,210 +219,210 @@ XBLASOBJS += \ xhemv_U$(TSUFFIX).$(SUFFIX) xhemv_L$(TSUFFIX).$(SUFFIX) xhemv_V$(TSUFFIX).$(SUFFIX) xhemv_M$(TSUFFIX).$(SUFFIX) \ xgeru_k$(TSUFFIX).$(SUFFIX) xgerc_k$(TSUFFIX).$(SUFFIX) xgerv_k$(TSUFFIX).$(SUFFIX) xgerd_k$(TSUFFIX).$(SUFFIX) -$(KDIR)sgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)sgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) +$(KDIR)sgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)sgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -UTRANS $< -o $@ -$(KDIR)sgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)sgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) +$(KDIR)sgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)sgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DTRANS $< -o $@ -$(KDIR)dgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) +$(KDIR)dgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -UTRANS $< -o $@ - -$(KDIR)dgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + +$(KDIR)dgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DTRANS $< -o $@ -$(KDIR)qgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVNKERNEL) +$(KDIR)qgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVNKERNEL) $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -UTRANS $< -o $@ - -$(KDIR)qgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVTKERNEL) + +$(KDIR)qgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVTKERNEL) $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DTRANS $< -o $@ -$(KDIR)cgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) +$(KDIR)cgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -UTRANS -UCONJ -UXCONJ $< -o $@ -$(KDIR)cgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) +$(KDIR)cgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANS -UCONJ -UXCONJ $< -o $@ -$(KDIR)cgemv_r$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) +$(KDIR)cgemv_r$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -UTRANS -DCONJ -UXCONJ $< -o $@ -$(KDIR)cgemv_c$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_c$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) +$(KDIR)cgemv_c$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_c$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANS -DCONJ -UXCONJ $< -o $@ -$(KDIR)cgemv_o$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_o$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) +$(KDIR)cgemv_o$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_o$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -UTRANS -UCONJ -DXCONJ $< -o $@ -$(KDIR)cgemv_u$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_u$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) +$(KDIR)cgemv_u$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_u$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANS -UCONJ -DXCONJ $< -o $@ -$(KDIR)cgemv_s$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_s$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) +$(KDIR)cgemv_s$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_s$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -UTRANS -DCONJ -DXCONJ $< -o $@ -$(KDIR)cgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) +$(KDIR)cgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANS -DCONJ -DXCONJ $< -o $@ -$(KDIR)zgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) +$(KDIR)zgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -UTRANS -UCONJ -UXCONJ $< -o $@ -$(KDIR)zgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) +$(KDIR)zgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANS -UCONJ -UXCONJ $< -o $@ -$(KDIR)zgemv_r$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) +$(KDIR)zgemv_r$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -UTRANS -DCONJ -UXCONJ $< -o $@ -$(KDIR)zgemv_c$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_c$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) +$(KDIR)zgemv_c$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_c$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANS -DCONJ -UXCONJ $< -o $@ -$(KDIR)zgemv_o$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_o$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) +$(KDIR)zgemv_o$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_o$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -UTRANS -UCONJ -DXCONJ $< -o $@ -$(KDIR)zgemv_u$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_u$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) +$(KDIR)zgemv_u$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_u$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANS -UCONJ -DXCONJ $< -o $@ -$(KDIR)zgemv_s$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_s$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) +$(KDIR)zgemv_s$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_s$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -UTRANS -DCONJ -DXCONJ $< -o $@ -$(KDIR)zgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) +$(KDIR)zgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANS -DCONJ -DXCONJ $< -o $@ -$(KDIR)xgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL) +$(KDIR)xgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL) $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -UTRANS -UCONJ -UXCONJ $< -o $@ -$(KDIR)xgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL) +$(KDIR)xgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL) $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANS -UCONJ -UXCONJ $< -o $@ -$(KDIR)xgemv_r$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL) +$(KDIR)xgemv_r$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL) $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -UTRANS -DCONJ -UXCONJ $< -o $@ -$(KDIR)xgemv_c$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_c$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL) +$(KDIR)xgemv_c$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_c$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL) $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANS -DCONJ -UXCONJ $< -o $@ -$(KDIR)xgemv_o$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_o$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL) +$(KDIR)xgemv_o$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_o$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL) $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -UTRANS -UCONJ -DXCONJ $< -o $@ -$(KDIR)xgemv_u$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_u$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL) +$(KDIR)xgemv_u$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_u$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL) $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANS -UCONJ -DXCONJ $< -o $@ -$(KDIR)xgemv_s$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_s$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL) +$(KDIR)xgemv_s$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_s$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL) $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -UTRANS -DCONJ -DXCONJ $< -o $@ -$(KDIR)xgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL) +$(KDIR)xgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL) $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANS -DCONJ -DXCONJ $< -o $@ -$(KDIR)ssymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)ssymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSYMV_U_KERNEL) $(SSYMV_U_PARAM) +$(KDIR)ssymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)ssymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSYMV_U_KERNEL) $(SSYMV_U_PARAM) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $@ -$(KDIR)ssymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)ssymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSYMV_L_KERNEL) $(SSYMV_L_PARAM) +$(KDIR)ssymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)ssymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSYMV_L_KERNEL) $(SSYMV_L_PARAM) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $@ -$(KDIR)dsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)dsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSYMV_U_KERNEL) $(DSYMV_U_PARAM) +$(KDIR)dsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)dsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSYMV_U_KERNEL) $(DSYMV_U_PARAM) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $@ -$(KDIR)dsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)dsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSYMV_L_KERNEL) $(DSYMV_L_PARAM) +$(KDIR)dsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)dsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSYMV_L_KERNEL) $(DSYMV_L_PARAM) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $@ -$(KDIR)qsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)qsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSYMV_U_KERNEL) +$(KDIR)qsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)qsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSYMV_U_KERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $@ -$(KDIR)qsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)qsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSYMV_L_KERNEL) +$(KDIR)qsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)qsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSYMV_L_KERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $@ -$(KDIR)csymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)csymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSYMV_U_KERNEL) $(CSYMV_U_PARAM) +$(KDIR)csymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)csymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSYMV_U_KERNEL) $(CSYMV_U_PARAM) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $@ -$(KDIR)csymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)csymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSYMV_L_KERNEL) $(CSYMV_L_PARAM) +$(KDIR)csymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)csymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSYMV_L_KERNEL) $(CSYMV_L_PARAM) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $@ -$(KDIR)zsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)zsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSYMV_U_KERNEL) $(ZSYMV_U_PARAM) +$(KDIR)zsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)zsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSYMV_U_KERNEL) $(ZSYMV_U_PARAM) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $@ -$(KDIR)zsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)zsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSYMV_L_KERNEL) $(ZSYMV_L_PARAM) +$(KDIR)zsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)zsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSYMV_L_KERNEL) $(ZSYMV_L_PARAM) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $@ -$(KDIR)xsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)xsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSYMV_U_KERNEL) +$(KDIR)xsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)xsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSYMV_U_KERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $@ -$(KDIR)xsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)xsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSYMV_L_KERNEL) +$(KDIR)xsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)xsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSYMV_L_KERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $@ -$(KDIR)sger_k$(TSUFFIX).$(SUFFIX) $(KDIR)sger_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGERKERNEL) $(SGERPARAM) +$(KDIR)sger_k$(TSUFFIX).$(SUFFIX) $(KDIR)sger_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGERKERNEL) $(SGERPARAM) $(CC) -c $(CFLAGS) -UDOUBLE $< -o $@ -$(KDIR)dger_k$(TSUFFIX).$(SUFFIX) $(KDIR)dger_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGERKERNEL) $(DGERPARAM) +$(KDIR)dger_k$(TSUFFIX).$(SUFFIX) $(KDIR)dger_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGERKERNEL) $(DGERPARAM) $(CC) -c $(CFLAGS) -DDOUBLE $< -o $@ -$(KDIR)qger_k$(TSUFFIX).$(SUFFIX) $(KDIR)qger_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGERKERNEL) $(QGERPARAM) +$(KDIR)qger_k$(TSUFFIX).$(SUFFIX) $(KDIR)qger_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGERKERNEL) $(QGERPARAM) $(CC) -c $(CFLAGS) -DXDOUBLE $< -o $@ -$(KDIR)cgeru_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgeru_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERUKERNEL) $(CGERPARAM) +$(KDIR)cgeru_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgeru_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERUKERNEL) $(CGERPARAM) $(CC) -c $(CFLAGS) -UDOUBLE -UCONJ $< -o $@ -$(KDIR)cgerc_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgerc_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERCKERNEL) $(CGERPARAM) +$(KDIR)cgerc_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgerc_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERCKERNEL) $(CGERPARAM) $(CC) -c $(CFLAGS) -UDOUBLE -DCONJ $< -o $@ -$(KDIR)cgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgerv_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERUKERNEL) $(CGERPARAM) +$(KDIR)cgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgerv_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERUKERNEL) $(CGERPARAM) $(CC) -c $(CFLAGS) -UDOUBLE -UCONJ -DXCONJ $< -o $@ -$(KDIR)cgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERCKERNEL) $(CGERPARAM) +$(KDIR)cgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERCKERNEL) $(CGERPARAM) $(CC) -c $(CFLAGS) -UDOUBLE -DCONJ -DXCONJ $< -o $@ -$(KDIR)zgeru_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgeru_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERUKERNEL) $(ZGERPARAM) +$(KDIR)zgeru_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgeru_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERUKERNEL) $(ZGERPARAM) $(CC) -c $(CFLAGS) -DDOUBLE -UCONJ $< -o $@ -$(KDIR)zgerc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgerc_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERCKERNEL) $(ZGERPARAM) +$(KDIR)zgerc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgerc_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERCKERNEL) $(ZGERPARAM) $(CC) -c $(CFLAGS) -DDOUBLE -DCONJ $< -o $@ -$(KDIR)zgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgerv_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERUKERNEL) $(ZGERPARAM) +$(KDIR)zgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgerv_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERUKERNEL) $(ZGERPARAM) $(CC) -c $(CFLAGS) -DDOUBLE -UCONJ -DXCONJ $< -o $@ -$(KDIR)zgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERCKERNEL) $(ZGERPARAM) +$(KDIR)zgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERCKERNEL) $(ZGERPARAM) $(CC) -c $(CFLAGS) -DDOUBLE -DCONJ -DXCONJ $< -o $@ -$(KDIR)xgeru_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgeru_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERUKERNEL) $(XGERPARAM) +$(KDIR)xgeru_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgeru_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERUKERNEL) $(XGERPARAM) $(CC) -c $(CFLAGS) -DXDOUBLE -UCONJ $< -o $@ -$(KDIR)xgerc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerc_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERCKERNEL) $(XGERPARAM) +$(KDIR)xgerc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerc_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERCKERNEL) $(XGERPARAM) $(CC) -c $(CFLAGS) -DXDOUBLE -DCONJ $< -o $@ -$(KDIR)xgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerv_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERUKERNEL) $(XGERPARAM) +$(KDIR)xgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerv_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERUKERNEL) $(XGERPARAM) $(CC) -c $(CFLAGS) -DXDOUBLE -UCONJ -DXCONJ $< -o $@ -$(KDIR)xgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERCKERNEL) $(XGERPARAM) +$(KDIR)xgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERCKERNEL) $(XGERPARAM) $(CC) -c $(CFLAGS) -DXDOUBLE -DCONJ -DXCONJ $< -o $@ -$(KDIR)chemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_U_KERNEL) $(CHEMV_U_PARAM) +$(KDIR)chemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_U_KERNEL) $(CHEMV_U_PARAM) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $@ -$(KDIR)chemv_L$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_L_KERNEL) $(CHEMV_L_PARAM) +$(KDIR)chemv_L$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_L_KERNEL) $(CHEMV_L_PARAM) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMV $< -o $@ -$(KDIR)chemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_V$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_V_KERNEL) $(CHEMV_U_PARAM) ../symcopy.h +$(KDIR)chemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_V$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_V_KERNEL) $(CHEMV_U_PARAM) ../symcopy.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV -DHEMVREV $< -o $@ -$(KDIR)chemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_M_KERNEL) $(CHEMV_L_PARAM) ../symcopy.h +$(KDIR)chemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_M_KERNEL) $(CHEMV_L_PARAM) ../symcopy.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@ -$(KDIR)zhemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_U_KERNEL) $(ZHEMV_U_PARAM) +$(KDIR)zhemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_U_KERNEL) $(ZHEMV_U_PARAM) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMV $< -o $@ -$(KDIR)zhemv_L$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_L_KERNEL) $(ZHEMV_L_PARAM) +$(KDIR)zhemv_L$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_L_KERNEL) $(ZHEMV_L_PARAM) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMV $< -o $@ -$(KDIR)zhemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_V$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_V_KERNEL) $(ZHEMV_U_PARAM) ../symcopy.h +$(KDIR)zhemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_V$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_V_KERNEL) $(ZHEMV_U_PARAM) ../symcopy.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMV -DHEMVREV $< -o $@ -$(KDIR)zhemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_M_KERNEL) $(ZHEMV_L_PARAM) ../symcopy.h +$(KDIR)zhemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_M_KERNEL) $(ZHEMV_L_PARAM) ../symcopy.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@ -$(KDIR)xhemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_U_KERNEL) +$(KDIR)xhemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_U_KERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMV $< -o $@ -$(KDIR)xhemv_L$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_L_KERNEL) +$(KDIR)xhemv_L$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_L_KERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV $< -o $@ -$(KDIR)xhemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_V$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_V_KERNEL) ../symcopy.h +$(KDIR)xhemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_V$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_V_KERNEL) ../symcopy.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMV -DHEMVREV $< -o $@ -$(KDIR)xhemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_M_KERNEL) ../symcopy.h +$(KDIR)xhemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_M_KERNEL) ../symcopy.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@ diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 76857a2ba..be78dfc3d 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -22,7 +22,7 @@ ifeq ($(ARCH), arm64) USE_TRMM = 1 endif -ifeq ($(TARGET), LOONGSON3B) +ifeq ($(TARGET), LOONGSON3B) USE_TRMM = 1 endif @@ -333,13 +333,13 @@ CBLASOBJS += \ comatcopy_k_cn$(TSUFFIX).$(SUFFIX) comatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ comatcopy_k_ct$(TSUFFIX).$(SUFFIX) comatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ comatcopy_k_cnc$(TSUFFIX).$(SUFFIX) comatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ - comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) + comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) ZBLASOBJS += \ zomatcopy_k_cn$(TSUFFIX).$(SUFFIX) zomatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ zomatcopy_k_ct$(TSUFFIX).$(SUFFIX) zomatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ zomatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ - zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) + zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) @@ -535,7 +535,7 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ -ifdef USE_TRMM +ifdef USE_TRMM $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ diff --git a/kernel/alpha/KERNEL b/kernel/alpha/KERNEL index a39ccd536..01734bf9c 100644 --- a/kernel/alpha/KERNEL +++ b/kernel/alpha/KERNEL @@ -74,8 +74,8 @@ SGEMMKERNEL = gemm_kernel_4x4.S SGEMM_BETA = gemm_beta.S SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMONCOPYOBJ = sgemm_oncopy.$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy.$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy.$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy.$(SUFFIX) DGEMMKERNEL = gemm_kernel_4x4.S DGEMM_BETA = gemm_beta.S diff --git a/kernel/alpha/cnrm2.S b/kernel/alpha/cnrm2.S index 03343b2ae..bd1ab8782 100644 --- a/kernel/alpha/cnrm2.S +++ b/kernel/alpha/cnrm2.S @@ -75,7 +75,7 @@ .mask 0x4000000,-16 ldah $29, 0($27) !gpdisp!1 lda $29, 0($29) !gpdisp!1 - + lda $sp, -16($sp) ldq $27, sqrt($29) !literal!2 stq $26, 0($sp) @@ -85,7 +85,7 @@ #else PROFCODE #endif - + fclr a0 sll INCX, ZBASE_SHIFT, INCX fclr a1 diff --git a/kernel/alpha/dnrm2.S b/kernel/alpha/dnrm2.S index b8ccc75f6..0dfb64924 100644 --- a/kernel/alpha/dnrm2.S +++ b/kernel/alpha/dnrm2.S @@ -75,7 +75,7 @@ .mask 0x4000000,-16 ldah $29, 0($27) !gpdisp!1 lda $29, 0($29) !gpdisp!1 - + lda $sp, -16($sp) ldq $27, sqrt($29) !literal!2 stq $26, 0($sp) @@ -85,7 +85,7 @@ #else PROFCODE #endif - + fclr a0 SXADDQ INCX, 0, INCX fclr a1 diff --git a/kernel/alpha/gemm_kernel_4x4.S b/kernel/alpha/gemm_kernel_4x4.S index 4e9253488..c55d817df 100644 --- a/kernel/alpha/gemm_kernel_4x4.S +++ b/kernel/alpha/gemm_kernel_4x4.S @@ -167,7 +167,7 @@ sra N, 2, J ble J, $L40 .align 4 - + $L01: mov C, C1 addq C, LDC, C2 @@ -291,7 +291,7 @@ $L11: fclr c09 lda AO, 4 * SIZE(AO) fclr c10 -#endif +#endif lds $f31, 7 * SIZE(C4) fclr c14 @@ -1456,7 +1456,7 @@ $L40: fclr t1 addq C2, LDC, C fclr t2 - + #if defined(TRMMKERNEL) && defined(LEFT) mov OFFSET, KK #endif diff --git a/kernel/alpha/gemv_n.S b/kernel/alpha/gemv_n.S index 665b217a3..3e9d1d7fb 100644 --- a/kernel/alpha/gemv_n.S +++ b/kernel/alpha/gemv_n.S @@ -621,7 +621,7 @@ $L16: LD a1, 1 * SIZE(A1) LD a2, 0 * SIZE(A2) LD a3, 1 * SIZE(A2) - + LD y0, 0 * SIZE(Y1) LD y1, 1 * SIZE(Y1) @@ -854,7 +854,7 @@ $L22: lds $f31, (PREFETCHSIZE + 0) * SIZE(Y1) lda A1, 8 * SIZE(A1) - lda Y1, 8 * SIZE(Y1) + lda Y1, 8 * SIZE(Y1) bgt I, $L22 .align 4 @@ -954,7 +954,7 @@ $L26: LD a1, 1 * SIZE(A1) LD a2, 0 * SIZE(A2) LD a3, 1 * SIZE(A2) - + LD y0, 0 * SIZE(Y1) LD y1, 1 * SIZE(Y1) @@ -1173,7 +1173,7 @@ $L36: LD a0, 0 * SIZE(A1) LD a1, 1 * SIZE(A1) - + LD y0, 0 * SIZE(Y1) MUL alpha1, a0, a0 LD y1, 1 * SIZE(Y1) diff --git a/kernel/alpha/iamax.S b/kernel/alpha/iamax.S index cb8763290..2be5d5d08 100644 --- a/kernel/alpha/iamax.S +++ b/kernel/alpha/iamax.S @@ -313,7 +313,7 @@ $L22: LD $f10, 0 * SIZE(XX) fabs $f14, $f22 addq XX, INCX, XX - cmpteq $f0, $f18, $f2 + cmpteq $f0, $f18, $f2 LD $f11, 0 * SIZE(XX) fabs $f15, $f23 @@ -376,7 +376,7 @@ $L22: $L23: fabs $f14, $f22 - cmpteq $f0, $f18, $f2 + cmpteq $f0, $f18, $f2 fabs $f15, $f23 cmpteq $f0, $f19, $f3 diff --git a/kernel/alpha/imax.S b/kernel/alpha/imax.S index b0cf5c8ab..d8958c86a 100644 --- a/kernel/alpha/imax.S +++ b/kernel/alpha/imax.S @@ -44,7 +44,7 @@ #define X $17 #define INCX $18 #define XX $19 - + #ifndef USE_MIN #define CMPLT(a, b) cmptlt a, b #else diff --git a/kernel/alpha/izamax.S b/kernel/alpha/izamax.S index 2269b12cc..c932581ae 100644 --- a/kernel/alpha/izamax.S +++ b/kernel/alpha/izamax.S @@ -235,7 +235,7 @@ $L13: fcmovne $f6, $f18, $f2 fcmovne $f7, $f19, $f3 .align 4 - + $L14: addt $f8, $f9, $f16 addt $f10, $f11, $f17 diff --git a/kernel/alpha/snrm2.S b/kernel/alpha/snrm2.S index b8ccc75f6..0dfb64924 100644 --- a/kernel/alpha/snrm2.S +++ b/kernel/alpha/snrm2.S @@ -75,7 +75,7 @@ .mask 0x4000000,-16 ldah $29, 0($27) !gpdisp!1 lda $29, 0($29) !gpdisp!1 - + lda $sp, -16($sp) ldq $27, sqrt($29) !literal!2 stq $26, 0($sp) @@ -85,7 +85,7 @@ #else PROFCODE #endif - + fclr a0 SXADDQ INCX, 0, INCX fclr a1 diff --git a/kernel/alpha/trsm_kernel_4x4_LN.S b/kernel/alpha/trsm_kernel_4x4_LN.S index a1760c6f6..600b4e255 100644 --- a/kernel/alpha/trsm_kernel_4x4_LN.S +++ b/kernel/alpha/trsm_kernel_4x4_LN.S @@ -178,7 +178,7 @@ sra N, 2, J ble J, $L40 .align 4 - + $L01: #ifdef RT sll K, 2 + BASE_SHIFT, TMP1 @@ -382,7 +382,7 @@ $L38: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c09, c09 @@ -392,7 +392,7 @@ $L38: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c09, c09 @@ -413,7 +413,7 @@ $L38: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + MUL a1, c01, c01 MUL a2, c01, t1 SUB c05, t1, c05 @@ -425,7 +425,7 @@ $L38: LD b1, 5 * SIZE(BO) LD b2, 6 * SIZE(BO) LD b3, 7 * SIZE(BO) - + MUL b1, c05, c05 MUL b2, c05, t1 SUB c09, t1, c09 @@ -435,7 +435,7 @@ $L38: LD a1, 10 * SIZE(BO) LD a2, 11 * SIZE(BO) LD a3, 15 * SIZE(BO) - + MUL a1, c09, c09 MUL a2, c09, t1 SUB c13, t1, c13 @@ -447,7 +447,7 @@ $L38: LD a2, 14 * SIZE(BO) LD a3, 13 * SIZE(BO) LD a4, 12 * SIZE(BO) - + MUL a1, c13, c13 MUL a2, c13, t1 SUB c09, t1, c09 @@ -459,7 +459,7 @@ $L38: LD b1, 10 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 8 * SIZE(BO) - + MUL b1, c09, c09 MUL b2, c09, t1 SUB c05, t1, c05 @@ -775,7 +775,7 @@ $L28: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + LD b1, 4 * SIZE(BO) LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) @@ -796,7 +796,7 @@ $L28: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + LD b1, 4 * SIZE(AO) LD b2, 5 * SIZE(AO) LD b3, 6 * SIZE(AO) @@ -843,7 +843,7 @@ $L28: LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 3 * SIZE(AO) - + MUL a1, c01, c01 MUL a1, c05, c05 MUL a1, c09, c09 @@ -870,7 +870,7 @@ $L28: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + MUL a1, c01, c01 MUL a1, c02, c02 @@ -895,7 +895,7 @@ $L28: LD b1, 5 * SIZE(BO) LD b2, 6 * SIZE(BO) LD b3, 7 * SIZE(BO) - + MUL b1, c05, c05 MUL b1, c06, c06 @@ -914,7 +914,7 @@ $L28: LD a1, 10 * SIZE(BO) LD a2, 11 * SIZE(BO) LD a3, 15 * SIZE(BO) - + MUL a1, c09, c09 MUL a1, c10, c10 @@ -933,7 +933,7 @@ $L28: LD a2, 14 * SIZE(BO) LD a3, 13 * SIZE(BO) LD a4, 12 * SIZE(BO) - + MUL a1, c13, c13 MUL a1, c14, c14 @@ -958,7 +958,7 @@ $L28: LD b1, 10 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 8 * SIZE(BO) - + MUL b1, c09, c09 MUL b1, c10, c10 @@ -1163,7 +1163,7 @@ $L11: fclr c14 fclr c07 ble TMP1, $L18 -#endif +#endif ble L, $L15 .align 5 @@ -1490,7 +1490,7 @@ $L18: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + LD b1, 4 * SIZE(BO) LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) @@ -1530,7 +1530,7 @@ $L18: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + LD b1, 4 * SIZE(AO) LD b2, 5 * SIZE(AO) LD b3, 6 * SIZE(AO) @@ -1572,7 +1572,7 @@ $L18: LD a2, 14 * SIZE(AO) LD a3, 13 * SIZE(AO) LD a4, 12 * SIZE(AO) - + MUL a1, c04, c04 MUL a1, c08, c08 MUL a1, c12, c12 @@ -1611,7 +1611,7 @@ $L18: LD b1, 10 * SIZE(AO) LD b2, 9 * SIZE(AO) LD b3, 8 * SIZE(AO) - + MUL b1, c03, c03 MUL b1, c07, c07 MUL b1, c11, c11 @@ -1667,7 +1667,7 @@ $L18: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + MUL a1, c01, c01 MUL a1, c05, c05 MUL a1, c09, c09 @@ -1706,7 +1706,7 @@ $L18: LD b1, 5 * SIZE(AO) LD b2, 6 * SIZE(AO) LD b3, 7 * SIZE(AO) - + MUL b1, c02, c02 MUL b1, c06, c06 MUL b1, c10, c10 @@ -1735,7 +1735,7 @@ $L18: LD a1, 10 * SIZE(AO) LD a2, 11 * SIZE(AO) LD a3, 15 * SIZE(AO) - + MUL a1, c03, c03 MUL a1, c07, c07 MUL a1, c11, c11 @@ -1762,7 +1762,7 @@ $L18: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c03, c03 @@ -1801,7 +1801,7 @@ $L18: LD b1, 5 * SIZE(BO) LD b2, 6 * SIZE(BO) LD b3, 7 * SIZE(BO) - + MUL b1, c05, c05 MUL b1, c06, c06 MUL b1, c07, c07 @@ -1830,7 +1830,7 @@ $L18: LD a1, 10 * SIZE(BO) LD a2, 11 * SIZE(BO) LD a3, 15 * SIZE(BO) - + MUL a1, c09, c09 MUL a1, c10, c10 MUL a1, c11, c11 @@ -1857,7 +1857,7 @@ $L18: LD a2, 14 * SIZE(BO) LD a3, 13 * SIZE(BO) LD a4, 12 * SIZE(BO) - + MUL a1, c13, c13 MUL a1, c14, c14 MUL a1, c15, c15 @@ -1896,7 +1896,7 @@ $L18: LD b1, 10 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 8 * SIZE(BO) - + MUL b1, c09, c09 MUL b1, c10, c10 MUL b1, c11, c11 @@ -2093,7 +2093,7 @@ $L40: addq C2, LDC, C #endif fclr t2 - + #ifdef LN addq M, OFFSET, KK #endif @@ -2257,13 +2257,13 @@ $L78: #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) - + SUB a1, c01, c01 SUB a2, c05, c05 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) - + SUB a1, c01, c01 SUB a2, c05, c05 #endif @@ -2279,7 +2279,7 @@ $L78: LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 3 * SIZE(BO) - + MUL a1, c01, c01 MUL a2, c01, t1 SUB c05, t1, c05 @@ -2520,7 +2520,7 @@ $L68: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c02, c02 @@ -2530,7 +2530,7 @@ $L68: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c05, c05 @@ -2559,7 +2559,7 @@ $L68: LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 3 * SIZE(AO) - + MUL a1, c01, c01 MUL a1, c05, c05 @@ -2577,7 +2577,7 @@ $L68: LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 3 * SIZE(BO) - + MUL a1, c01, c01 MUL a1, c02, c02 @@ -2914,7 +2914,7 @@ $L58: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + LD b1, 4 * SIZE(BO) LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) @@ -2934,7 +2934,7 @@ $L58: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + LD b1, 4 * SIZE(AO) LD b2, 5 * SIZE(AO) LD b3, 6 * SIZE(AO) @@ -2956,7 +2956,7 @@ $L58: LD a2, 14 * SIZE(AO) LD a3, 13 * SIZE(AO) LD a4, 12 * SIZE(AO) - + MUL a1, c04, c04 MUL a1, c08, c08 @@ -2981,7 +2981,7 @@ $L58: LD b1, 10 * SIZE(AO) LD b2, 9 * SIZE(AO) LD b3, 8 * SIZE(AO) - + MUL b1, c03, c03 MUL b1, c07, c07 @@ -3019,7 +3019,7 @@ $L58: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + MUL a1, c01, c01 MUL a1, c05, c05 @@ -3044,7 +3044,7 @@ $L58: LD b1, 5 * SIZE(AO) LD b2, 6 * SIZE(AO) LD b3, 7 * SIZE(AO) - + MUL b1, c02, c02 MUL b1, c06, c06 @@ -3063,7 +3063,7 @@ $L58: LD a1, 10 * SIZE(AO) LD a2, 11 * SIZE(AO) LD a3, 15 * SIZE(AO) - + MUL a1, c03, c03 MUL a1, c07, c07 @@ -3081,7 +3081,7 @@ $L58: LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 3 * SIZE(BO) - + MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c03, c03 @@ -3382,11 +3382,11 @@ $L118: #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) - + SUB a1, c01, c01 #else LD a1, 0 * SIZE(AO) - + SUB a1, c01, c01 #endif @@ -3398,7 +3398,7 @@ $L118: #if defined(RN) || defined(RT) LD a1, 0 * SIZE(BO) - + MUL a1, c01, c01 #endif @@ -3593,13 +3593,13 @@ $L108: #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) - + SUB a1, c01, c01 SUB a2, c02, c02 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) - + SUB a1, c01, c01 SUB a2, c02, c02 #endif @@ -3619,7 +3619,7 @@ $L108: LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 3 * SIZE(AO) - + MUL a1, c01, c01 MUL a2, c01, t1 SUB c02, t1, c02 @@ -3628,7 +3628,7 @@ $L108: #if defined(RN) || defined(RT) LD a1, 0 * SIZE(BO) - + MUL a1, c01, c01 MUL a1, c02, c02 #endif @@ -3886,7 +3886,7 @@ $L98: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 @@ -3896,7 +3896,7 @@ $L98: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 @@ -3908,7 +3908,7 @@ $L98: LD a2, 14 * SIZE(AO) LD a3, 13 * SIZE(AO) LD a4, 12 * SIZE(AO) - + MUL a1, c04, c04 MUL a2, c04, t1 SUB c03, t1, c03 @@ -3920,7 +3920,7 @@ $L98: LD b1, 10 * SIZE(AO) LD b2, 9 * SIZE(AO) LD b3, 8 * SIZE(AO) - + MUL b1, c03, c03 MUL b2, c03, t1 SUB c02, t1, c02 @@ -3942,7 +3942,7 @@ $L98: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + MUL a1, c01, c01 MUL a2, c01, t1 SUB c02, t1, c02 @@ -3954,7 +3954,7 @@ $L98: LD b1, 5 * SIZE(AO) LD b2, 6 * SIZE(AO) LD b3, 7 * SIZE(AO) - + MUL b1, c02, c02 MUL b2, c02, t1 SUB c03, t1, c03 @@ -3964,7 +3964,7 @@ $L98: LD a1, 10 * SIZE(AO) LD a2, 11 * SIZE(AO) LD a3, 15 * SIZE(AO) - + MUL a1, c03, c03 MUL a2, c03, t1 SUB c04, t1, c04 @@ -3973,7 +3973,7 @@ $L98: #if defined(RN) || defined(RT) LD a1, 0 * SIZE(BO) - + MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c03, c03 diff --git a/kernel/alpha/trsm_kernel_4x4_LT.S b/kernel/alpha/trsm_kernel_4x4_LT.S index 2848d2665..81436d034 100644 --- a/kernel/alpha/trsm_kernel_4x4_LT.S +++ b/kernel/alpha/trsm_kernel_4x4_LT.S @@ -178,7 +178,7 @@ sra N, 2, J ble J, $L40 .align 4 - + $L01: #ifdef RT sll K, 2 + BASE_SHIFT, TMP1 @@ -313,7 +313,7 @@ $L11: fclr c14 fclr c07 ble TMP1, $L18 -#endif +#endif ble L, $L15 .align 5 @@ -640,7 +640,7 @@ $L18: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + LD b1, 4 * SIZE(BO) LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) @@ -680,7 +680,7 @@ $L18: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + LD b1, 4 * SIZE(AO) LD b2, 5 * SIZE(AO) LD b3, 6 * SIZE(AO) @@ -722,7 +722,7 @@ $L18: LD a2, 14 * SIZE(AO) LD a3, 13 * SIZE(AO) LD a4, 12 * SIZE(AO) - + MUL a1, c04, c04 MUL a1, c08, c08 MUL a1, c12, c12 @@ -761,7 +761,7 @@ $L18: LD b1, 10 * SIZE(AO) LD b2, 9 * SIZE(AO) LD b3, 8 * SIZE(AO) - + MUL b1, c03, c03 MUL b1, c07, c07 MUL b1, c11, c11 @@ -817,7 +817,7 @@ $L18: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + MUL a1, c01, c01 MUL a1, c05, c05 MUL a1, c09, c09 @@ -856,7 +856,7 @@ $L18: LD b1, 5 * SIZE(AO) LD b2, 6 * SIZE(AO) LD b3, 7 * SIZE(AO) - + MUL b1, c02, c02 MUL b1, c06, c06 MUL b1, c10, c10 @@ -885,7 +885,7 @@ $L18: LD a1, 10 * SIZE(AO) LD a2, 11 * SIZE(AO) LD a3, 15 * SIZE(AO) - + MUL a1, c03, c03 MUL a1, c07, c07 MUL a1, c11, c11 @@ -912,7 +912,7 @@ $L18: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c03, c03 @@ -951,7 +951,7 @@ $L18: LD b1, 5 * SIZE(BO) LD b2, 6 * SIZE(BO) LD b3, 7 * SIZE(BO) - + MUL b1, c05, c05 MUL b1, c06, c06 MUL b1, c07, c07 @@ -980,7 +980,7 @@ $L18: LD a1, 10 * SIZE(BO) LD a2, 11 * SIZE(BO) LD a3, 15 * SIZE(BO) - + MUL a1, c09, c09 MUL a1, c10, c10 MUL a1, c11, c11 @@ -1007,7 +1007,7 @@ $L18: LD a2, 14 * SIZE(BO) LD a3, 13 * SIZE(BO) LD a4, 12 * SIZE(BO) - + MUL a1, c13, c13 MUL a1, c14, c14 MUL a1, c15, c15 @@ -1046,7 +1046,7 @@ $L18: LD b1, 10 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 8 * SIZE(BO) - + MUL b1, c09, c09 MUL b1, c10, c10 MUL b1, c11, c11 @@ -1456,7 +1456,7 @@ $L28: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + LD b1, 4 * SIZE(BO) LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) @@ -1477,7 +1477,7 @@ $L28: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + LD b1, 4 * SIZE(AO) LD b2, 5 * SIZE(AO) LD b3, 6 * SIZE(AO) @@ -1524,7 +1524,7 @@ $L28: LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 3 * SIZE(AO) - + MUL a1, c01, c01 MUL a1, c05, c05 MUL a1, c09, c09 @@ -1551,7 +1551,7 @@ $L28: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + MUL a1, c01, c01 MUL a1, c02, c02 @@ -1576,7 +1576,7 @@ $L28: LD b1, 5 * SIZE(BO) LD b2, 6 * SIZE(BO) LD b3, 7 * SIZE(BO) - + MUL b1, c05, c05 MUL b1, c06, c06 @@ -1595,7 +1595,7 @@ $L28: LD a1, 10 * SIZE(BO) LD a2, 11 * SIZE(BO) LD a3, 15 * SIZE(BO) - + MUL a1, c09, c09 MUL a1, c10, c10 @@ -1614,7 +1614,7 @@ $L28: LD a2, 14 * SIZE(BO) LD a3, 13 * SIZE(BO) LD a4, 12 * SIZE(BO) - + MUL a1, c13, c13 MUL a1, c14, c14 @@ -1639,7 +1639,7 @@ $L28: LD b1, 10 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 8 * SIZE(BO) - + MUL b1, c09, c09 MUL b1, c10, c10 @@ -1912,7 +1912,7 @@ $L38: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c09, c09 @@ -1922,7 +1922,7 @@ $L38: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c09, c09 @@ -1943,7 +1943,7 @@ $L38: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + MUL a1, c01, c01 MUL a2, c01, t1 SUB c05, t1, c05 @@ -1955,7 +1955,7 @@ $L38: LD b1, 5 * SIZE(BO) LD b2, 6 * SIZE(BO) LD b3, 7 * SIZE(BO) - + MUL b1, c05, c05 MUL b2, c05, t1 SUB c09, t1, c09 @@ -1965,7 +1965,7 @@ $L38: LD a1, 10 * SIZE(BO) LD a2, 11 * SIZE(BO) LD a3, 15 * SIZE(BO) - + MUL a1, c09, c09 MUL a2, c09, t1 SUB c13, t1, c13 @@ -1977,7 +1977,7 @@ $L38: LD a2, 14 * SIZE(BO) LD a3, 13 * SIZE(BO) LD a4, 12 * SIZE(BO) - + MUL a1, c13, c13 MUL a2, c13, t1 SUB c09, t1, c09 @@ -1989,7 +1989,7 @@ $L38: LD b1, 10 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 8 * SIZE(BO) - + MUL b1, c09, c09 MUL b2, c09, t1 SUB c05, t1, c05 @@ -2092,7 +2092,7 @@ $L40: addq C2, LDC, C #endif fclr t2 - + #ifdef LN addq M, OFFSET, KK #endif @@ -2359,7 +2359,7 @@ $L58: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + LD b1, 4 * SIZE(BO) LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) @@ -2379,7 +2379,7 @@ $L58: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + LD b1, 4 * SIZE(AO) LD b2, 5 * SIZE(AO) LD b3, 6 * SIZE(AO) @@ -2401,7 +2401,7 @@ $L58: LD a2, 14 * SIZE(AO) LD a3, 13 * SIZE(AO) LD a4, 12 * SIZE(AO) - + MUL a1, c04, c04 MUL a1, c08, c08 @@ -2426,7 +2426,7 @@ $L58: LD b1, 10 * SIZE(AO) LD b2, 9 * SIZE(AO) LD b3, 8 * SIZE(AO) - + MUL b1, c03, c03 MUL b1, c07, c07 @@ -2464,7 +2464,7 @@ $L58: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + MUL a1, c01, c01 MUL a1, c05, c05 @@ -2489,7 +2489,7 @@ $L58: LD b1, 5 * SIZE(AO) LD b2, 6 * SIZE(AO) LD b3, 7 * SIZE(AO) - + MUL b1, c02, c02 MUL b1, c06, c06 @@ -2508,7 +2508,7 @@ $L58: LD a1, 10 * SIZE(AO) LD a2, 11 * SIZE(AO) LD a3, 15 * SIZE(AO) - + MUL a1, c03, c03 MUL a1, c07, c07 @@ -2526,7 +2526,7 @@ $L58: LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 3 * SIZE(BO) - + MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c03, c03 @@ -2827,7 +2827,7 @@ $L68: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c02, c02 @@ -2837,7 +2837,7 @@ $L68: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c05, c05 @@ -2866,7 +2866,7 @@ $L68: LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 3 * SIZE(AO) - + MUL a1, c01, c01 MUL a1, c05, c05 @@ -2884,7 +2884,7 @@ $L68: LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 3 * SIZE(BO) - + MUL a1, c01, c01 MUL a1, c02, c02 @@ -3117,13 +3117,13 @@ $L78: #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) - + SUB a1, c01, c01 SUB a2, c05, c05 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) - + SUB a1, c01, c01 SUB a2, c05, c05 #endif @@ -3139,7 +3139,7 @@ $L78: LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 3 * SIZE(BO) - + MUL a1, c01, c01 MUL a2, c01, t1 SUB c05, t1, c05 @@ -3455,7 +3455,7 @@ $L98: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 @@ -3465,7 +3465,7 @@ $L98: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 @@ -3477,7 +3477,7 @@ $L98: LD a2, 14 * SIZE(AO) LD a3, 13 * SIZE(AO) LD a4, 12 * SIZE(AO) - + MUL a1, c04, c04 MUL a2, c04, t1 SUB c03, t1, c03 @@ -3489,7 +3489,7 @@ $L98: LD b1, 10 * SIZE(AO) LD b2, 9 * SIZE(AO) LD b3, 8 * SIZE(AO) - + MUL b1, c03, c03 MUL b2, c03, t1 SUB c02, t1, c02 @@ -3511,7 +3511,7 @@ $L98: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + MUL a1, c01, c01 MUL a2, c01, t1 SUB c02, t1, c02 @@ -3523,7 +3523,7 @@ $L98: LD b1, 5 * SIZE(AO) LD b2, 6 * SIZE(AO) LD b3, 7 * SIZE(AO) - + MUL b1, c02, c02 MUL b2, c02, t1 SUB c03, t1, c03 @@ -3533,7 +3533,7 @@ $L98: LD a1, 10 * SIZE(AO) LD a2, 11 * SIZE(AO) LD a3, 15 * SIZE(AO) - + MUL a1, c03, c03 MUL a2, c03, t1 SUB c04, t1, c04 @@ -3542,7 +3542,7 @@ $L98: #if defined(RN) || defined(RT) LD a1, 0 * SIZE(BO) - + MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c03, c03 @@ -3759,13 +3759,13 @@ $L108: #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) - + SUB a1, c01, c01 SUB a2, c02, c02 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) - + SUB a1, c01, c01 SUB a2, c02, c02 #endif @@ -3785,7 +3785,7 @@ $L108: LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 3 * SIZE(AO) - + MUL a1, c01, c01 MUL a2, c01, t1 SUB c02, t1, c02 @@ -3794,7 +3794,7 @@ $L108: #if defined(RN) || defined(RT) LD a1, 0 * SIZE(BO) - + MUL a1, c01, c01 MUL a1, c02, c02 #endif @@ -3977,11 +3977,11 @@ $L118: #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) - + SUB a1, c01, c01 #else LD a1, 0 * SIZE(AO) - + SUB a1, c01, c01 #endif @@ -3993,7 +3993,7 @@ $L118: #if defined(RN) || defined(RT) LD a1, 0 * SIZE(BO) - + MUL a1, c01, c01 #endif diff --git a/kernel/alpha/trsm_kernel_4x4_RT.S b/kernel/alpha/trsm_kernel_4x4_RT.S index 6d3d2e39a..71d6c43fa 100644 --- a/kernel/alpha/trsm_kernel_4x4_RT.S +++ b/kernel/alpha/trsm_kernel_4x4_RT.S @@ -410,7 +410,7 @@ $L98: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 @@ -420,7 +420,7 @@ $L98: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 @@ -432,7 +432,7 @@ $L98: LD a2, 14 * SIZE(AO) LD a3, 13 * SIZE(AO) LD a4, 12 * SIZE(AO) - + MUL a1, c04, c04 MUL a2, c04, t1 SUB c03, t1, c03 @@ -444,7 +444,7 @@ $L98: LD b1, 10 * SIZE(AO) LD b2, 9 * SIZE(AO) LD b3, 8 * SIZE(AO) - + MUL b1, c03, c03 MUL b2, c03, t1 SUB c02, t1, c02 @@ -466,7 +466,7 @@ $L98: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + MUL a1, c01, c01 MUL a2, c01, t1 SUB c02, t1, c02 @@ -478,7 +478,7 @@ $L98: LD b1, 5 * SIZE(AO) LD b2, 6 * SIZE(AO) LD b3, 7 * SIZE(AO) - + MUL b1, c02, c02 MUL b2, c02, t1 SUB c03, t1, c03 @@ -488,7 +488,7 @@ $L98: LD a1, 10 * SIZE(AO) LD a2, 11 * SIZE(AO) LD a3, 15 * SIZE(AO) - + MUL a1, c03, c03 MUL a2, c03, t1 SUB c04, t1, c04 @@ -497,7 +497,7 @@ $L98: #if defined(RN) || defined(RT) LD a1, 0 * SIZE(BO) - + MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c03, c03 @@ -714,13 +714,13 @@ $L108: #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) - + SUB a1, c01, c01 SUB a2, c02, c02 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) - + SUB a1, c01, c01 SUB a2, c02, c02 #endif @@ -740,7 +740,7 @@ $L108: LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 3 * SIZE(AO) - + MUL a1, c01, c01 MUL a2, c01, t1 SUB c02, t1, c02 @@ -749,7 +749,7 @@ $L108: #if defined(RN) || defined(RT) LD a1, 0 * SIZE(BO) - + MUL a1, c01, c01 MUL a1, c02, c02 #endif @@ -932,11 +932,11 @@ $L118: #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) - + SUB a1, c01, c01 #else LD a1, 0 * SIZE(AO) - + SUB a1, c01, c01 #endif @@ -948,7 +948,7 @@ $L118: #if defined(RN) || defined(RT) LD a1, 0 * SIZE(BO) - + MUL a1, c01, c01 #endif @@ -1025,7 +1025,7 @@ $L40: addq C2, LDC, C #endif fclr t2 - + #ifdef LN addq M, OFFSET, KK #endif @@ -1292,7 +1292,7 @@ $L58: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + LD b1, 4 * SIZE(BO) LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) @@ -1312,7 +1312,7 @@ $L58: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + LD b1, 4 * SIZE(AO) LD b2, 5 * SIZE(AO) LD b3, 6 * SIZE(AO) @@ -1334,7 +1334,7 @@ $L58: LD a2, 14 * SIZE(AO) LD a3, 13 * SIZE(AO) LD a4, 12 * SIZE(AO) - + MUL a1, c04, c04 MUL a1, c08, c08 @@ -1359,7 +1359,7 @@ $L58: LD b1, 10 * SIZE(AO) LD b2, 9 * SIZE(AO) LD b3, 8 * SIZE(AO) - + MUL b1, c03, c03 MUL b1, c07, c07 @@ -1397,7 +1397,7 @@ $L58: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + MUL a1, c01, c01 MUL a1, c05, c05 @@ -1422,7 +1422,7 @@ $L58: LD b1, 5 * SIZE(AO) LD b2, 6 * SIZE(AO) LD b3, 7 * SIZE(AO) - + MUL b1, c02, c02 MUL b1, c06, c06 @@ -1441,7 +1441,7 @@ $L58: LD a1, 10 * SIZE(AO) LD a2, 11 * SIZE(AO) LD a3, 15 * SIZE(AO) - + MUL a1, c03, c03 MUL a1, c07, c07 @@ -1459,7 +1459,7 @@ $L58: LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 3 * SIZE(BO) - + MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c03, c03 @@ -1760,7 +1760,7 @@ $L68: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c02, c02 @@ -1770,7 +1770,7 @@ $L68: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c05, c05 @@ -1799,7 +1799,7 @@ $L68: LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 3 * SIZE(AO) - + MUL a1, c01, c01 MUL a1, c05, c05 @@ -1817,7 +1817,7 @@ $L68: LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 3 * SIZE(BO) - + MUL a1, c01, c01 MUL a1, c02, c02 @@ -2050,13 +2050,13 @@ $L78: #if defined(LN) || defined(LT) LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) - + SUB a1, c01, c01 SUB a2, c05, c05 #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) - + SUB a1, c01, c01 SUB a2, c05, c05 #endif @@ -2072,7 +2072,7 @@ $L78: LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) LD a3, 3 * SIZE(BO) - + MUL a1, c01, c01 MUL a2, c01, t1 SUB c05, t1, c05 @@ -2156,7 +2156,7 @@ $L80: sra N, 2, J ble J, $L999 .align 4 - + $L01: #ifdef RT sll K, 2 + BASE_SHIFT, TMP1 @@ -2291,7 +2291,7 @@ $L11: fclr c14 fclr c07 ble TMP1, $L18 -#endif +#endif ble L, $L15 .align 5 @@ -2618,7 +2618,7 @@ $L18: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + LD b1, 4 * SIZE(BO) LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) @@ -2658,7 +2658,7 @@ $L18: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + LD b1, 4 * SIZE(AO) LD b2, 5 * SIZE(AO) LD b3, 6 * SIZE(AO) @@ -2700,7 +2700,7 @@ $L18: LD a2, 14 * SIZE(AO) LD a3, 13 * SIZE(AO) LD a4, 12 * SIZE(AO) - + MUL a1, c04, c04 MUL a1, c08, c08 MUL a1, c12, c12 @@ -2739,7 +2739,7 @@ $L18: LD b1, 10 * SIZE(AO) LD b2, 9 * SIZE(AO) LD b3, 8 * SIZE(AO) - + MUL b1, c03, c03 MUL b1, c07, c07 MUL b1, c11, c11 @@ -2795,7 +2795,7 @@ $L18: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + MUL a1, c01, c01 MUL a1, c05, c05 MUL a1, c09, c09 @@ -2834,7 +2834,7 @@ $L18: LD b1, 5 * SIZE(AO) LD b2, 6 * SIZE(AO) LD b3, 7 * SIZE(AO) - + MUL b1, c02, c02 MUL b1, c06, c06 MUL b1, c10, c10 @@ -2863,7 +2863,7 @@ $L18: LD a1, 10 * SIZE(AO) LD a2, 11 * SIZE(AO) LD a3, 15 * SIZE(AO) - + MUL a1, c03, c03 MUL a1, c07, c07 MUL a1, c11, c11 @@ -2890,7 +2890,7 @@ $L18: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + MUL a1, c01, c01 MUL a1, c02, c02 MUL a1, c03, c03 @@ -2929,7 +2929,7 @@ $L18: LD b1, 5 * SIZE(BO) LD b2, 6 * SIZE(BO) LD b3, 7 * SIZE(BO) - + MUL b1, c05, c05 MUL b1, c06, c06 MUL b1, c07, c07 @@ -2958,7 +2958,7 @@ $L18: LD a1, 10 * SIZE(BO) LD a2, 11 * SIZE(BO) LD a3, 15 * SIZE(BO) - + MUL a1, c09, c09 MUL a1, c10, c10 MUL a1, c11, c11 @@ -2985,7 +2985,7 @@ $L18: LD a2, 14 * SIZE(BO) LD a3, 13 * SIZE(BO) LD a4, 12 * SIZE(BO) - + MUL a1, c13, c13 MUL a1, c14, c14 MUL a1, c15, c15 @@ -3024,7 +3024,7 @@ $L18: LD b1, 10 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 8 * SIZE(BO) - + MUL b1, c09, c09 MUL b1, c10, c10 MUL b1, c11, c11 @@ -3434,7 +3434,7 @@ $L28: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + LD b1, 4 * SIZE(BO) LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) @@ -3455,7 +3455,7 @@ $L28: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + LD b1, 4 * SIZE(AO) LD b2, 5 * SIZE(AO) LD b3, 6 * SIZE(AO) @@ -3502,7 +3502,7 @@ $L28: LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 3 * SIZE(AO) - + MUL a1, c01, c01 MUL a1, c05, c05 MUL a1, c09, c09 @@ -3529,7 +3529,7 @@ $L28: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + MUL a1, c01, c01 MUL a1, c02, c02 @@ -3554,7 +3554,7 @@ $L28: LD b1, 5 * SIZE(BO) LD b2, 6 * SIZE(BO) LD b3, 7 * SIZE(BO) - + MUL b1, c05, c05 MUL b1, c06, c06 @@ -3573,7 +3573,7 @@ $L28: LD a1, 10 * SIZE(BO) LD a2, 11 * SIZE(BO) LD a3, 15 * SIZE(BO) - + MUL a1, c09, c09 MUL a1, c10, c10 @@ -3592,7 +3592,7 @@ $L28: LD a2, 14 * SIZE(BO) LD a3, 13 * SIZE(BO) LD a4, 12 * SIZE(BO) - + MUL a1, c13, c13 MUL a1, c14, c14 @@ -3617,7 +3617,7 @@ $L28: LD b1, 10 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 8 * SIZE(BO) - + MUL b1, c09, c09 MUL b1, c10, c10 @@ -3890,7 +3890,7 @@ $L38: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c09, c09 @@ -3900,7 +3900,7 @@ $L38: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + SUB a1, c01, c01 SUB a2, c05, c05 SUB a3, c09, c09 @@ -3921,7 +3921,7 @@ $L38: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + MUL a1, c01, c01 MUL a2, c01, t1 SUB c05, t1, c05 @@ -3933,7 +3933,7 @@ $L38: LD b1, 5 * SIZE(BO) LD b2, 6 * SIZE(BO) LD b3, 7 * SIZE(BO) - + MUL b1, c05, c05 MUL b2, c05, t1 SUB c09, t1, c09 @@ -3943,7 +3943,7 @@ $L38: LD a1, 10 * SIZE(BO) LD a2, 11 * SIZE(BO) LD a3, 15 * SIZE(BO) - + MUL a1, c09, c09 MUL a2, c09, t1 SUB c13, t1, c13 @@ -3955,7 +3955,7 @@ $L38: LD a2, 14 * SIZE(BO) LD a3, 13 * SIZE(BO) LD a4, 12 * SIZE(BO) - + MUL a1, c13, c13 MUL a2, c13, t1 SUB c09, t1, c09 @@ -3967,7 +3967,7 @@ $L38: LD b1, 10 * SIZE(BO) LD b2, 9 * SIZE(BO) LD b3, 8 * SIZE(BO) - + MUL b1, c09, c09 MUL b2, c09, t1 SUB c05, t1, c05 diff --git a/kernel/alpha/zamax.S b/kernel/alpha/zamax.S index 01fb4e118..f1ea18d2d 100644 --- a/kernel/alpha/zamax.S +++ b/kernel/alpha/zamax.S @@ -234,7 +234,7 @@ $L13: fcmovne $f6, $f18, $f2 fcmovne $f7, $f19, $f3 .align 4 - + $L14: addt $f8, $f9, $f16 addt $f10, $f11, $f17 diff --git a/kernel/alpha/zaxpy.S b/kernel/alpha/zaxpy.S index a6f3c1d2f..1416769a1 100644 --- a/kernel/alpha/zaxpy.S +++ b/kernel/alpha/zaxpy.S @@ -126,7 +126,7 @@ $MainLoop: LD $f0, 0*SIZE($18) MUL $f29, $f1, $f23 LD $f1, 1*SIZE($18) - + MUL $f29, $f2, $f24 unop MUL $f30, $f3, $f25 @@ -151,7 +151,7 @@ $MainLoop: addq $20, 8*SIZE, $20 MUL $f29, $f5, $f23 LD $f5, 5*SIZE($18) - + ADD $f16, $f8, $f16 LD $f8, 0*SIZE($20) MUL $f29, $f6, $f24 @@ -181,7 +181,7 @@ $MainLoop: ADD1 $f24, $f25, $f18 ST $f19,-5*SIZE($20) ADD2 $f26, $f27, $f19 - + ADD $f16, $f12, $f16 LD $f12, 4*SIZE($20) ADD $f17, $f13, $f17 @@ -207,7 +207,7 @@ $MainLoopEnd: MUL $f30, $f1, $f21 MUL $f30, $f0, $f22 MUL $f29, $f1, $f23 - + MUL $f29, $f2, $f24 MUL $f30, $f3, $f25 MUL $f30, $f2, $f26 @@ -222,7 +222,7 @@ $MainLoopEnd: MUL $f30, $f4, $f22 ADD2 $f26, $f27, $f19 MUL $f29, $f5, $f23 - + ADD $f16, $f8, $f16 MUL $f29, $f6, $f24 ADD $f17, $f28, $f17 @@ -242,7 +242,7 @@ $MainLoopEnd: ADD1 $f24, $f25, $f18 ST $f19, 3*SIZE($20) ADD2 $f26, $f27, $f19 - + ADD $f16, $f12, $f16 ADD $f17, $f13, $f17 ADD $f18, $f14, $f18 @@ -281,7 +281,7 @@ $RemainLoop: LD $f0, 0*SIZE($18) MUL $f29, $f1, $f23 LD $f1, 1*SIZE($18) - + ADD1 $f20, $f21, $f16 ADD2 $f22, $f23, $f17 ADD $f16, $f8, $f16 @@ -300,7 +300,7 @@ $RemainLoopEnd: MUL $f30, $f1, $f21 MUL $f30, $f0, $f22 MUL $f29, $f1, $f23 - + ADD1 $f20, $f21, $f16 ADD2 $f22, $f23, $f17 ADD $f16, $f8, $f16 @@ -326,11 +326,11 @@ $End: $Sub: SXSUBL $16, SIZE, $22 - addq $22, $22, $22 # Complex + addq $22, $22, $22 # Complex .align 4 - addq $19, $19, $19 # Complex - addq $21, $21, $21 # Complex + addq $19, $19, $19 # Complex + addq $21, $21, $21 # Complex ble $4, $SubRemain LD $f0, 0*SIZE($18) @@ -409,7 +409,7 @@ $SubMainLoop: unop MUL $f29, $f5, $f23 LD $f5, 1*SIZE($18) - + ADD $f16, $f8, $f16 LD $f8, 0*SIZE($24) MUL $f29, $f6, $f24 @@ -486,7 +486,7 @@ $SubMainLoopEnd: MUL $f30, $f1, $f21 MUL $f30, $f0, $f22 MUL $f29, $f1, $f23 - + MUL $f29, $f2, $f24 MUL $f30, $f3, $f25 MUL $f30, $f2, $f26 @@ -501,7 +501,7 @@ $SubMainLoopEnd: MUL $f30, $f4, $f22 ADD2 $f26, $f27, $f19 MUL $f29, $f5, $f23 - + ADD $f16, $f8, $f16 MUL $f29, $f6, $f24 ADD $f17, $f28, $f17 @@ -586,7 +586,7 @@ $SubRemainLoopEnd: MUL $f30, $f1, $f21 MUL $f30, $f0, $f22 MUL $f29, $f1, $f23 - + ADD1 $f20, $f21, $f16 ADD2 $f22, $f23, $f17 ADD $f16, $f8, $f16 diff --git a/kernel/alpha/zgemm_kernel_2x2.S b/kernel/alpha/zgemm_kernel_2x2.S index 33c50ddf8..67ba6d108 100644 --- a/kernel/alpha/zgemm_kernel_2x2.S +++ b/kernel/alpha/zgemm_kernel_2x2.S @@ -211,7 +211,7 @@ CNAME: sra N, 1, J ble J, $L30 .align 4 - + $L01: mov C, C1 addq C, LDC, C2 diff --git a/kernel/alpha/znrm2.S b/kernel/alpha/znrm2.S index 03343b2ae..bd1ab8782 100644 --- a/kernel/alpha/znrm2.S +++ b/kernel/alpha/znrm2.S @@ -75,7 +75,7 @@ .mask 0x4000000,-16 ldah $29, 0($27) !gpdisp!1 lda $29, 0($29) !gpdisp!1 - + lda $sp, -16($sp) ldq $27, sqrt($29) !literal!2 stq $26, 0($sp) @@ -85,7 +85,7 @@ #else PROFCODE #endif - + fclr a0 sll INCX, ZBASE_SHIFT, INCX fclr a1 diff --git a/kernel/alpha/ztrsm_kernel_2x2_LN.S b/kernel/alpha/ztrsm_kernel_2x2_LN.S index 2921f9e80..dcbe4e236 100644 --- a/kernel/alpha/ztrsm_kernel_2x2_LN.S +++ b/kernel/alpha/ztrsm_kernel_2x2_LN.S @@ -235,7 +235,7 @@ CNAME: sra N, 1, J ble J, $L30 .align 4 - + $L01: #ifdef RT sll K, ZBASE_SHIFT + 1, TMP1 @@ -524,7 +524,7 @@ $L28: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c09, c09 @@ -534,7 +534,7 @@ $L28: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c09, c09 @@ -620,7 +620,7 @@ $L28: MUL a4, c09, t2 ADD6 c01, t1, c01 ADD5 c02, t2, c02 - + LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) @@ -1116,7 +1116,7 @@ $L18: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + LD b1, 4 * SIZE(BO) LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) @@ -1136,7 +1136,7 @@ $L18: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + LD b1, 4 * SIZE(AO) LD b2, 5 * SIZE(AO) LD b3, 6 * SIZE(AO) @@ -1193,7 +1193,7 @@ $L18: ADD5 c02, t2, c02 ADD6 c09, t3, c09 ADD5 c10, t4, c10 - + LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) @@ -1373,7 +1373,7 @@ $L18: ADD5 c02, t2, c02 ADD6 c03, t3, c03 ADD5 c04, t4, c04 - + LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) @@ -1709,7 +1709,7 @@ $L58: #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) - + SUB a1, c01, c01 SUB a2, c02, c02 #endif @@ -2043,7 +2043,7 @@ $L48: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 @@ -2053,7 +2053,7 @@ $L48: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 @@ -2083,7 +2083,7 @@ $L48: ADD6 c01, t1, c01 ADD5 c02, t2, c02 - + LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) diff --git a/kernel/alpha/ztrsm_kernel_2x2_LT.S b/kernel/alpha/ztrsm_kernel_2x2_LT.S index e6ffc0f92..e0c82026e 100644 --- a/kernel/alpha/ztrsm_kernel_2x2_LT.S +++ b/kernel/alpha/ztrsm_kernel_2x2_LT.S @@ -235,7 +235,7 @@ CNAME: sra N, 1, J ble J, $L30 .align 4 - + $L01: #ifdef RT sll K, ZBASE_SHIFT + 1, TMP1 @@ -697,7 +697,7 @@ $L18: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + LD b1, 4 * SIZE(BO) LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) @@ -717,7 +717,7 @@ $L18: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + LD b1, 4 * SIZE(AO) LD b2, 5 * SIZE(AO) LD b3, 6 * SIZE(AO) @@ -774,7 +774,7 @@ $L18: ADD5 c02, t2, c02 ADD6 c09, t3, c09 ADD5 c10, t4, c10 - + LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) @@ -954,7 +954,7 @@ $L18: ADD5 c02, t2, c02 ADD6 c03, t3, c03 ADD5 c04, t4, c04 - + LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) @@ -1301,7 +1301,7 @@ $L28: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c09, c09 @@ -1311,7 +1311,7 @@ $L28: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c09, c09 @@ -1397,7 +1397,7 @@ $L28: MUL a4, c09, t2 ADD6 c01, t1, c01 ADD5 c02, t2, c02 - + LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) @@ -1771,7 +1771,7 @@ $L48: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 @@ -1781,7 +1781,7 @@ $L48: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 @@ -1811,7 +1811,7 @@ $L48: ADD6 c01, t1, c01 ADD5 c02, t2, c02 - + LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) @@ -2123,7 +2123,7 @@ $L58: #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) - + SUB a1, c01, c01 SUB a2, c02, c02 #endif diff --git a/kernel/alpha/ztrsm_kernel_2x2_RT.S b/kernel/alpha/ztrsm_kernel_2x2_RT.S index 4c490fc76..e890f599d 100644 --- a/kernel/alpha/ztrsm_kernel_2x2_RT.S +++ b/kernel/alpha/ztrsm_kernel_2x2_RT.S @@ -521,7 +521,7 @@ $L48: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 @@ -531,7 +531,7 @@ $L48: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c03, c03 @@ -561,7 +561,7 @@ $L48: ADD6 c01, t1, c01 ADD5 c02, t2, c02 - + LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) @@ -873,7 +873,7 @@ $L58: #else LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) - + SUB a1, c01, c01 SUB a2, c02, c02 #endif @@ -968,7 +968,7 @@ $L30: sra N, 1, J ble J, $L999 .align 4 - + $L01: #ifdef RT sll K, ZBASE_SHIFT + 1, TMP1 @@ -1430,7 +1430,7 @@ $L18: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + LD b1, 4 * SIZE(BO) LD b2, 5 * SIZE(BO) LD b3, 6 * SIZE(BO) @@ -1450,7 +1450,7 @@ $L18: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + LD b1, 4 * SIZE(AO) LD b2, 5 * SIZE(AO) LD b3, 6 * SIZE(AO) @@ -1507,7 +1507,7 @@ $L18: ADD5 c02, t2, c02 ADD6 c09, t3, c09 ADD5 c10, t4, c10 - + LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) @@ -1687,7 +1687,7 @@ $L18: ADD5 c02, t2, c02 ADD6 c03, t3, c03 ADD5 c04, t4, c04 - + LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) @@ -2034,7 +2034,7 @@ $L28: LD a2, 1 * SIZE(BO) LD a3, 2 * SIZE(BO) LD a4, 3 * SIZE(BO) - + SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c09, c09 @@ -2044,7 +2044,7 @@ $L28: LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - + SUB a1, c01, c01 SUB a2, c02, c02 SUB a3, c09, c09 @@ -2130,7 +2130,7 @@ $L28: MUL a4, c09, t2 ADD6 c01, t1, c01 ADD5 c02, t2, c02 - + LD a1, 0 * SIZE(BO) LD a2, 1 * SIZE(BO) diff --git a/kernel/arm/KERNEL.ARMV5 b/kernel/arm/KERNEL.ARMV5 index ecf278cf9..27157dad1 100644 --- a/kernel/arm/KERNEL.ARMV5 +++ b/kernel/arm/KERNEL.ARMV5 @@ -85,13 +85,13 @@ DTRMMKERNEL = ../generic/trmmkernel_2x2.c CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMKERNEL = ../generic/gemmkernel_2x2.c SGEMMONCOPY = ../generic/gemm_ncopy_2.c SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o -DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMKERNEL = ../generic/gemmkernel_2x2.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPYOBJ = dgemm_oncopy.o diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 6edcf1c48..ae0e13093 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -16,7 +16,7 @@ CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c #STRMMKERNEL = ../generic/trmmkernel_2x2.c -#SGEMMKERNEL = ../generic/gemmkernel_2x2.c +#SGEMMKERNEL = ../generic/gemmkernel_2x2.c #SGEMMONCOPY = ../generic/gemm_ncopy_2.c #SGEMMOTCOPY = ../generic/gemm_tcopy_2.c @@ -109,7 +109,7 @@ DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S #CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S -SGEMMKERNEL = sgemm_kernel_4x2_vfp.S +SGEMMKERNEL = sgemm_kernel_4x2_vfp.S SGEMMINCOPY = sgemm_ncopy_4_vfp.S SGEMMITCOPY = sgemm_tcopy_4_vfp.S SGEMMINCOPYOBJ = sgemm_incopy.o @@ -119,7 +119,7 @@ SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o -DGEMMKERNEL = dgemm_kernel_4x2_vfp.S +DGEMMKERNEL = dgemm_kernel_4x2_vfp.S DGEMMINCOPY = dgemm_ncopy_4_vfp.S DGEMMITCOPY = dgemm_tcopy_4_vfp.S DGEMMINCOPYOBJ = dgemm_incopy.o diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 790883e5e..aa8f681fc 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -86,28 +86,28 @@ CGEMVTKERNEL = cgemv_t_vfp.S ZGEMVTKERNEL = zgemv_t_vfp.S STRMMKERNEL = strmm_kernel_4x4_vfpv3.S -DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S +DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S -#SGEMMKERNEL = ../generic/gemmkernel_2x2.c -SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S -SGEMMINCOPY = -SGEMMITCOPY = +#SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S +SGEMMINCOPY = +SGEMMITCOPY = SGEMMONCOPY = sgemm_ncopy_4_vfp.S SGEMMOTCOPY = sgemm_tcopy_4_vfp.S -SGEMMINCOPYOBJ = -SGEMMITCOPYOBJ = +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o -DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S -DGEMMINCOPY = -DGEMMITCOPY = +DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S +DGEMMINCOPY = +DGEMMITCOPY = DGEMMONCOPY = dgemm_ncopy_4_vfp.S DGEMMOTCOPY = dgemm_tcopy_4_vfp.S -DGEMMINCOPYOBJ = -DGEMMITCOPYOBJ = +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o diff --git a/kernel/arm/amax.c b/kernel/arm/amax.c index 55107ca4f..ec6b11196 100644 --- a/kernel/arm/amax.c +++ b/kernel/arm/amax.c @@ -60,7 +60,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) while(i < n) { - if( ABS(x[ix]) > ABS(maxf) ) + if( ABS(x[ix]) > ABS(maxf) ) { maxf = ABS(x[ix]); } @@ -69,5 +69,5 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } return(maxf); } - + diff --git a/kernel/arm/amin.c b/kernel/arm/amin.c index 3f7e97be6..fc89604d5 100644 --- a/kernel/arm/amin.c +++ b/kernel/arm/amin.c @@ -60,7 +60,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) while(i < n) { - if( ABS(x[ix]) < ABS(minf) ) + if( ABS(x[ix]) < ABS(minf) ) { minf = ABS(x[ix]); } @@ -69,5 +69,5 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } return(minf); } - + diff --git a/kernel/arm/asum.c b/kernel/arm/asum.c index 5ac6936a0..5b6e6ebd2 100644 --- a/kernel/arm/asum.c +++ b/kernel/arm/asum.c @@ -63,5 +63,5 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } return(sumf); } - + diff --git a/kernel/arm/axpby.c b/kernel/arm/axpby.c index 51cfe1f46..278747f75 100644 --- a/kernel/arm/axpby.c +++ b/kernel/arm/axpby.c @@ -92,5 +92,5 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * return(0); } - + diff --git a/kernel/arm/axpy.c b/kernel/arm/axpy.c index dceddf78a..fb1094dd9 100644 --- a/kernel/arm/axpy.c +++ b/kernel/arm/axpy.c @@ -60,5 +60,5 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS return(0); } - + diff --git a/kernel/arm/ccopy_vfp.S b/kernel/arm/ccopy_vfp.S index aaba7825e..874fcab9c 100644 --- a/kernel/arm/ccopy_vfp.S +++ b/kernel/arm/ccopy_vfp.S @@ -134,7 +134,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov Y, OLD_Y ldr INC_Y, OLD_INC_Y - + cmp N, #0 ble ccopy_kernel_L999 diff --git a/kernel/arm/cdot_vfp.S b/kernel/arm/cdot_vfp.S index b653888df..2ccda3397 100644 --- a/kernel/arm/cdot_vfp.S +++ b/kernel/arm/cdot_vfp.S @@ -187,7 +187,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov Y, OLD_Y ldr INC_Y, OLD_INC_Y - + vsub.f32 s0 , s0 , s0 vsub.f32 s1 , s1 , s1 vsub.f32 s2 , s2 , s2 @@ -269,11 +269,11 @@ cdot_kernel_L999: vldm r3, { s8 - s15} // restore floating point registers #if !defined(CONJ) - vsub.f32 s0 , s0, s2 - vadd.f32 s1 , s1, s3 + vsub.f32 s0 , s0, s2 + vadd.f32 s1 , s1, s3 #else - vadd.f32 s0 , s0, s2 - vsub.f32 s1 , s1, s3 + vadd.f32 s0 , s0, s2 + vsub.f32 s1 , s1, s3 #endif sub sp, fp, #24 diff --git a/kernel/arm/cgemm_kernel_2x2_vfp.S b/kernel/arm/cgemm_kernel_2x2_vfp.S index 75fbf097b..a059ef505 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfp.S +++ b/kernel/arm/cgemm_kernel_2x2_vfp.S @@ -88,7 +88,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **************************************************************************************/ -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define KMAC_R fnmacs #define KMAC_I fmacs @@ -834,7 +834,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ble cgemm_kernel_L1_BEGIN cgemm_kernel_L2_BEGIN: - + ldr CO1, C // CO1 = C ldr r4 , LDC lsl r4 , r4 , #1 // LDC * 2 @@ -903,7 +903,7 @@ cgemm_kernel_L2_M2_22: b cgemm_kernel_L2_M2_44 - + cgemm_kernel_L2_M2_30: tst L, #3 ble cgemm_kernel_L2_M2_40 @@ -968,7 +968,7 @@ cgemm_kernel_L2_M2_46: subs L, L, #1 bne cgemm_kernel_L2_M2_46 - + cgemm_kernel_L2_M2_100: SAVE2x2 @@ -1007,10 +1007,10 @@ cgemm_kernel_L2_M1_22: subs L, L, #1 bgt cgemm_kernel_L2_M1_22 - + cgemm_kernel_L2_M1_40: - + ands L , K1, #7 // L = L % 8 ble cgemm_kernel_L2_M1_100 @@ -1020,7 +1020,7 @@ cgemm_kernel_L2_M1_42: subs L, L, #1 bgt cgemm_kernel_L2_M1_42 - + cgemm_kernel_L2_M1_100: SAVE1x2 @@ -1033,7 +1033,7 @@ cgemm_kernel_L2_END: lsl r4, r4, #4 // k * 2 * 4 * 2 add r3, r3, r4 // B = B + K * 2 * 8 mov BC, r3 - + subs J , #1 // j-- bgt cgemm_kernel_L2_BEGIN @@ -1047,7 +1047,7 @@ cgemm_kernel_L1_BEGIN: tst J , #1 ble cgemm_kernel_L999 - + ldr CO1, C // CO1 = C ldr r4 , LDC add r3 , r4, CO1 @@ -1111,7 +1111,7 @@ cgemm_kernel_L1_M2_22: b cgemm_kernel_L1_M2_44 - + cgemm_kernel_L1_M2_30: tst L, #3 ble cgemm_kernel_L1_M2_40 @@ -1176,7 +1176,7 @@ cgemm_kernel_L1_M2_46: subs L, L, #1 bne cgemm_kernel_L1_M2_46 - + cgemm_kernel_L1_M2_100: SAVE2x1 @@ -1215,10 +1215,10 @@ cgemm_kernel_L1_M1_22: subs L, L, #1 bgt cgemm_kernel_L1_M1_22 - + cgemm_kernel_L1_M1_40: - + ands L , K1, #7 // L = L % 8 ble cgemm_kernel_L1_M1_100 @@ -1228,7 +1228,7 @@ cgemm_kernel_L1_M1_42: subs L, L, #1 bgt cgemm_kernel_L1_M1_42 - + cgemm_kernel_L1_M1_100: SAVE1x1 diff --git a/kernel/arm/cgemm_kernel_2x2_vfpv3.S b/kernel/arm/cgemm_kernel_2x2_vfpv3.S index 3aba68de8..8bc200c9f 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/cgemm_kernel_2x2_vfpv3.S @@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define B_PRE 96 #define C_PRE 64 -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define FADD_R fsubs #define FADD_I fadds @@ -891,7 +891,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ble cgemm_kernel_L1_BEGIN cgemm_kernel_L2_BEGIN: - + ldr CO1, C // CO1 = C ldr r4 , LDC lsl r4 , r4 , #1 // LDC * 2 @@ -960,7 +960,7 @@ cgemm_kernel_L2_M2_22: b cgemm_kernel_L2_M2_44 - + cgemm_kernel_L2_M2_30: tst L, #3 ble cgemm_kernel_L2_M2_40 @@ -1025,7 +1025,7 @@ cgemm_kernel_L2_M2_46: subs L, L, #1 bne cgemm_kernel_L2_M2_46 - + cgemm_kernel_L2_M2_100: SAVE2x2 @@ -1064,10 +1064,10 @@ cgemm_kernel_L2_M1_22: subs L, L, #1 bgt cgemm_kernel_L2_M1_22 - + cgemm_kernel_L2_M1_40: - + ands L , K1, #7 // L = L % 8 ble cgemm_kernel_L2_M1_100 @@ -1077,7 +1077,7 @@ cgemm_kernel_L2_M1_42: subs L, L, #1 bgt cgemm_kernel_L2_M1_42 - + cgemm_kernel_L2_M1_100: SAVE1x2 @@ -1090,7 +1090,7 @@ cgemm_kernel_L2_END: lsl r4, r4, #4 // k * 2 * 4 * 2 add r3, r3, r4 // B = B + K * 2 * 8 mov BC, r3 - + subs J , #1 // j-- bgt cgemm_kernel_L2_BEGIN @@ -1104,7 +1104,7 @@ cgemm_kernel_L1_BEGIN: tst J , #1 ble cgemm_kernel_L999 - + ldr CO1, C // CO1 = C ldr r4 , LDC add r3 , r4, CO1 @@ -1168,7 +1168,7 @@ cgemm_kernel_L1_M2_22: b cgemm_kernel_L1_M2_44 - + cgemm_kernel_L1_M2_30: tst L, #3 ble cgemm_kernel_L1_M2_40 @@ -1233,7 +1233,7 @@ cgemm_kernel_L1_M2_46: subs L, L, #1 bne cgemm_kernel_L1_M2_46 - + cgemm_kernel_L1_M2_100: SAVE2x1 @@ -1272,10 +1272,10 @@ cgemm_kernel_L1_M1_22: subs L, L, #1 bgt cgemm_kernel_L1_M1_22 - + cgemm_kernel_L1_M1_40: - + ands L , K1, #7 // L = L % 8 ble cgemm_kernel_L1_M1_100 @@ -1285,7 +1285,7 @@ cgemm_kernel_L1_M1_42: subs L, L, #1 bgt cgemm_kernel_L1_M1_42 - + cgemm_kernel_L1_M1_100: SAVE1x1 diff --git a/kernel/arm/cgemm_ncopy_2_vfp.S b/kernel/arm/cgemm_ncopy_2_vfp.S index 08fbd5501..29eeab492 100644 --- a/kernel/arm/cgemm_ncopy_2_vfp.S +++ b/kernel/arm/cgemm_ncopy_2_vfp.S @@ -150,7 +150,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. sub r4, fp, #128 vstm r4, { s8 - s15} // store floating point registers - ldr BO, B + ldr BO, B /*********************************************************************************************/ @@ -181,8 +181,8 @@ cgemm_ncopy_L2_M2_20: COPY2x2 subs I , I , #1 bne cgemm_ncopy_L2_M2_20 - - + + cgemm_ncopy_L2_M2_40: ands I, M , #1 @@ -194,7 +194,7 @@ cgemm_ncopy_L2_M2_60: subs I , I , #1 bne cgemm_ncopy_L2_M2_60 - + cgemm_ncopy_L2_M2_END: @@ -225,8 +225,8 @@ cgemm_ncopy_L1_M2_20: subs I , I , #1 bne cgemm_ncopy_L1_M2_20 - - + + cgemm_ncopy_L1_M2_40: ands I, M , #1 @@ -238,7 +238,7 @@ cgemm_ncopy_L1_M2_60: subs I , I , #1 bne cgemm_ncopy_L1_M2_60 - + cgemm_ncopy_L1_M2_END: diff --git a/kernel/arm/cgemv_n_vfp.S b/kernel/arm/cgemv_n_vfp.S index 522c4c764..712e7f0d8 100644 --- a/kernel/arm/cgemv_n_vfp.S +++ b/kernel/arm/cgemv_n_vfp.S @@ -551,7 +551,7 @@ cgemvn_kernel_F1X1: ldr AO1, A add r3, AO1, #8 str r3, A - + ldr XO , X INIT_F1 @@ -651,7 +651,7 @@ cgemvn_kernel_S1X1: ldr AO1, A add r3, AO1, #8 str r3, A - + ldr XO , X INIT_S1 diff --git a/kernel/arm/copy.c b/kernel/arm/copy.c index f742a4a33..7b4f04f30 100644 --- a/kernel/arm/copy.c +++ b/kernel/arm/copy.c @@ -55,5 +55,5 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) return(0); } - + diff --git a/kernel/arm/ctrmm_kernel_2x2_vfp.S b/kernel/arm/ctrmm_kernel_2x2_vfp.S index a68434f97..a48c8608d 100644 --- a/kernel/arm/ctrmm_kernel_2x2_vfp.S +++ b/kernel/arm/ctrmm_kernel_2x2_vfp.S @@ -91,7 +91,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **************************************************************************************/ -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define KMAC_R fnmacs #define KMAC_I fmacs @@ -848,7 +848,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ble _L1_BEGIN _L2_BEGIN: - + ldr CO1, C // CO1 = C ldr r4 , LDC lsl r4 , r4 , #1 // LDC * 2 @@ -951,7 +951,7 @@ _L2_M2_22: b _L2_M2_44 - + _L2_M2_30: tst L, #3 ble _L2_M2_40 @@ -1016,7 +1016,7 @@ _L2_M2_46: subs L, L, #1 bne _L2_M2_46 - + _L2_M2_100: SAVE2x2 @@ -1103,10 +1103,10 @@ _L2_M1_22: subs L, L, #1 bgt _L2_M1_22 - + _L2_M1_40: - + ands L , K1, #7 // L = L % 8 ble _L2_M1_100 @@ -1116,7 +1116,7 @@ _L2_M1_42: subs L, L, #1 bgt _L2_M1_42 - + _L2_M1_100: SAVE1x2 @@ -1147,7 +1147,7 @@ _L2_END: lsl r4, r4, #4 // k * 2 * 4 * 2 add r3, r3, r4 // B = B + K * 2 * 8 mov BC, r3 - + #if !defined(LEFT) ldr r3 , KK add r3 , r3 , #2 // number of values in BO @@ -1167,7 +1167,7 @@ _L1_BEGIN: tst J , #1 ble _L999 - + ldr CO1, C // CO1 = C ldr r4 , LDC add r3 , r4, CO1 @@ -1265,7 +1265,7 @@ _L1_M2_22: b _L1_M2_44 - + _L1_M2_30: tst L, #3 ble _L1_M2_40 @@ -1330,7 +1330,7 @@ _L1_M2_46: subs L, L, #1 bne _L1_M2_46 - + _L1_M2_100: SAVE2x1 @@ -1418,10 +1418,10 @@ _L1_M1_22: subs L, L, #1 bgt _L1_M1_22 - + _L1_M1_40: - + ands L , K1, #7 // L = L % 8 ble _L1_M1_100 @@ -1431,7 +1431,7 @@ _L1_M1_42: subs L, L, #1 bgt _L1_M1_42 - + _L1_M1_100: SAVE1x1 diff --git a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S index 28e555caa..f06e260ea 100644 --- a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S +++ b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S @@ -84,7 +84,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define B_PRE 96 #define C_PRE 64 -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define FADD_R fsubs #define FADD_I fadds @@ -869,7 +869,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ble _L1_BEGIN _L2_BEGIN: - + ldr CO1, C // CO1 = C ldr r4 , LDC lsl r4 , r4 , #1 // LDC * 2 @@ -972,7 +972,7 @@ _L2_M2_22: b _L2_M2_44 - + _L2_M2_30: tst L, #3 ble _L2_M2_40 @@ -1037,7 +1037,7 @@ _L2_M2_46: subs L, L, #1 bne _L2_M2_46 - + _L2_M2_100: SAVE2x2 @@ -1124,10 +1124,10 @@ _L2_M1_22: subs L, L, #1 bgt _L2_M1_22 - + _L2_M1_40: - + ands L , K1, #7 // L = L % 8 ble _L2_M1_100 @@ -1137,7 +1137,7 @@ _L2_M1_42: subs L, L, #1 bgt _L2_M1_42 - + _L2_M1_100: SAVE1x2 @@ -1168,7 +1168,7 @@ _L2_END: lsl r4, r4, #4 // k * 2 * 4 * 2 add r3, r3, r4 // B = B + K * 2 * 8 mov BC, r3 - + #if !defined(LEFT) ldr r3 , KK add r3 , r3 , #2 // number of values in BO @@ -1188,7 +1188,7 @@ _L1_BEGIN: tst J , #1 ble _L999 - + ldr CO1, C // CO1 = C ldr r4 , LDC add r3 , r4, CO1 @@ -1286,7 +1286,7 @@ _L1_M2_22: b _L1_M2_44 - + _L1_M2_30: tst L, #3 ble _L1_M2_40 @@ -1351,7 +1351,7 @@ _L1_M2_46: subs L, L, #1 bne _L1_M2_46 - + _L1_M2_100: SAVE2x1 @@ -1439,10 +1439,10 @@ _L1_M1_22: subs L, L, #1 bgt _L1_M1_22 - + _L1_M1_40: - + ands L , K1, #7 // L = L % 8 ble _L1_M1_100 @@ -1452,7 +1452,7 @@ _L1_M1_42: subs L, L, #1 bgt _L1_M1_42 - + _L1_M1_100: SAVE1x1 diff --git a/kernel/arm/dcopy_vfp.S b/kernel/arm/dcopy_vfp.S index 0fad3c4a6..da239924a 100644 --- a/kernel/arm/dcopy_vfp.S +++ b/kernel/arm/dcopy_vfp.S @@ -134,7 +134,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov Y, OLD_Y ldr INC_Y, OLD_INC_Y - + cmp N, #0 ble dcopy_kernel_L999 diff --git a/kernel/arm/ddot_vfp.S b/kernel/arm/ddot_vfp.S index ab819ec98..71b3c1ce8 100644 --- a/kernel/arm/ddot_vfp.S +++ b/kernel/arm/ddot_vfp.S @@ -151,7 +151,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov Y, OLD_Y ldr INC_Y, OLD_INC_Y - + vsub.f64 d0 , d0 , d0 vsub.f64 d1 , d1 , d1 diff --git a/kernel/arm/dgemm_kernel_4x2_vfp.S b/kernel/arm/dgemm_kernel_4x2_vfp.S index 55409a5ef..9fb881d73 100644 --- a/kernel/arm/dgemm_kernel_4x2_vfp.S +++ b/kernel/arm/dgemm_kernel_4x2_vfp.S @@ -134,7 +134,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d4 , [CO1] fldd d5 , [CO1, #8 ] - + pld [ CO1, #C_PRE ] fmacd d4 , d0 , d8 fldd d6 , [CO1, #16 ] @@ -208,7 +208,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d4 , [CO1] fldd d5 , [CO1, #8 ] - + fmacd d4 , d0 , d8 fmacd d5 , d0 , d9 @@ -262,7 +262,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA fldd d4 , [CO1] - + fmacd d4 , d0 , d8 fstd d4 , [CO1] @@ -319,7 +319,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d5 , [CO1, #8 ] fldd d6 , [CO1, #16 ] fldd d7 , [CO1, #24 ] - + fmacd d4 , d0 , d8 fmacd d5 , d0 , d9 fmacd d6 , d0 , d10 @@ -364,7 +364,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d4 , [CO1] fldd d5 , [CO1, #8 ] - + fmacd d4 , d0 , d8 fmacd d5 , d0 , d9 @@ -402,7 +402,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA fldd d4 , [CO1] - + fmacd d4 , d0 , d8 fstd d4 , [CO1] @@ -490,10 +490,10 @@ dgemm_kernel_L2_M4_22: subs L, L, #1 bgt dgemm_kernel_L2_M4_22 - + dgemm_kernel_L2_M4_40: - + ands L , K1, #7 // L = L % 8 ble dgemm_kernel_L2_M4_100 @@ -503,7 +503,7 @@ dgemm_kernel_L2_M4_42: subs L, L, #1 bgt dgemm_kernel_L2_M4_42 - + dgemm_kernel_L2_M4_100: SAVE4x2 @@ -545,10 +545,10 @@ dgemm_kernel_L2_M2_22: subs L, L, #1 bgt dgemm_kernel_L2_M2_22 - + dgemm_kernel_L2_M2_40: - + ands L , K1, #7 // L = L % 8 ble dgemm_kernel_L2_M2_100 @@ -558,7 +558,7 @@ dgemm_kernel_L2_M2_42: subs L, L, #1 bgt dgemm_kernel_L2_M2_42 - + dgemm_kernel_L2_M2_100: SAVE2x2 @@ -592,10 +592,10 @@ dgemm_kernel_L2_M1_22: subs L, L, #1 bgt dgemm_kernel_L2_M1_22 - + dgemm_kernel_L2_M1_40: - + ands L , K1, #7 // L = L % 8 ble dgemm_kernel_L2_M1_100 @@ -605,7 +605,7 @@ dgemm_kernel_L2_M1_42: subs L, L, #1 bgt dgemm_kernel_L2_M1_42 - + dgemm_kernel_L2_M1_100: SAVE1x2 @@ -630,7 +630,7 @@ dgemm_kernel_L1_BEGIN: tst J , #1 ble dgemm_kernel_L999 - + ldr CO1, C // CO1 = C ldr r4 , LDC add r3 , r4, CO1 @@ -668,10 +668,10 @@ dgemm_kernel_L1_M4_22: subs L, L, #1 bgt dgemm_kernel_L1_M4_22 - + dgemm_kernel_L1_M4_40: - + ands L , K1, #7 // L = L % 8 ble dgemm_kernel_L1_M4_100 @@ -681,7 +681,7 @@ dgemm_kernel_L1_M4_42: subs L, L, #1 bgt dgemm_kernel_L1_M4_42 - + dgemm_kernel_L1_M4_100: SAVE4x1 @@ -723,10 +723,10 @@ dgemm_kernel_L1_M2_22: subs L, L, #1 bgt dgemm_kernel_L1_M2_22 - + dgemm_kernel_L1_M2_40: - + ands L , K1, #7 // L = L % 8 ble dgemm_kernel_L1_M2_100 @@ -736,7 +736,7 @@ dgemm_kernel_L1_M2_42: subs L, L, #1 bgt dgemm_kernel_L1_M2_42 - + dgemm_kernel_L1_M2_100: SAVE2x1 @@ -770,10 +770,10 @@ dgemm_kernel_L1_M1_22: subs L, L, #1 bgt dgemm_kernel_L1_M1_22 - + dgemm_kernel_L1_M1_40: - + ands L , K1, #7 // L = L % 8 ble dgemm_kernel_L1_M1_100 @@ -783,7 +783,7 @@ dgemm_kernel_L1_M1_42: subs L, L, #1 bgt dgemm_kernel_L1_M1_42 - + dgemm_kernel_L1_M1_100: SAVE1x1 diff --git a/kernel/arm/dgemm_kernel_4x4_vfpv3.S b/kernel/arm/dgemm_kernel_4x4_vfpv3.S index 3b6af19a3..7c1dbae8a 100644 --- a/kernel/arm/dgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dgemm_kernel_4x4_vfpv3.S @@ -321,7 +321,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmiad CO1, { d8 - d11 } pld [ r4 , #C_PRE ] - + fmacd d8 , d0 , d16 fldd d12, [CO2] fmacd d9 , d0 , d17 @@ -341,7 +341,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fstd d11, [CO1, #24 ] fldmiad r4, { d8 - d11 } - + fmacd d8 , d0 , d24 fstd d12, [CO2] fmacd d9 , d0 , d25 @@ -425,7 +425,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d8 , [CO1] fldd d9 , [CO1, #8 ] - + fmacd d8 , d0 , d16 fmacd d9 , d0 , d17 @@ -443,7 +443,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d8 , [r4 ] fldd d9 , [r4 , #8 ] - + fmacd d8 , d0 , d24 fmacd d9 , d0 , d25 @@ -582,7 +582,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d9 , [CO1, #8 ] fldd d10, [CO1, #16 ] fldd d11, [CO1, #24 ] - + fmacd d8 , d0 , d16 fmacd d9 , d0 , d17 fmacd d10, d0 , d18 @@ -654,7 +654,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d8 , [CO1] fldd d9 , [CO1, #8 ] - + fmacd d8 , d0 , d16 fmacd d9 , d0 , d17 @@ -760,7 +760,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d9 , [CO1, #8 ] fldd d10, [CO1, #16 ] fldd d11, [CO1, #24 ] - + fmacd d8 , d0 , d16 fmacd d9 , d0 , d17 fmacd d10, d0 , d18 @@ -811,7 +811,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d8 , [CO1] fldd d9 , [CO1, #8 ] - + fmacd d8 , d0 , d16 fmacd d9 , d0 , d17 @@ -895,7 +895,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ble dgemm_kernel_L2_BEGIN dgemm_kernel_L4_BEGIN: - + ldr CO1, C // CO1 = C ldr r4 , LDC lsl r4 , r4 , #2 // LDC * 4 @@ -1000,7 +1000,7 @@ dgemm_kernel_L4_M4_46: subs L, L, #1 bne dgemm_kernel_L4_M4_46 - + dgemm_kernel_L4_M4_100: SAVE4x4 @@ -1042,10 +1042,10 @@ dgemm_kernel_L4_M2_22: subs L, L, #1 bgt dgemm_kernel_L4_M2_22 - + dgemm_kernel_L4_M2_40: - + ands L , K1, #7 // L = L % 8 ble dgemm_kernel_L4_M2_100 @@ -1055,7 +1055,7 @@ dgemm_kernel_L4_M2_42: subs L, L, #1 bgt dgemm_kernel_L4_M2_42 - + dgemm_kernel_L4_M2_100: SAVE2x4 @@ -1089,10 +1089,10 @@ dgemm_kernel_L4_M1_22: subs L, L, #1 bgt dgemm_kernel_L4_M1_22 - + dgemm_kernel_L4_M1_40: - + ands L , K1, #7 // L = L % 8 ble dgemm_kernel_L4_M1_100 @@ -1102,7 +1102,7 @@ dgemm_kernel_L4_M1_42: subs L, L, #1 bgt dgemm_kernel_L4_M1_42 - + dgemm_kernel_L4_M1_100: SAVE1x4 @@ -1115,7 +1115,7 @@ dgemm_kernel_L4_END: lsl r4, r4, #5 // k * 4 * 8 add r3, r3, r4 // B = B + K * 4 * 8 mov BC, r3 - + subs J , #1 // j-- bgt dgemm_kernel_L4_BEGIN @@ -1131,7 +1131,7 @@ dgemm_kernel_L2_BEGIN: tst J , #2 ble dgemm_kernel_L1_BEGIN - + ldr CO1, C // CO1 = C ldr r4 , LDC lsl r4 , r4 , #1 // LDC * 2 @@ -1170,10 +1170,10 @@ dgemm_kernel_L2_M4_22: subs L, L, #1 bgt dgemm_kernel_L2_M4_22 - + dgemm_kernel_L2_M4_40: - + ands L , K1, #7 // L = L % 8 ble dgemm_kernel_L2_M4_100 @@ -1183,7 +1183,7 @@ dgemm_kernel_L2_M4_42: subs L, L, #1 bgt dgemm_kernel_L2_M4_42 - + dgemm_kernel_L2_M4_100: SAVE4x2 @@ -1225,10 +1225,10 @@ dgemm_kernel_L2_M2_22: subs L, L, #1 bgt dgemm_kernel_L2_M2_22 - + dgemm_kernel_L2_M2_40: - + ands L , K1, #7 // L = L % 8 ble dgemm_kernel_L2_M2_100 @@ -1238,7 +1238,7 @@ dgemm_kernel_L2_M2_42: subs L, L, #1 bgt dgemm_kernel_L2_M2_42 - + dgemm_kernel_L2_M2_100: SAVE2x2 @@ -1272,10 +1272,10 @@ dgemm_kernel_L2_M1_22: subs L, L, #1 bgt dgemm_kernel_L2_M1_22 - + dgemm_kernel_L2_M1_40: - + ands L , K1, #7 // L = L % 8 ble dgemm_kernel_L2_M1_100 @@ -1285,7 +1285,7 @@ dgemm_kernel_L2_M1_42: subs L, L, #1 bgt dgemm_kernel_L2_M1_42 - + dgemm_kernel_L2_M1_100: SAVE1x2 @@ -1307,7 +1307,7 @@ dgemm_kernel_L1_BEGIN: tst J , #1 ble dgemm_kernel_L999 - + ldr CO1, C // CO1 = C ldr r4 , LDC add r3 , r4, CO1 @@ -1345,10 +1345,10 @@ dgemm_kernel_L1_M4_22: subs L, L, #1 bgt dgemm_kernel_L1_M4_22 - + dgemm_kernel_L1_M4_40: - + ands L , K1, #7 // L = L % 8 ble dgemm_kernel_L1_M4_100 @@ -1358,7 +1358,7 @@ dgemm_kernel_L1_M4_42: subs L, L, #1 bgt dgemm_kernel_L1_M4_42 - + dgemm_kernel_L1_M4_100: SAVE4x1 @@ -1400,10 +1400,10 @@ dgemm_kernel_L1_M2_22: subs L, L, #1 bgt dgemm_kernel_L1_M2_22 - + dgemm_kernel_L1_M2_40: - + ands L , K1, #7 // L = L % 8 ble dgemm_kernel_L1_M2_100 @@ -1413,7 +1413,7 @@ dgemm_kernel_L1_M2_42: subs L, L, #1 bgt dgemm_kernel_L1_M2_42 - + dgemm_kernel_L1_M2_100: SAVE2x1 @@ -1447,10 +1447,10 @@ dgemm_kernel_L1_M1_22: subs L, L, #1 bgt dgemm_kernel_L1_M1_22 - + dgemm_kernel_L1_M1_40: - + ands L , K1, #7 // L = L % 8 ble dgemm_kernel_L1_M1_100 @@ -1460,7 +1460,7 @@ dgemm_kernel_L1_M1_42: subs L, L, #1 bgt dgemm_kernel_L1_M1_42 - + dgemm_kernel_L1_M1_100: SAVE1x1 diff --git a/kernel/arm/dgemm_ncopy_2_vfp.S b/kernel/arm/dgemm_ncopy_2_vfp.S index 763c032e1..6266c61d2 100644 --- a/kernel/arm/dgemm_ncopy_2_vfp.S +++ b/kernel/arm/dgemm_ncopy_2_vfp.S @@ -127,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lsl LDA, OLD_LDA, #3 // lda = lda * 8 - ldr BO, B + ldr BO, B /*********************************************************************************************/ @@ -152,8 +152,8 @@ dgemm_ncopy_L2_M2_20: subs I , I , #1 bne dgemm_ncopy_L2_M2_20 - - + + dgemm_ncopy_L2_M2_40: ands I, M , #1 @@ -165,7 +165,7 @@ dgemm_ncopy_L2_M2_60: subs I , I , #1 bne dgemm_ncopy_L2_M2_60 - + dgemm_ncopy_L2_M2_END: @@ -194,8 +194,8 @@ dgemm_ncopy_L1_M2_20: subs I , I , #1 bne dgemm_ncopy_L1_M2_20 - - + + dgemm_ncopy_L1_M2_40: ands I, M , #1 @@ -207,7 +207,7 @@ dgemm_ncopy_L1_M2_60: subs I , I , #1 bne dgemm_ncopy_L1_M2_60 - + dgemm_ncopy_L1_M2_END: diff --git a/kernel/arm/dgemm_ncopy_4_vfp.S b/kernel/arm/dgemm_ncopy_4_vfp.S index ad6692e50..ffc19a9cc 100644 --- a/kernel/arm/dgemm_ncopy_4_vfp.S +++ b/kernel/arm/dgemm_ncopy_4_vfp.S @@ -202,7 +202,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. sub r4, fp, #128 vstm r4, { d8 - d15} // store floating point registers - ldr BO, B + ldr BO, B dgemm_ncopy_L4_BEGIN: @@ -227,8 +227,8 @@ dgemm_ncopy_L4_M4_20: subs I , I , #1 bne dgemm_ncopy_L4_M4_20 - - + + dgemm_ncopy_L4_M4_40: ands I, M , #3 @@ -240,7 +240,7 @@ dgemm_ncopy_L4_M4_60: subs I , I , #1 bne dgemm_ncopy_L4_M4_60 - + dgemm_ncopy_L4_M4_END: @@ -275,8 +275,8 @@ dgemm_ncopy_L2_M4_20: subs I , I , #1 bne dgemm_ncopy_L2_M4_20 - - + + dgemm_ncopy_L2_M4_40: ands I, M , #3 @@ -288,7 +288,7 @@ dgemm_ncopy_L2_M4_60: subs I , I , #1 bne dgemm_ncopy_L2_M4_60 - + dgemm_ncopy_L2_M4_END: @@ -316,8 +316,8 @@ dgemm_ncopy_L1_M4_20: subs I , I , #1 bne dgemm_ncopy_L1_M4_20 - - + + dgemm_ncopy_L1_M4_40: ands I, M , #3 @@ -329,7 +329,7 @@ dgemm_ncopy_L1_M4_60: subs I , I , #1 bne dgemm_ncopy_L1_M4_60 - + dgemm_ncopy_L1_M4_END: diff --git a/kernel/arm/dgemm_tcopy_4_vfp.S b/kernel/arm/dgemm_tcopy_4_vfp.S index 88a139ad8..937f43957 100644 --- a/kernel/arm/dgemm_tcopy_4_vfp.S +++ b/kernel/arm/dgemm_tcopy_4_vfp.S @@ -271,15 +271,15 @@ dgemm_tcopy_L4_M4_20: subs I , I , #1 bne dgemm_tcopy_L4_M4_20 - - + + dgemm_tcopy_L4_M4_40: tst N , #2 ble dgemm_tcopy_L4_M4_60 COPY2x4 - + dgemm_tcopy_L4_M4_60: @@ -287,7 +287,7 @@ dgemm_tcopy_L4_M4_60: ble dgemm_tcopy_L4_M4_END COPY1x4 - + dgemm_tcopy_L4_M4_END: @@ -326,8 +326,8 @@ dgemm_tcopy_L2_M4_20: subs I , I , #1 bne dgemm_tcopy_L2_M4_20 - - + + dgemm_tcopy_L2_M4_40: tst N , #2 @@ -373,8 +373,8 @@ dgemm_tcopy_L1_M4_20: subs I , I , #1 bne dgemm_tcopy_L1_M4_20 - - + + dgemm_tcopy_L1_M4_40: tst N , #2 diff --git a/kernel/arm/dot.c b/kernel/arm/dot.c index 30490e291..46a84ad18 100644 --- a/kernel/arm/dot.c +++ b/kernel/arm/dot.c @@ -60,5 +60,5 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) return(dot); } - + diff --git a/kernel/arm/dtrmm_kernel_4x2_vfp.S b/kernel/arm/dtrmm_kernel_4x2_vfp.S index 762b9c580..3528e0860 100644 --- a/kernel/arm/dtrmm_kernel_4x2_vfp.S +++ b/kernel/arm/dtrmm_kernel_4x2_vfp.S @@ -198,7 +198,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA - + fmuld d4 , d0 , d8 fmuld d5 , d0 , d9 @@ -248,7 +248,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA - + fmuld d4 , d0 , d8 fstd d4 , [CO1] @@ -508,10 +508,10 @@ _L2_M4_22: subs L, L, #1 bgt _L2_M4_22 - + _L2_M4_40: - + ands L , K1, #7 // L = L % 8 ble _L2_M4_100 @@ -521,7 +521,7 @@ _L2_M4_42: subs L, L, #1 bgt _L2_M4_42 - + _L2_M4_100: SAVE4x2 @@ -613,10 +613,10 @@ _L2_M2_22: subs L, L, #1 bgt _L2_M2_22 - + _L2_M2_40: - + ands L , K1, #7 // L = L % 8 ble _L2_M2_100 @@ -626,7 +626,7 @@ _L2_M2_42: subs L, L, #1 bgt _L2_M2_42 - + _L2_M2_100: SAVE2x2 @@ -710,10 +710,10 @@ _L2_M1_22: subs L, L, #1 bgt _L2_M1_22 - + _L2_M1_40: - + ands L , K1, #7 // L = L % 8 ble _L2_M1_100 @@ -723,7 +723,7 @@ _L2_M1_42: subs L, L, #1 bgt _L2_M1_42 - + _L2_M1_100: SAVE1x2 @@ -774,7 +774,7 @@ _L1_BEGIN: tst J , #1 ble _L999 - + ldr CO1, C // CO1 = C ldr r4 , LDC add r3 , r4, CO1 @@ -851,10 +851,10 @@ _L1_M4_22: subs L, L, #1 bgt _L1_M4_22 - + _L1_M4_40: - + ands L , K1, #7 // L = L % 8 ble _L1_M4_100 @@ -864,7 +864,7 @@ _L1_M4_42: subs L, L, #1 bgt _L1_M4_42 - + _L1_M4_100: SAVE4x1 @@ -956,10 +956,10 @@ _L1_M2_22: subs L, L, #1 bgt _L1_M2_22 - + _L1_M2_40: - + ands L , K1, #7 // L = L % 8 ble _L1_M2_100 @@ -969,7 +969,7 @@ _L1_M2_42: subs L, L, #1 bgt _L1_M2_42 - + _L1_M2_100: SAVE2x1 @@ -1053,10 +1053,10 @@ _L1_M1_22: subs L, L, #1 bgt _L1_M1_22 - + _L1_M1_40: - + ands L , K1, #7 // L = L % 8 ble _L1_M1_100 @@ -1066,7 +1066,7 @@ _L1_M1_42: subs L, L, #1 bgt _L1_M1_42 - + _L1_M1_100: SAVE1x1 diff --git a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S index 0f8a9291a..04cc451d1 100644 --- a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S @@ -340,7 +340,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA add r4 , CO2, r3 - + fmuld d8 , d0 , d16 fmuld d9 , d0 , d17 fmuld d10, d0 , d18 @@ -355,7 +355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmuld d15, d0 , d23 fstd d11, [CO1, #24 ] - + fmuld d8 , d0 , d24 fstd d12, [CO2] fmuld d9 , d0 , d25 @@ -432,7 +432,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add r4 , CO2, r3 fldd d0, ALPHA - + fmuld d8 , d0 , d16 fmuld d9 , d0 , d17 @@ -444,7 +444,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fstd d12, [CO2] fstd d13, [CO2, #8 ] - + fmuld d8 , d0 , d24 fmuld d9 , d0 , d25 @@ -571,7 +571,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add CO2 , CO1, r3 fldd d0, ALPHA - + fmuld d8 , d0 , d16 fmuld d9 , d0 , d17 fmuld d10, d0 , d18 @@ -731,7 +731,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA - + fmuld d8 , d0 , d16 fmuld d9 , d0 , d17 fmuld d10, d0 , d18 @@ -779,7 +779,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA - + fmuld d8 , d0 , d16 fmuld d9 , d0 , d17 @@ -870,7 +870,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ble _L2_BEGIN _L4_BEGIN: - + ldr CO1, C // CO1 = C ldr r4 , LDC lsl r4 , r4 , #2 // LDC * 4 @@ -1026,14 +1026,14 @@ _L4_M4_22: ble _L4_M4_41 b _L4_M4_22 - + _L4_M4_40: INIT4x4 _L4_M4_41: - + ands L , K1, #31 // L = L % 8 ble _L4_M4_100 @@ -1043,7 +1043,7 @@ _L4_M4_42: subs L, L, #1 bgt _L4_M4_42 - + _L4_M4_100: SAVE4x4 @@ -1135,10 +1135,10 @@ _L4_M2_22: subs L, L, #1 bgt _L4_M2_22 - + _L4_M2_40: - + ands L , K1, #7 // L = L % 8 ble _L4_M2_100 @@ -1148,7 +1148,7 @@ _L4_M2_42: subs L, L, #1 bgt _L4_M2_42 - + _L4_M2_100: SAVE2x4 @@ -1231,10 +1231,10 @@ _L4_M1_22: subs L, L, #1 bgt _L4_M1_22 - + _L4_M1_40: - + ands L , K1, #7 // L = L % 8 ble _L4_M1_100 @@ -1244,7 +1244,7 @@ _L4_M1_42: subs L, L, #1 bgt _L4_M1_42 - + _L4_M1_100: SAVE1x4 @@ -1297,7 +1297,7 @@ _L2_BEGIN: tst J , #2 ble _L1_BEGIN - + ldr CO1, C // CO1 = C ldr r4 , LDC lsl r4 , r4 , #1 // LDC * 2 @@ -1375,10 +1375,10 @@ _L2_M4_22: subs L, L, #1 bgt _L2_M4_22 - + _L2_M4_40: - + ands L , K1, #7 // L = L % 8 ble _L2_M4_100 @@ -1388,7 +1388,7 @@ _L2_M4_42: subs L, L, #1 bgt _L2_M4_42 - + _L2_M4_100: SAVE4x2 @@ -1480,10 +1480,10 @@ _L2_M2_22: subs L, L, #1 bgt _L2_M2_22 - + _L2_M2_40: - + ands L , K1, #7 // L = L % 8 ble _L2_M2_100 @@ -1493,7 +1493,7 @@ _L2_M2_42: subs L, L, #1 bgt _L2_M2_42 - + _L2_M2_100: SAVE2x2 @@ -1577,10 +1577,10 @@ _L2_M1_22: subs L, L, #1 bgt _L2_M1_22 - + _L2_M1_40: - + ands L , K1, #7 // L = L % 8 ble _L2_M1_100 @@ -1590,7 +1590,7 @@ _L2_M1_42: subs L, L, #1 bgt _L2_M1_42 - + _L2_M1_100: SAVE1x2 @@ -1638,7 +1638,7 @@ _L1_BEGIN: tst J , #1 ble _L999 - + ldr CO1, C // CO1 = C ldr r4 , LDC add r3 , r4, CO1 @@ -1715,10 +1715,10 @@ _L1_M4_22: subs L, L, #1 bgt _L1_M4_22 - + _L1_M4_40: - + ands L , K1, #7 // L = L % 8 ble _L1_M4_100 @@ -1728,7 +1728,7 @@ _L1_M4_42: subs L, L, #1 bgt _L1_M4_42 - + _L1_M4_100: SAVE4x1 @@ -1820,10 +1820,10 @@ _L1_M2_22: subs L, L, #1 bgt _L1_M2_22 - + _L1_M2_40: - + ands L , K1, #7 // L = L % 8 ble _L1_M2_100 @@ -1833,7 +1833,7 @@ _L1_M2_42: subs L, L, #1 bgt _L1_M2_42 - + _L1_M2_100: SAVE2x1 @@ -1917,10 +1917,10 @@ _L1_M1_22: subs L, L, #1 bgt _L1_M1_22 - + _L1_M1_40: - + ands L , K1, #7 // L = L % 8 ble _L1_M1_100 @@ -1930,7 +1930,7 @@ _L1_M1_42: subs L, L, #1 bgt _L1_M1_42 - + _L1_M1_100: SAVE1x1 diff --git a/kernel/arm/gemv_n.c b/kernel/arm/gemv_n.c index a29508093..ef61b245b 100644 --- a/kernel/arm/gemv_n.c +++ b/kernel/arm/gemv_n.c @@ -63,5 +63,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO } return(0); } - + diff --git a/kernel/arm/gemv_n_vfp.S b/kernel/arm/gemv_n_vfp.S index f1cf9a05e..505033c18 100644 --- a/kernel/arm/gemv_n_vfp.S +++ b/kernel/arm/gemv_n_vfp.S @@ -594,7 +594,7 @@ gemvn_kernel_F1X1: ldr AO1, A add r3, AO1, #SIZE str r3, A - + ldr XO , X INIT_F1 @@ -694,7 +694,7 @@ gemvn_kernel_S1X1: ldr AO1, A add r3, AO1, #SIZE str r3, A - + ldr XO , X INIT_S1 diff --git a/kernel/arm/gemv_n_vfpv3.S b/kernel/arm/gemv_n_vfpv3.S index e031c331e..0e9ba0c9c 100644 --- a/kernel/arm/gemv_n_vfpv3.S +++ b/kernel/arm/gemv_n_vfpv3.S @@ -635,7 +635,7 @@ gemvn_kernel_F1X1: ldr AO1, A add r3, AO1, #SIZE str r3, A - + ldr XO , X INIT_F1 @@ -735,7 +735,7 @@ gemvn_kernel_S1X1: ldr AO1, A add r3, AO1, #SIZE str r3, A - + ldr XO , X INIT_S1 diff --git a/kernel/arm/gemv_t.c b/kernel/arm/gemv_t.c index f94db4085..169047b72 100644 --- a/kernel/arm/gemv_t.c +++ b/kernel/arm/gemv_t.c @@ -64,5 +64,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO return(0); } - + diff --git a/kernel/arm/iamax.c b/kernel/arm/iamax.c index 3b7fe1cb1..d211776e9 100644 --- a/kernel/arm/iamax.c +++ b/kernel/arm/iamax.c @@ -61,7 +61,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) while(i < n) { - if( ABS(x[ix]) > ABS(maxf) ) + if( ABS(x[ix]) > ABS(maxf) ) { max = i; maxf = ABS(x[ix]); @@ -71,5 +71,5 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } return(max+1); } - + diff --git a/kernel/arm/iamax_vfp.S b/kernel/arm/iamax_vfp.S index 1d7344898..f50c28e42 100644 --- a/kernel/arm/iamax_vfp.S +++ b/kernel/arm/iamax_vfp.S @@ -354,7 +354,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp INC_X, #0 beq iamax_kernel_L999 - + cmp INC_X, #1 bne iamax_kernel_S_BEGIN diff --git a/kernel/arm/iamin.c b/kernel/arm/iamin.c index fdb5d7a10..7efce19b1 100644 --- a/kernel/arm/iamin.c +++ b/kernel/arm/iamin.c @@ -61,7 +61,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) while(i < n) { - if( ABS(x[ix]) < ABS(minf) ) + if( ABS(x[ix]) < ABS(minf) ) { min = i; minf = ABS(x[ix]); @@ -71,5 +71,5 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } return(min+1); } - + diff --git a/kernel/arm/imax.c b/kernel/arm/imax.c index e3e4b9a6c..28022f67b 100644 --- a/kernel/arm/imax.c +++ b/kernel/arm/imax.c @@ -53,7 +53,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) while(i < n) { - if( x[ix] > maxf ) + if( x[ix] > maxf ) { max = i; maxf = x[ix]; @@ -63,5 +63,5 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } return(max+1); } - + diff --git a/kernel/arm/imin.c b/kernel/arm/imin.c index fbcadc2fd..fe8aa962a 100644 --- a/kernel/arm/imin.c +++ b/kernel/arm/imin.c @@ -28,8 +28,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /************************************************************************************** * 2013/08/19 Saar -* BLASTEST float -* BLASTEST double +* BLASTEST float +* BLASTEST double * **************************************************************************************/ @@ -51,7 +51,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) while(i < n) { - if( x[ix] > minf ) + if( x[ix] > minf ) { min = i; minf = x[ix]; @@ -61,5 +61,5 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } return(min+1); } - + diff --git a/kernel/arm/izamax.c b/kernel/arm/izamax.c index a6ba86388..54bb35149 100644 --- a/kernel/arm/izamax.c +++ b/kernel/arm/izamax.c @@ -66,7 +66,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) while(i < n) { - if( CABS1(x,ix) > CABS1(maxf,0) ) + if( CABS1(x,ix) > CABS1(maxf,0) ) { max = i; maxf[0] = ABS(x[ix]); @@ -77,5 +77,5 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } return(max+1); } - + diff --git a/kernel/arm/izamin.c b/kernel/arm/izamin.c index 45c2a7c9c..448b3cbfc 100644 --- a/kernel/arm/izamin.c +++ b/kernel/arm/izamin.c @@ -66,7 +66,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) while(i < n) { - if( CABS1(x,ix) < CABS1(minf,0) ) + if( CABS1(x,ix) < CABS1(minf,0) ) { min = i; minf[0] = ABS(x[ix]); @@ -77,5 +77,5 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } return(min+1); } - + diff --git a/kernel/arm/max.c b/kernel/arm/max.c index 3239e3408..04529dbd6 100644 --- a/kernel/arm/max.c +++ b/kernel/arm/max.c @@ -50,7 +50,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) while(i < n) { - if( x[ix] > maxf ) + if( x[ix] > maxf ) { maxf = x[ix]; } @@ -59,5 +59,5 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } return(maxf); } - + diff --git a/kernel/arm/min.c b/kernel/arm/min.c index de4c4719a..63c704c79 100644 --- a/kernel/arm/min.c +++ b/kernel/arm/min.c @@ -50,7 +50,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) while(i < n) { - if( x[ix] < minf ) + if( x[ix] < minf ) { minf = x[ix]; } @@ -59,5 +59,5 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } return(minf); } - + diff --git a/kernel/arm/nrm2.c b/kernel/arm/nrm2.c index d65c5a410..b4d810d53 100644 --- a/kernel/arm/nrm2.c +++ b/kernel/arm/nrm2.c @@ -63,7 +63,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) n *= inc_x; while(i < n) { - + if ( x[i] != 0.0 ) { absxi = ABS( x[i] ); @@ -75,7 +75,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) else { ssq += ( absxi/scale ) * ( absxi/scale ); - } + } } i += inc_x; @@ -84,5 +84,5 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) return(scale); } - + diff --git a/kernel/arm/nrm2_vfp.S b/kernel/arm/nrm2_vfp.S index 4c62917b9..d80179a11 100644 --- a/kernel/arm/nrm2_vfp.S +++ b/kernel/arm/nrm2_vfp.S @@ -61,8 +61,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmiad X!, { d4 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_F1_NEXT_\@ - vabs.f64 d4, d4 + beq KERNEL_F1_NEXT_\@ + vabs.f64 d4, d4 vcmpe.f64 d0, d4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale @@ -98,8 +98,8 @@ KERNEL_F1_NEXT_\@: fldmiad X, { d4 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_S1_NEXT - vabs.f64 d4, d4 + beq KERNEL_S1_NEXT + vabs.f64 d4, d4 vcmpe.f64 d0, d4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale @@ -124,8 +124,8 @@ KERNEL_S1_NEXT: fldmias X!, { s4 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_F1_NEXT_\@ - vabs.f32 s4, s4 + beq KERNEL_F1_NEXT_\@ + vabs.f32 s4, s4 vcmpe.f32 s0, s4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale @@ -161,8 +161,8 @@ KERNEL_F1_NEXT_\@: fldmias X, { s4 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_S1_NEXT - vabs.f32 s4, s4 + beq KERNEL_S1_NEXT + vabs.f32 s4, s4 vcmpe.f32 s0, s4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale @@ -195,8 +195,8 @@ KERNEL_S1_NEXT: vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_F1_NEXT_\@ - vabs.f64 d4, d4 + beq KERNEL_F1_NEXT_\@ + vabs.f64 d4, d4 vcmpe.f64 d0, d4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale @@ -212,8 +212,8 @@ KERNEL_F1_NEXT_\@: vcmpe.f64 d5, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_F1_END_\@ - vabs.f64 d5, d5 + beq KERNEL_F1_END_\@ + vabs.f64 d5, d5 vcmpe.f64 d0, d5 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale @@ -253,8 +253,8 @@ KERNEL_F1_END_\@: vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_S1_NEXT_\@ - vabs.f64 d4, d4 + beq KERNEL_S1_NEXT_\@ + vabs.f64 d4, d4 vcmpe.f64 d0, d4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale @@ -270,8 +270,8 @@ KERNEL_S1_NEXT_\@: vcmpe.f64 d5, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_S1_END_\@ - vabs.f64 d5, d5 + beq KERNEL_S1_END_\@ + vabs.f64 d5, d5 vcmpe.f64 d0, d5 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale @@ -298,8 +298,8 @@ KERNEL_S1_END_\@: vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_F1_NEXT_\@ - vabs.f32 s4, s4 + beq KERNEL_F1_NEXT_\@ + vabs.f32 s4, s4 vcmpe.f32 s0, s4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale @@ -315,8 +315,8 @@ KERNEL_F1_NEXT_\@: vcmpe.f32 s5, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_F1_END_\@ - vabs.f32 s5, s5 + beq KERNEL_F1_END_\@ + vabs.f32 s5, s5 vcmpe.f32 s0, s5 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f32 s2 , s5, s0 // scale >= x ? x / scale @@ -354,8 +354,8 @@ KERNEL_F1_END_\@: vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_S1_NEXT_\@ - vabs.f32 s4, s4 + beq KERNEL_S1_NEXT_\@ + vabs.f32 s4, s4 vcmpe.f32 s0, s4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale @@ -371,8 +371,8 @@ KERNEL_S1_NEXT_\@: vcmpe.f32 s5, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_S1_END_\@ - vabs.f32 s5, s5 + beq KERNEL_S1_END_\@ + vabs.f32 s5, s5 vcmpe.f32 s0, s5 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f32 s2 , s5, s0 // scale >= x ? x / scale @@ -448,13 +448,13 @@ nrm2_begin: #if defined(DOUBLE) vsub.f64 d0 , d0 , d0 // scale=0.0 vldr.64 d1 , znrm2_one // ssq=1.0 - vmov.f64 d7 , d1 // value 1.0 - vmov.f64 d6 , d0 // value 0.0 + vmov.f64 d7 , d1 // value 1.0 + vmov.f64 d6 , d0 // value 0.0 #else vsub.f32 s0 , s0 , s0 // scale=0.0 vldr.32 s1 , cnrm2_one // ssq=1.0 vmov.f32 s7 , s1 // value 1.0 - vmov.f32 s6 , s0 // value 0.0 + vmov.f32 s6 , s0 // value 0.0 #endif #else @@ -462,13 +462,13 @@ nrm2_begin: #if defined(DOUBLE) vsub.f64 d0 , d0 , d0 // scale=0.0 vldr.64 d1 , dnrm2_one // ssq=1.0 - vmov.f64 d7 , d1 // value 1.0 - vmov.f64 d6 , d0 // value 0.0 + vmov.f64 d7 , d1 // value 1.0 + vmov.f64 d6 , d0 // value 0.0 #else vsub.f32 s0 , s0 , s0 // scale=0.0 vldr.32 s1 , snrm2_one // ssq=1.0 vmov.f32 s7 , s1 // value 1.0 - vmov.f32 s6 , s0 // value 0.0 + vmov.f32 s6 , s0 // value 0.0 #endif @@ -481,7 +481,7 @@ nrm2_begin: cmp INC_X, #0 beq nrm2_kernel_L999 - + cmp INC_X, #1 bne nrm2_kernel_S_BEGIN diff --git a/kernel/arm/nrm2_vfpv3.S b/kernel/arm/nrm2_vfpv3.S index b56f8b038..34b251e9a 100644 --- a/kernel/arm/nrm2_vfpv3.S +++ b/kernel/arm/nrm2_vfpv3.S @@ -61,8 +61,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmiad X!, { d4 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_F1_NEXT_\@ - vabs.f64 d4, d4 + beq KERNEL_F1_NEXT_\@ + vabs.f64 d4, d4 vcmpe.f64 d0, d4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale @@ -98,8 +98,8 @@ KERNEL_F1_NEXT_\@: fldmiad X, { d4 } vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_S1_NEXT - vabs.f64 d4, d4 + beq KERNEL_S1_NEXT + vabs.f64 d4, d4 vcmpe.f64 d0, d4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale @@ -124,8 +124,8 @@ KERNEL_S1_NEXT: fldmias X!, { s4 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_F1_NEXT_\@ - vabs.f32 s4, s4 + beq KERNEL_F1_NEXT_\@ + vabs.f32 s4, s4 vcmpe.f32 s0, s4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale @@ -161,8 +161,8 @@ KERNEL_F1_NEXT_\@: fldmias X, { s4 } vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_S1_NEXT - vabs.f32 s4, s4 + beq KERNEL_S1_NEXT + vabs.f32 s4, s4 vcmpe.f32 s0, s4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale @@ -195,8 +195,8 @@ KERNEL_S1_NEXT: vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_F1_NEXT_\@ - vabs.f64 d4, d4 + beq KERNEL_F1_NEXT_\@ + vabs.f64 d4, d4 vcmpe.f64 d0, d4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale @@ -212,8 +212,8 @@ KERNEL_F1_NEXT_\@: vcmpe.f64 d5, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_F1_END_\@ - vabs.f64 d5, d5 + beq KERNEL_F1_END_\@ + vabs.f64 d5, d5 vcmpe.f64 d0, d5 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale @@ -253,8 +253,8 @@ KERNEL_F1_END_\@: vcmpe.f64 d4, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_S1_NEXT_\@ - vabs.f64 d4, d4 + beq KERNEL_S1_NEXT_\@ + vabs.f64 d4, d4 vcmpe.f64 d0, d4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale @@ -270,8 +270,8 @@ KERNEL_S1_NEXT_\@: vcmpe.f64 d5, d6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_S1_END_\@ - vabs.f64 d5, d5 + beq KERNEL_S1_END_\@ + vabs.f64 d5, d5 vcmpe.f64 d0, d5 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale @@ -298,8 +298,8 @@ KERNEL_S1_END_\@: vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_F1_NEXT_\@ - vabs.f32 s4, s4 + beq KERNEL_F1_NEXT_\@ + vabs.f32 s4, s4 vcmpe.f32 s0, s4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale @@ -315,8 +315,8 @@ KERNEL_F1_NEXT_\@: vcmpe.f32 s5, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_F1_END_\@ - vabs.f32 s5, s5 + beq KERNEL_F1_END_\@ + vabs.f32 s5, s5 vcmpe.f32 s0, s5 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f32 s2 , s5, s0 // scale >= x ? x / scale @@ -354,8 +354,8 @@ KERNEL_F1_END_\@: vcmpe.f32 s4, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_S1_NEXT_\@ - vabs.f32 s4, s4 + beq KERNEL_S1_NEXT_\@ + vabs.f32 s4, s4 vcmpe.f32 s0, s4 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale @@ -371,8 +371,8 @@ KERNEL_S1_NEXT_\@: vcmpe.f32 s5, s6 // compare with 0.0 vmrs APSR_nzcv, fpscr - beq KERNEL_S1_END_\@ - vabs.f32 s5, s5 + beq KERNEL_S1_END_\@ + vabs.f32 s5, s5 vcmpe.f32 s0, s5 // compare with scale vmrs APSR_nzcv, fpscr vdivge.f32 s2 , s5, s0 // scale >= x ? x / scale @@ -407,13 +407,13 @@ KERNEL_S1_END_\@: #if defined(DOUBLE) vsub.f64 d0 , d0 , d0 // scale=0.0 vmov.f64 d1 , #1.0 // ssq=1.0 - vmov.f64 d7 , d1 // value 1.0 - vmov.f64 d6 , d0 // value 0.0 + vmov.f64 d7 , d1 // value 1.0 + vmov.f64 d6 , d0 // value 0.0 #else vsub.f32 s0 , s0 , s0 // scale=0.0 vmov.f32 s1 , #1.0 // ssq=1.0 vmov.f32 s7 , s1 // value 1.0 - vmov.f32 s6 , s0 // value 0.0 + vmov.f32 s6 , s0 // value 0.0 #endif @@ -424,7 +424,7 @@ KERNEL_S1_END_\@: cmp INC_X, #0 beq nrm2_kernel_L999 - + cmp INC_X, #1 bne nrm2_kernel_S_BEGIN diff --git a/kernel/arm/omatcopy_cn.c b/kernel/arm/omatcopy_cn.c index e46ddaede..4d11b9125 100644 --- a/kernel/arm/omatcopy_cn.c +++ b/kernel/arm/omatcopy_cn.c @@ -42,7 +42,7 @@ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLO if ( rows <= 0 ) return(0); if ( cols <= 0 ) return(0); - + aptr = a; bptr = b; @@ -52,10 +52,10 @@ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLO { for(j=0; j CABS1(maxf,0) ) + if( CABS1(x,ix) > CABS1(maxf,0) ) { max = i; maxf[0] = ABS(x[ix]); @@ -77,5 +77,5 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } return(CABS1(maxf,0)); } - + diff --git a/kernel/arm/zamin.c b/kernel/arm/zamin.c index 6956ced0e..9e26a66d0 100644 --- a/kernel/arm/zamin.c +++ b/kernel/arm/zamin.c @@ -66,7 +66,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) while(i < n) { - if( CABS1(x,ix) < CABS1(minf,0) ) + if( CABS1(x,ix) < CABS1(minf,0) ) { min = i; minf[0] = ABS(x[ix]); @@ -77,5 +77,5 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } return(CABS1(minf,0)); } - + diff --git a/kernel/arm/zasum.c b/kernel/arm/zasum.c index 13acfc0f0..0c5d69e35 100644 --- a/kernel/arm/zasum.c +++ b/kernel/arm/zasum.c @@ -67,5 +67,5 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } return(sumf); } - + diff --git a/kernel/arm/zaxpby.c b/kernel/arm/zaxpby.c index 873273f2d..2e0c2940d 100644 --- a/kernel/arm/zaxpby.c +++ b/kernel/arm/zaxpby.c @@ -113,5 +113,5 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL return(0); } - + diff --git a/kernel/arm/zaxpy.c b/kernel/arm/zaxpy.c index 28a4380fb..929ee8b54 100644 --- a/kernel/arm/zaxpy.c +++ b/kernel/arm/zaxpy.c @@ -68,5 +68,5 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, return(0); } - + diff --git a/kernel/arm/zcopy.c b/kernel/arm/zcopy.c index 654711240..f720d6ee5 100644 --- a/kernel/arm/zcopy.c +++ b/kernel/arm/zcopy.c @@ -59,5 +59,5 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) return(0); } - + diff --git a/kernel/arm/zcopy_vfp.S b/kernel/arm/zcopy_vfp.S index 06f892446..48aee4ce0 100644 --- a/kernel/arm/zcopy_vfp.S +++ b/kernel/arm/zcopy_vfp.S @@ -135,7 +135,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov Y, OLD_Y ldr INC_Y, OLD_INC_Y - + cmp N, #0 ble zcopy_kernel_L999 diff --git a/kernel/arm/zdot.c b/kernel/arm/zdot.c index 096ced9db..469487531 100644 --- a/kernel/arm/zdot.c +++ b/kernel/arm/zdot.c @@ -43,7 +43,7 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in BLASLONG ix=0,iy=0; FLOAT dot[2]; FLOAT _Complex result; - + dot[0]=0.0; dot[1]=0.0; @@ -74,5 +74,5 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in return(result); } - + diff --git a/kernel/arm/zdot_vfp.S b/kernel/arm/zdot_vfp.S index 1a78b5aec..622169bb9 100644 --- a/kernel/arm/zdot_vfp.S +++ b/kernel/arm/zdot_vfp.S @@ -189,7 +189,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov Y, OLD_Y ldr INC_Y, OLD_INC_Y - + vsub.f64 d0 , d0 , d0 vsub.f64 d1 , d1 , d1 vsub.f64 d2 , d2 , d2 @@ -271,11 +271,11 @@ zdot_kernel_L999: vldm r3, { d8 - d15} // restore floating point registers #if !defined(CONJ) - vsub.f64 d0 , d0, d2 - vadd.f64 d1 , d1, d3 + vsub.f64 d0 , d0, d2 + vadd.f64 d1 , d1, d3 #else - vadd.f64 d0 , d0, d2 - vsub.f64 d1 , d1, d3 + vadd.f64 d0 , d0, d2 + vsub.f64 d1 , d1, d3 #endif sub sp, fp, #24 diff --git a/kernel/arm/zgemm_kernel_2x2_vfp.S b/kernel/arm/zgemm_kernel_2x2_vfp.S index 8a5401858..f4134eaf6 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfp.S +++ b/kernel/arm/zgemm_kernel_2x2_vfp.S @@ -81,7 +81,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define B_PRE 96 #define C_PRE 64 -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define KMAC_R fnmacd #define KMAC_I fmacd @@ -881,7 +881,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ble zgemm_kernel_L1_BEGIN zgemm_kernel_L2_BEGIN: - + ldr CO1, C // CO1 = C ldr r4 , LDC lsl r4 , r4 , #1 // LDC * 2 @@ -950,7 +950,7 @@ zgemm_kernel_L2_M2_22: b zgemm_kernel_L2_M2_44 - + zgemm_kernel_L2_M2_30: tst L, #3 ble zgemm_kernel_L2_M2_40 @@ -1015,7 +1015,7 @@ zgemm_kernel_L2_M2_46: subs L, L, #1 bne zgemm_kernel_L2_M2_46 - + zgemm_kernel_L2_M2_100: SAVE2x2 @@ -1054,10 +1054,10 @@ zgemm_kernel_L2_M1_22: subs L, L, #1 bgt zgemm_kernel_L2_M1_22 - + zgemm_kernel_L2_M1_40: - + ands L , K1, #7 // L = L % 8 ble zgemm_kernel_L2_M1_100 @@ -1067,7 +1067,7 @@ zgemm_kernel_L2_M1_42: subs L, L, #1 bgt zgemm_kernel_L2_M1_42 - + zgemm_kernel_L2_M1_100: SAVE1x2 @@ -1080,7 +1080,7 @@ zgemm_kernel_L2_END: lsl r4, r4, #5 // k * 2 * 8 * 2 add r3, r3, r4 // B = B + K * 4 * 8 mov BC, r3 - + subs J , #1 // j-- bgt zgemm_kernel_L2_BEGIN @@ -1094,7 +1094,7 @@ zgemm_kernel_L1_BEGIN: tst J , #1 ble zgemm_kernel_L999 - + ldr CO1, C // CO1 = C ldr r4 , LDC add r3 , r4, CO1 @@ -1158,7 +1158,7 @@ zgemm_kernel_L1_M2_22: b zgemm_kernel_L1_M2_44 - + zgemm_kernel_L1_M2_30: tst L, #3 ble zgemm_kernel_L1_M2_40 @@ -1223,7 +1223,7 @@ zgemm_kernel_L1_M2_46: subs L, L, #1 bne zgemm_kernel_L1_M2_46 - + zgemm_kernel_L1_M2_100: SAVE2x1 @@ -1262,10 +1262,10 @@ zgemm_kernel_L1_M1_22: subs L, L, #1 bgt zgemm_kernel_L1_M1_22 - + zgemm_kernel_L1_M1_40: - + ands L , K1, #7 // L = L % 8 ble zgemm_kernel_L1_M1_100 @@ -1275,7 +1275,7 @@ zgemm_kernel_L1_M1_42: subs L, L, #1 bgt zgemm_kernel_L1_M1_42 - + zgemm_kernel_L1_M1_100: SAVE1x1 diff --git a/kernel/arm/zgemm_kernel_2x2_vfpv3.S b/kernel/arm/zgemm_kernel_2x2_vfpv3.S index 2d35028a2..29c3f4582 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/zgemm_kernel_2x2_vfpv3.S @@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define B_PRE 96 #define C_PRE 64 -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define FADD_R fsubd #define FADD_I faddd @@ -927,7 +927,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ble zgemm_kernel_L1_BEGIN zgemm_kernel_L2_BEGIN: - + ldr CO1, C // CO1 = C ldr r4 , LDC lsl r4 , r4 , #1 // LDC * 2 @@ -996,7 +996,7 @@ zgemm_kernel_L2_M2_22: b zgemm_kernel_L2_M2_44 - + zgemm_kernel_L2_M2_30: tst L, #3 ble zgemm_kernel_L2_M2_40 @@ -1061,7 +1061,7 @@ zgemm_kernel_L2_M2_46: subs L, L, #1 bne zgemm_kernel_L2_M2_46 - + zgemm_kernel_L2_M2_100: SAVE2x2 @@ -1100,10 +1100,10 @@ zgemm_kernel_L2_M1_22: subs L, L, #1 bgt zgemm_kernel_L2_M1_22 - + zgemm_kernel_L2_M1_40: - + ands L , K1, #7 // L = L % 8 ble zgemm_kernel_L2_M1_100 @@ -1113,7 +1113,7 @@ zgemm_kernel_L2_M1_42: subs L, L, #1 bgt zgemm_kernel_L2_M1_42 - + zgemm_kernel_L2_M1_100: SAVE1x2 @@ -1126,7 +1126,7 @@ zgemm_kernel_L2_END: lsl r4, r4, #5 // k * 2 * 8 * 2 add r3, r3, r4 // B = B + K * 4 * 8 mov BC, r3 - + subs J , #1 // j-- bgt zgemm_kernel_L2_BEGIN @@ -1140,7 +1140,7 @@ zgemm_kernel_L1_BEGIN: tst J , #1 ble zgemm_kernel_L999 - + ldr CO1, C // CO1 = C ldr r4 , LDC add r3 , r4, CO1 @@ -1204,7 +1204,7 @@ zgemm_kernel_L1_M2_22: b zgemm_kernel_L1_M2_44 - + zgemm_kernel_L1_M2_30: tst L, #3 ble zgemm_kernel_L1_M2_40 @@ -1269,7 +1269,7 @@ zgemm_kernel_L1_M2_46: subs L, L, #1 bne zgemm_kernel_L1_M2_46 - + zgemm_kernel_L1_M2_100: SAVE2x1 @@ -1308,10 +1308,10 @@ zgemm_kernel_L1_M1_22: subs L, L, #1 bgt zgemm_kernel_L1_M1_22 - + zgemm_kernel_L1_M1_40: - + ands L , K1, #7 // L = L % 8 ble zgemm_kernel_L1_M1_100 @@ -1321,7 +1321,7 @@ zgemm_kernel_L1_M1_42: subs L, L, #1 bgt zgemm_kernel_L1_M1_42 - + zgemm_kernel_L1_M1_100: SAVE1x1 diff --git a/kernel/arm/zgemm_ncopy_2_vfp.S b/kernel/arm/zgemm_ncopy_2_vfp.S index 5ff8ee299..b3fa225bb 100644 --- a/kernel/arm/zgemm_ncopy_2_vfp.S +++ b/kernel/arm/zgemm_ncopy_2_vfp.S @@ -152,7 +152,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. sub r4, fp, #128 vstm r4, { d8 - d15} // store floating point registers - ldr BO, B + ldr BO, B /*********************************************************************************************/ @@ -177,8 +177,8 @@ zgemm_ncopy_L2_M2_20: subs I , I , #1 bne zgemm_ncopy_L2_M2_20 - - + + zgemm_ncopy_L2_M2_40: ands I, M , #1 @@ -190,7 +190,7 @@ zgemm_ncopy_L2_M2_60: subs I , I , #1 bne zgemm_ncopy_L2_M2_60 - + zgemm_ncopy_L2_M2_END: @@ -221,8 +221,8 @@ zgemm_ncopy_L1_M2_20: subs I , I , #1 bne zgemm_ncopy_L1_M2_20 - - + + zgemm_ncopy_L1_M2_40: ands I, M , #1 @@ -234,7 +234,7 @@ zgemm_ncopy_L1_M2_60: subs I , I , #1 bne zgemm_ncopy_L1_M2_60 - + zgemm_ncopy_L1_M2_END: diff --git a/kernel/arm/zgemv_n.c b/kernel/arm/zgemv_n.c index dc2ffa0d2..b9b03f792 100644 --- a/kernel/arm/zgemv_n.c +++ b/kernel/arm/zgemv_n.c @@ -102,7 +102,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, return(0); } - + inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; @@ -153,5 +153,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, return(0); } - + diff --git a/kernel/arm/zgemv_n_vfp.S b/kernel/arm/zgemv_n_vfp.S index 3b51d5553..d4cab090a 100644 --- a/kernel/arm/zgemv_n_vfp.S +++ b/kernel/arm/zgemv_n_vfp.S @@ -553,7 +553,7 @@ zgemvn_kernel_F1X1: ldr AO1, A add r3, AO1, #16 str r3, A - + ldr XO , X INIT_F1 @@ -653,7 +653,7 @@ zgemvn_kernel_S1X1: ldr AO1, A add r3, AO1, #16 str r3, A - + ldr XO , X INIT_S1 diff --git a/kernel/arm/zgemv_t.c b/kernel/arm/zgemv_t.c index 6161dbaf2..1239cf3f7 100644 --- a/kernel/arm/zgemv_t.c +++ b/kernel/arm/zgemv_t.c @@ -88,7 +88,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, a_ptr += lda2; iy += 2; - } + } return(0); diff --git a/kernel/arm/znrm2.c b/kernel/arm/znrm2.c index d68e3021f..c590095e7 100644 --- a/kernel/arm/znrm2.c +++ b/kernel/arm/znrm2.c @@ -64,7 +64,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) n *= inc_x2; while(i < n) { - + if ( x[i] != 0.0 ) { temp = ABS( x[i] ); @@ -76,10 +76,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) else { ssq += ( temp / scale ) * ( temp / scale ); - } + } } - + if ( x[i+1] != 0.0 ) { temp = ABS( x[i+1] ); @@ -91,7 +91,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) else { ssq += ( temp / scale ) * ( temp / scale ); - } + } } @@ -102,5 +102,5 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) return(scale); } - + diff --git a/kernel/arm/zomatcopy_cn.c b/kernel/arm/zomatcopy_cn.c index 28bbb6127..f5a7a6284 100644 --- a/kernel/arm/zomatcopy_cn.c +++ b/kernel/arm/zomatcopy_cn.c @@ -42,7 +42,7 @@ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, if ( rows <= 0 ) return(0); if ( cols <= 0 ) return(0); - + aptr = a; bptr = b; @@ -56,15 +56,15 @@ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, for(j=0; j> 3); if (i > 0){ do { diff --git a/kernel/generic/gemm_ncopy_1.c b/kernel/generic/gemm_ncopy_1.c index e990de771..ac99037b9 100644 --- a/kernel/generic/gemm_ncopy_1.c +++ b/kernel/generic/gemm_ncopy_1.c @@ -55,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ a_offset += lda; i = (m >> 3); - + if (i > 0){ do { *(b_offset + 0) = *(a_offset1 + 0); @@ -73,7 +73,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ } i = (m & 7); - + if (i > 0){ do { *(b_offset + 0) = *(a_offset1 + 0); diff --git a/kernel/generic/gemm_ncopy_16.c b/kernel/generic/gemm_ncopy_16.c index 4a9269ec1..9bd40f121 100644 --- a/kernel/generic/gemm_ncopy_16.c +++ b/kernel/generic/gemm_ncopy_16.c @@ -60,7 +60,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset = a; boffset = b; - + j = (n >> 4); if (j > 0){ do{ @@ -81,7 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset15 = aoffset14 + lda; aoffset16 = aoffset15 + lda; aoffset += 16 * lda; - + i = (m >> 1); if (i > 0){ do{ @@ -89,7 +89,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); - + ctemp05 = *(aoffset3 + 0); ctemp06 = *(aoffset3 + 1); ctemp07 = *(aoffset4 + 0); @@ -99,7 +99,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp10 = *(aoffset5 + 1); ctemp11 = *(aoffset6 + 0); ctemp12 = *(aoffset6 + 1); - + ctemp13 = *(aoffset7 + 0); ctemp14 = *(aoffset7 + 1); ctemp15 = *(aoffset8 + 0); @@ -109,7 +109,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp18 = *(aoffset9 + 1); ctemp19 = *(aoffset10 + 0); ctemp20 = *(aoffset10 + 1); - + ctemp21 = *(aoffset11 + 0); ctemp22 = *(aoffset11 + 1); ctemp23 = *(aoffset12 + 0); @@ -119,7 +119,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp26 = *(aoffset13 + 1); ctemp27 = *(aoffset14 + 0); ctemp28 = *(aoffset14 + 1); - + ctemp29 = *(aoffset15 + 0); ctemp30 = *(aoffset15 + 1); ctemp31 = *(aoffset16 + 0); @@ -133,7 +133,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = ctemp11; *(boffset + 6) = ctemp13; *(boffset + 7) = ctemp15; - + *(boffset + 8) = ctemp17; *(boffset + 9) = ctemp19; *(boffset + 10) = ctemp21; @@ -142,7 +142,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 13) = ctemp27; *(boffset + 14) = ctemp29; *(boffset + 15) = ctemp31; - + *(boffset + 16) = ctemp02; *(boffset + 17) = ctemp04; *(boffset + 18) = ctemp06; @@ -193,7 +193,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp11 = *(aoffset6 + 0); ctemp13 = *(aoffset7 + 0); ctemp15 = *(aoffset8 + 0); - + ctemp17 = *(aoffset9 + 0); ctemp19 = *(aoffset10 + 0); ctemp21 = *(aoffset11 + 0); @@ -202,7 +202,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp27 = *(aoffset14 + 0); ctemp29 = *(aoffset15 + 0); ctemp31 = *(aoffset16 + 0); - + *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp03; *(boffset + 2) = ctemp05; @@ -211,7 +211,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = ctemp11; *(boffset + 6) = ctemp13; *(boffset + 7) = ctemp15; - + *(boffset + 8) = ctemp17; *(boffset + 9) = ctemp19; *(boffset + 10) = ctemp21; @@ -220,13 +220,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 13) = ctemp27; *(boffset + 14) = ctemp29; *(boffset + 15) = ctemp31; - + boffset += 16; } j--; }while(j > 0); } /* end of if(j > 0) */ - + if (n & 8){ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; @@ -237,7 +237,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset7 = aoffset6 + lda; aoffset8 = aoffset7 + lda; aoffset += 8 * lda; - + i = (m >> 1); if (i > 0){ do{ @@ -245,22 +245,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); - + ctemp05 = *(aoffset3 + 0); ctemp06 = *(aoffset3 + 1); ctemp07 = *(aoffset4 + 0); ctemp08 = *(aoffset4 + 1); - + ctemp09 = *(aoffset5 + 0); ctemp10 = *(aoffset5 + 1); ctemp11 = *(aoffset6 + 0); ctemp12 = *(aoffset6 + 1); - + ctemp13 = *(aoffset7 + 0); ctemp14 = *(aoffset7 + 1); ctemp15 = *(aoffset8 + 0); ctemp16 = *(aoffset8 + 1); - + *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp03; *(boffset + 2) = ctemp05; @@ -269,7 +269,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = ctemp11; *(boffset + 6) = ctemp13; *(boffset + 7) = ctemp15; - + *(boffset + 8) = ctemp02; *(boffset + 9) = ctemp04; *(boffset + 10) = ctemp06; @@ -278,7 +278,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 13) = ctemp12; *(boffset + 14) = ctemp14; *(boffset + 15) = ctemp16; - + aoffset1 += 2; aoffset2 += 2; aoffset3 += 2; @@ -287,13 +287,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset6 += 2; aoffset7 += 2; aoffset8 += 2; - + boffset += 16; - + i --; }while(i > 0); } - + if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp03 = *(aoffset2 + 0); @@ -303,7 +303,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp11 = *(aoffset6 + 0); ctemp13 = *(aoffset7 + 0); ctemp15 = *(aoffset8 + 0); - + *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp03; *(boffset + 2) = ctemp05; @@ -312,7 +312,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = ctemp11; *(boffset + 6) = ctemp13; *(boffset + 7) = ctemp15; - + boffset += 8; } } @@ -323,7 +323,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset3 = aoffset2 + lda; aoffset4 = aoffset3 + lda; aoffset += 4 * lda; - + i = (m >> 1); if (i > 0){ do{ @@ -331,12 +331,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); - + ctemp05 = *(aoffset3 + 0); ctemp06 = *(aoffset3 + 1); ctemp07 = *(aoffset4 + 0); ctemp08 = *(aoffset4 + 1); - + *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp03; *(boffset + 2) = ctemp05; @@ -345,23 +345,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = ctemp04; *(boffset + 6) = ctemp06; *(boffset + 7) = ctemp08; - + aoffset1 += 2; aoffset2 += 2; aoffset3 += 2; aoffset4 += 2; boffset += 8; - + i --; }while(i > 0); } - + if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp03 = *(aoffset2 + 0); ctemp05 = *(aoffset3 + 0); ctemp07 = *(aoffset4 + 0); - + *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp03; *(boffset + 2) = ctemp05; @@ -374,7 +374,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset += 2 * lda; - + i = (m >> 1); if (i > 0){ do{ @@ -382,7 +382,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); - + *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp03; *(boffset + 2) = ctemp02; @@ -391,15 +391,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset1 += 2; aoffset2 += 2; boffset += 4; - + i --; }while(i > 0); } - + if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp03 = *(aoffset2 + 0); - + *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp03; boffset += 2; @@ -408,26 +408,26 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ if (n & 1){ aoffset1 = aoffset; - + i = (m >> 1); if (i > 0){ do{ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); - + *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; aoffset1 += 2; boffset += 2; - + i --; }while(i > 0); } - + if (m & 1){ ctemp01 = *(aoffset1 + 0); - + *(boffset + 0) = ctemp01; boffset += 1; } diff --git a/kernel/generic/gemm_ncopy_2.c b/kernel/generic/gemm_ncopy_2.c index 0ec807cc4..b728c713f 100644 --- a/kernel/generic/gemm_ncopy_2.c +++ b/kernel/generic/gemm_ncopy_2.c @@ -56,7 +56,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ a_offset += 2 * lda; i = (m >> 2); - + if (i > 0){ do { *(b_offset + 0) = *(a_offset1 + 0); @@ -75,7 +75,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ } i = (m & 3); - + if (i > 0){ do { *(b_offset + 0) = *(a_offset1 + 0); @@ -108,9 +108,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ i --; } while (i > 0); } - + i = (m & 7); - + if (i > 0){ do { *(b_offset + 0) = *(a_offset + 0); diff --git a/kernel/generic/gemm_ncopy_4.c b/kernel/generic/gemm_ncopy_4.c index 1ecb93c65..1551b03fc 100644 --- a/kernel/generic/gemm_ncopy_4.c +++ b/kernel/generic/gemm_ncopy_4.c @@ -51,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ a_offset = a; b_offset = b; - + j = (n >> 2); if (j > 0){ do{ @@ -60,7 +60,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ a_offset3 = a_offset2 + lda; a_offset4 = a_offset3 + lda; a_offset += 4 * lda; - + i = (m >> 2); if (i > 0){ do{ @@ -68,47 +68,47 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); - + ctemp5 = *(a_offset2 + 0); ctemp6 = *(a_offset2 + 1); ctemp7 = *(a_offset2 + 2); ctemp8 = *(a_offset2 + 3); - + ctemp9 = *(a_offset3 + 0); ctemp10 = *(a_offset3 + 1); ctemp11 = *(a_offset3 + 2); ctemp12 = *(a_offset3 + 3); - + ctemp13 = *(a_offset4 + 0); ctemp14 = *(a_offset4 + 1); ctemp15 = *(a_offset4 + 2); ctemp16 = *(a_offset4 + 3); - + *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp5; *(b_offset + 2) = ctemp9; *(b_offset + 3) = ctemp13; - + *(b_offset + 4) = ctemp2; *(b_offset + 5) = ctemp6; *(b_offset + 6) = ctemp10; *(b_offset + 7) = ctemp14; - + *(b_offset + 8) = ctemp3; *(b_offset + 9) = ctemp7; *(b_offset + 10) = ctemp11; *(b_offset + 11) = ctemp15; - + *(b_offset + 12) = ctemp4; *(b_offset + 13) = ctemp8; *(b_offset + 14) = ctemp12; *(b_offset + 15) = ctemp16; - + a_offset1 += 4; a_offset2 += 4; a_offset3 += 4; a_offset4 += 4; - + b_offset += 16; i --; }while(i > 0); @@ -121,17 +121,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp5 = *(a_offset2 + 0); ctemp9 = *(a_offset3 + 0); ctemp13 = *(a_offset4 + 0); - + *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp5; *(b_offset + 2) = ctemp9; *(b_offset + 3) = ctemp13; - + a_offset1 ++; a_offset2 ++; a_offset3 ++; a_offset4 ++; - + b_offset += 4; i --; }while(i > 0); @@ -139,12 +139,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ j--; }while(j > 0); } /* end of if(j > 0) */ - + if (n & 2){ a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset += 2 * lda; - + i = (m >> 2); if (i > 0){ do{ @@ -152,38 +152,38 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); - + ctemp5 = *(a_offset2 + 0); ctemp6 = *(a_offset2 + 1); ctemp7 = *(a_offset2 + 2); ctemp8 = *(a_offset2 + 3); - + *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp5; *(b_offset + 2) = ctemp2; *(b_offset + 3) = ctemp6; - + *(b_offset + 4) = ctemp3; *(b_offset + 5) = ctemp7; *(b_offset + 6) = ctemp4; *(b_offset + 7) = ctemp8; - + a_offset1 += 4; a_offset2 += 4; b_offset += 8; i --; }while(i > 0); } - + i = (m & 3); if (i > 0){ do{ ctemp1 = *(a_offset1 + 0); ctemp5 = *(a_offset2 + 0); - + *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp5; - + a_offset1 ++; a_offset2 ++; b_offset += 2; @@ -191,10 +191,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ }while(i > 0); } } /* end of if(j > 0) */ - + if (n & 1){ a_offset1 = a_offset; - + i = (m >> 2); if (i > 0){ do{ @@ -202,18 +202,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); - + *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp2; *(b_offset + 2) = ctemp3; *(b_offset + 3) = ctemp4; - + a_offset1 += 4; b_offset += 4; i --; }while(i > 0); } - + i = (m & 3); if (i > 0){ do{ diff --git a/kernel/generic/gemm_ncopy_6.c b/kernel/generic/gemm_ncopy_6.c index 1ecb93c65..1551b03fc 100644 --- a/kernel/generic/gemm_ncopy_6.c +++ b/kernel/generic/gemm_ncopy_6.c @@ -51,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ a_offset = a; b_offset = b; - + j = (n >> 2); if (j > 0){ do{ @@ -60,7 +60,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ a_offset3 = a_offset2 + lda; a_offset4 = a_offset3 + lda; a_offset += 4 * lda; - + i = (m >> 2); if (i > 0){ do{ @@ -68,47 +68,47 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); - + ctemp5 = *(a_offset2 + 0); ctemp6 = *(a_offset2 + 1); ctemp7 = *(a_offset2 + 2); ctemp8 = *(a_offset2 + 3); - + ctemp9 = *(a_offset3 + 0); ctemp10 = *(a_offset3 + 1); ctemp11 = *(a_offset3 + 2); ctemp12 = *(a_offset3 + 3); - + ctemp13 = *(a_offset4 + 0); ctemp14 = *(a_offset4 + 1); ctemp15 = *(a_offset4 + 2); ctemp16 = *(a_offset4 + 3); - + *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp5; *(b_offset + 2) = ctemp9; *(b_offset + 3) = ctemp13; - + *(b_offset + 4) = ctemp2; *(b_offset + 5) = ctemp6; *(b_offset + 6) = ctemp10; *(b_offset + 7) = ctemp14; - + *(b_offset + 8) = ctemp3; *(b_offset + 9) = ctemp7; *(b_offset + 10) = ctemp11; *(b_offset + 11) = ctemp15; - + *(b_offset + 12) = ctemp4; *(b_offset + 13) = ctemp8; *(b_offset + 14) = ctemp12; *(b_offset + 15) = ctemp16; - + a_offset1 += 4; a_offset2 += 4; a_offset3 += 4; a_offset4 += 4; - + b_offset += 16; i --; }while(i > 0); @@ -121,17 +121,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp5 = *(a_offset2 + 0); ctemp9 = *(a_offset3 + 0); ctemp13 = *(a_offset4 + 0); - + *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp5; *(b_offset + 2) = ctemp9; *(b_offset + 3) = ctemp13; - + a_offset1 ++; a_offset2 ++; a_offset3 ++; a_offset4 ++; - + b_offset += 4; i --; }while(i > 0); @@ -139,12 +139,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ j--; }while(j > 0); } /* end of if(j > 0) */ - + if (n & 2){ a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset += 2 * lda; - + i = (m >> 2); if (i > 0){ do{ @@ -152,38 +152,38 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); - + ctemp5 = *(a_offset2 + 0); ctemp6 = *(a_offset2 + 1); ctemp7 = *(a_offset2 + 2); ctemp8 = *(a_offset2 + 3); - + *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp5; *(b_offset + 2) = ctemp2; *(b_offset + 3) = ctemp6; - + *(b_offset + 4) = ctemp3; *(b_offset + 5) = ctemp7; *(b_offset + 6) = ctemp4; *(b_offset + 7) = ctemp8; - + a_offset1 += 4; a_offset2 += 4; b_offset += 8; i --; }while(i > 0); } - + i = (m & 3); if (i > 0){ do{ ctemp1 = *(a_offset1 + 0); ctemp5 = *(a_offset2 + 0); - + *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp5; - + a_offset1 ++; a_offset2 ++; b_offset += 2; @@ -191,10 +191,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ }while(i > 0); } } /* end of if(j > 0) */ - + if (n & 1){ a_offset1 = a_offset; - + i = (m >> 2); if (i > 0){ do{ @@ -202,18 +202,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); - + *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp2; *(b_offset + 2) = ctemp3; *(b_offset + 3) = ctemp4; - + a_offset1 += 4; b_offset += 4; i --; }while(i > 0); } - + i = (m & 3); if (i > 0){ do{ diff --git a/kernel/generic/gemm_ncopy_8.c b/kernel/generic/gemm_ncopy_8.c index bdaaba113..a49a778e6 100644 --- a/kernel/generic/gemm_ncopy_8.c +++ b/kernel/generic/gemm_ncopy_8.c @@ -67,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset = a; boffset = b; - + j = (n >> 3); if (j > 0){ do{ @@ -80,7 +80,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset7 = aoffset6 + lda; aoffset8 = aoffset7 + lda; aoffset += 8 * lda; - + i = (m >> 3); if (i > 0){ do{ @@ -92,7 +92,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); - + ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); @@ -101,7 +101,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp14 = *(aoffset2 + 5); ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); - + ctemp17 = *(aoffset3 + 0); ctemp18 = *(aoffset3 + 1); ctemp19 = *(aoffset3 + 2); @@ -110,7 +110,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp22 = *(aoffset3 + 5); ctemp23 = *(aoffset3 + 6); ctemp24 = *(aoffset3 + 7); - + ctemp25 = *(aoffset4 + 0); ctemp26 = *(aoffset4 + 1); ctemp27 = *(aoffset4 + 2); @@ -119,7 +119,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp30 = *(aoffset4 + 5); ctemp31 = *(aoffset4 + 6); ctemp32 = *(aoffset4 + 7); - + ctemp33 = *(aoffset5 + 0); ctemp34 = *(aoffset5 + 1); ctemp35 = *(aoffset5 + 2); @@ -128,7 +128,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp38 = *(aoffset5 + 5); ctemp39 = *(aoffset5 + 6); ctemp40 = *(aoffset5 + 7); - + ctemp41 = *(aoffset6 + 0); ctemp42 = *(aoffset6 + 1); ctemp43 = *(aoffset6 + 2); @@ -137,7 +137,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp46 = *(aoffset6 + 5); ctemp47 = *(aoffset6 + 6); ctemp48 = *(aoffset6 + 7); - + ctemp49 = *(aoffset7 + 0); ctemp50 = *(aoffset7 + 1); ctemp51 = *(aoffset7 + 2); @@ -146,7 +146,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp54 = *(aoffset7 + 5); ctemp55 = *(aoffset7 + 6); ctemp56 = *(aoffset7 + 7); - + ctemp57 = *(aoffset8 + 0); ctemp58 = *(aoffset8 + 1); ctemp59 = *(aoffset8 + 2); @@ -155,7 +155,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp62 = *(aoffset8 + 5); ctemp63 = *(aoffset8 + 6); ctemp64 = *(aoffset8 + 7); - + *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp09; *(boffset + 2) = ctemp17; @@ -164,7 +164,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = ctemp41; *(boffset + 6) = ctemp49; *(boffset + 7) = ctemp57; - + *(boffset + 8) = ctemp02; *(boffset + 9) = ctemp10; *(boffset + 10) = ctemp18; @@ -173,7 +173,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 13) = ctemp42; *(boffset + 14) = ctemp50; *(boffset + 15) = ctemp58; - + *(boffset + 16) = ctemp03; *(boffset + 17) = ctemp11; *(boffset + 18) = ctemp19; @@ -252,7 +252,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp41 = *(aoffset6 + 0); ctemp49 = *(aoffset7 + 0); ctemp57 = *(aoffset8 + 0); - + *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp09; *(boffset + 2) = ctemp17; @@ -270,7 +270,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset6 ++; aoffset7 ++; aoffset8 ++; - + boffset += 8; i --; }while(i > 0); @@ -278,14 +278,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ j--; }while(j > 0); } /* end of if(j > 0) */ - + if (n & 4){ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset3 = aoffset2 + lda; aoffset4 = aoffset3 + lda; aoffset += 4 * lda; - + i = (m >> 2); if (i > 0){ do{ @@ -293,42 +293,42 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); - + ctemp05 = *(aoffset2 + 0); ctemp06 = *(aoffset2 + 1); ctemp07 = *(aoffset2 + 2); ctemp08 = *(aoffset2 + 3); - + ctemp09 = *(aoffset3 + 0); ctemp10 = *(aoffset3 + 1); ctemp11 = *(aoffset3 + 2); ctemp12 = *(aoffset3 + 3); - + ctemp13 = *(aoffset4 + 0); ctemp14 = *(aoffset4 + 1); ctemp15 = *(aoffset4 + 2); ctemp16 = *(aoffset4 + 3); - + *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp05; *(boffset + 2) = ctemp09; *(boffset + 3) = ctemp13; - + *(boffset + 4) = ctemp02; *(boffset + 5) = ctemp06; *(boffset + 6) = ctemp10; *(boffset + 7) = ctemp14; - + *(boffset + 8) = ctemp03; *(boffset + 9) = ctemp07; *(boffset + 10) = ctemp11; *(boffset + 11) = ctemp15; - + *(boffset + 12) = ctemp04; *(boffset + 13) = ctemp08; *(boffset + 14) = ctemp12; *(boffset + 15) = ctemp16; - + aoffset1 += 4; aoffset2 += 4; aoffset3 += 4; @@ -337,7 +337,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ i --; }while(i > 0); } - + i = (m & 3); if (i > 0){ do{ @@ -345,7 +345,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp02 = *(aoffset2 + 0); ctemp03 = *(aoffset3 + 0); ctemp04 = *(aoffset4 + 0); - + *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; @@ -366,7 +366,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset += 2 * lda; - + i = (m >> 1); if (i > 0){ do{ @@ -374,26 +374,26 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); - + *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp03; *(boffset + 2) = ctemp02; *(boffset + 3) = ctemp04; - + aoffset1 += 2; aoffset2 += 2; boffset += 4; i --; }while(i > 0); } - + if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset2 + 0); - + *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; - + aoffset1 ++; aoffset2 ++; boffset += 2; @@ -402,7 +402,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ if (n & 1){ aoffset1 = aoffset; - + i = m; if (i > 0){ do{ @@ -415,7 +415,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ i --; }while(i > 0); } - + } /* end of if(j > 0) */ return 0; diff --git a/kernel/generic/gemm_tcopy_1.c b/kernel/generic/gemm_tcopy_1.c index c0c8bd023..d0018bf13 100644 --- a/kernel/generic/gemm_tcopy_1.c +++ b/kernel/generic/gemm_tcopy_1.c @@ -57,7 +57,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ b_offset1 = b_offset; b_offset ++; - + j = n; if (j > 0) { do { diff --git a/kernel/generic/gemm_tcopy_16.c b/kernel/generic/gemm_tcopy_16.c index e5732250f..6528d9489 100644 --- a/kernel/generic/gemm_tcopy_16.c +++ b/kernel/generic/gemm_tcopy_16.c @@ -69,7 +69,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 16; - + i = (m >> 1); if (i > 0){ do{ @@ -115,7 +115,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = ctemp06; *(boffset + 6) = ctemp07; *(boffset + 7) = ctemp08; - + *(boffset + 8) = ctemp09; *(boffset + 9) = ctemp10; *(boffset + 10) = ctemp11; @@ -124,7 +124,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 13) = ctemp14; *(boffset + 14) = ctemp15; *(boffset + 15) = ctemp16; - + *(boffset + 16) = ctemp17; *(boffset + 17) = ctemp18; *(boffset + 18) = ctemp19; @@ -168,7 +168,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp14 = *(aoffset1 + 13); ctemp15 = *(aoffset1 + 14); ctemp16 = *(aoffset1 + 15); - + *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; @@ -177,7 +177,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = ctemp06; *(boffset + 6) = ctemp07; *(boffset + 7) = ctemp08; - + *(boffset + 8) = ctemp09; *(boffset + 9) = ctemp10; *(boffset + 10) = ctemp11; @@ -198,7 +198,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 8; - + i = (m >> 1); if (i > 0){ do{ @@ -210,7 +210,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); - + ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); @@ -228,7 +228,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = ctemp06; *(boffset + 6) = ctemp07; *(boffset + 7) = ctemp08; - + *(boffset + 8) = ctemp09; *(boffset + 9) = ctemp10; *(boffset + 10) = ctemp11; @@ -237,15 +237,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 13) = ctemp14; *(boffset + 14) = ctemp15; *(boffset + 15) = ctemp16; - + aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 16; - + i --; }while(i > 0); } - + if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); @@ -255,7 +255,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); - + *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; @@ -264,7 +264,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = ctemp06; *(boffset + 6) = ctemp07; *(boffset + 7) = ctemp08; - + boffset += 8; } } @@ -273,7 +273,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 4; - + i = (m >> 1); if (i > 0){ do{ @@ -295,15 +295,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = ctemp06; *(boffset + 6) = ctemp07; *(boffset + 7) = ctemp08; - + aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 8; - + i --; }while(i > 0); } - + if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); @@ -314,7 +314,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; *(boffset + 3) = ctemp04; - + boffset += 4; } } @@ -323,7 +323,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 2; - + i = (m >> 1); if (i > 0){ do{ @@ -336,15 +336,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; *(boffset + 3) = ctemp04; - + aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 4; - + i --; }while(i > 0); } - + if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); @@ -358,7 +358,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ if (n & 1){ aoffset1 = aoffset; aoffset2 = aoffset + lda; - + i = (m >> 1); if (i > 0){ do{ @@ -371,11 +371,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 2; - + i --; }while(i > 0); } - + if (m & 1){ ctemp01 = *(aoffset1 + 0); *(boffset + 0) = ctemp01; diff --git a/kernel/generic/gemm_tcopy_2.c b/kernel/generic/gemm_tcopy_2.c index 0aa9c2e53..5695b13c2 100644 --- a/kernel/generic/gemm_tcopy_2.c +++ b/kernel/generic/gemm_tcopy_2.c @@ -59,7 +59,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ b_offset1 = b_offset; b_offset += 4; - + j = (n >> 1); if (j > 0){ do { @@ -94,7 +94,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ j--; } while (j > 0); } - + if (n & 1){ *(b_offset2 + 0) = *(a_offset + 0); } diff --git a/kernel/generic/gemm_tcopy_4.c b/kernel/generic/gemm_tcopy_4.c index bd32090e7..df4c22122 100644 --- a/kernel/generic/gemm_tcopy_4.c +++ b/kernel/generic/gemm_tcopy_4.c @@ -64,7 +64,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ a_offset3 = a_offset2 + lda; a_offset4 = a_offset3 + lda; a_offset += 4 * lda; - + b_offset1 = b_offset; b_offset += 16; @@ -75,17 +75,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); - + ctemp5 = *(a_offset2 + 0); ctemp6 = *(a_offset2 + 1); ctemp7 = *(a_offset2 + 2); ctemp8 = *(a_offset2 + 3); - + ctemp9 = *(a_offset3 + 0); ctemp10 = *(a_offset3 + 1); ctemp11 = *(a_offset3 + 2); ctemp12 = *(a_offset3 + 3); - + ctemp13 = *(a_offset4 + 0); ctemp14 = *(a_offset4 + 1); ctemp15 = *(a_offset4 + 2); @@ -95,27 +95,27 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ a_offset2 += 4; a_offset3 += 4; a_offset4 += 4; - + *(b_offset1 + 0) = ctemp1; *(b_offset1 + 1) = ctemp2; *(b_offset1 + 2) = ctemp3; *(b_offset1 + 3) = ctemp4; - + *(b_offset1 + 4) = ctemp5; *(b_offset1 + 5) = ctemp6; *(b_offset1 + 6) = ctemp7; *(b_offset1 + 7) = ctemp8; - + *(b_offset1 + 8) = ctemp9; *(b_offset1 + 9) = ctemp10; *(b_offset1 + 10) = ctemp11; *(b_offset1 + 11) = ctemp12; - + *(b_offset1 + 12) = ctemp13; *(b_offset1 + 13) = ctemp14; *(b_offset1 + 14) = ctemp15; *(b_offset1 + 15) = ctemp16; - + b_offset1 += m * 4; i --; }while(i > 0); @@ -127,28 +127,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp3 = *(a_offset2 + 0); ctemp4 = *(a_offset2 + 1); - + ctemp5 = *(a_offset3 + 0); ctemp6 = *(a_offset3 + 1); - + ctemp7 = *(a_offset4 + 0); ctemp8 = *(a_offset4 + 1); - + a_offset1 += 2; a_offset2 += 2; a_offset3 += 2; a_offset4 += 2; - + *(b_offset2 + 0) = ctemp1; *(b_offset2 + 1) = ctemp2; *(b_offset2 + 2) = ctemp3; *(b_offset2 + 3) = ctemp4; - + *(b_offset2 + 4) = ctemp5; *(b_offset2 + 5) = ctemp6; *(b_offset2 + 6) = ctemp7; *(b_offset2 + 7) = ctemp8; - + b_offset2 += 8; } @@ -157,12 +157,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp2 = *(a_offset2 + 0); ctemp3 = *(a_offset3 + 0); ctemp4 = *(a_offset4 + 0); - + *(b_offset3 + 0) = ctemp1; *(b_offset3 + 1) = ctemp2; *(b_offset3 + 2) = ctemp3; *(b_offset3 + 3) = ctemp4; - + b_offset3 += 4; } @@ -174,10 +174,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset += 2 * lda; - + b_offset1 = b_offset; b_offset += 8; - + i = (n >> 2); if (i > 0){ do{ @@ -185,20 +185,20 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); - + ctemp5 = *(a_offset2 + 0); ctemp6 = *(a_offset2 + 1); ctemp7 = *(a_offset2 + 2); ctemp8 = *(a_offset2 + 3); - + a_offset1 += 4; a_offset2 += 4; - + *(b_offset1 + 0) = ctemp1; *(b_offset1 + 1) = ctemp2; *(b_offset1 + 2) = ctemp3; *(b_offset1 + 3) = ctemp4; - + *(b_offset1 + 4) = ctemp5; *(b_offset1 + 5) = ctemp6; *(b_offset1 + 6) = ctemp7; @@ -212,25 +212,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ if (n & 2) { ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); - + ctemp3 = *(a_offset2 + 0); ctemp4 = *(a_offset2 + 1); - + a_offset1 += 2; a_offset2 += 2; - + *(b_offset2 + 0) = ctemp1; *(b_offset2 + 1) = ctemp2; *(b_offset2 + 2) = ctemp3; *(b_offset2 + 3) = ctemp4; - + b_offset2 += 4; } - + if (n & 1) { ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset2 + 0); - + *(b_offset3 + 0) = ctemp1; *(b_offset3 + 1) = ctemp2; b_offset3 += 2; @@ -240,7 +240,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ if (m & 1){ a_offset1 = a_offset; b_offset1 = b_offset; - + i = (n >> 2); if (i > 0){ do{ @@ -248,9 +248,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); - + a_offset1 += 4; - + *(b_offset1 + 0) = ctemp1; *(b_offset1 + 1) = ctemp2; *(b_offset1 + 2) = ctemp3; @@ -266,11 +266,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); a_offset1 += 2; - + *(b_offset2 + 0) = ctemp1; *(b_offset2 + 1) = ctemp2; } - + if (n & 1) { ctemp1 = *(a_offset1 + 0); *(b_offset3 + 0) = ctemp1; diff --git a/kernel/generic/gemm_tcopy_6.c b/kernel/generic/gemm_tcopy_6.c index bd32090e7..df4c22122 100644 --- a/kernel/generic/gemm_tcopy_6.c +++ b/kernel/generic/gemm_tcopy_6.c @@ -64,7 +64,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ a_offset3 = a_offset2 + lda; a_offset4 = a_offset3 + lda; a_offset += 4 * lda; - + b_offset1 = b_offset; b_offset += 16; @@ -75,17 +75,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); - + ctemp5 = *(a_offset2 + 0); ctemp6 = *(a_offset2 + 1); ctemp7 = *(a_offset2 + 2); ctemp8 = *(a_offset2 + 3); - + ctemp9 = *(a_offset3 + 0); ctemp10 = *(a_offset3 + 1); ctemp11 = *(a_offset3 + 2); ctemp12 = *(a_offset3 + 3); - + ctemp13 = *(a_offset4 + 0); ctemp14 = *(a_offset4 + 1); ctemp15 = *(a_offset4 + 2); @@ -95,27 +95,27 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ a_offset2 += 4; a_offset3 += 4; a_offset4 += 4; - + *(b_offset1 + 0) = ctemp1; *(b_offset1 + 1) = ctemp2; *(b_offset1 + 2) = ctemp3; *(b_offset1 + 3) = ctemp4; - + *(b_offset1 + 4) = ctemp5; *(b_offset1 + 5) = ctemp6; *(b_offset1 + 6) = ctemp7; *(b_offset1 + 7) = ctemp8; - + *(b_offset1 + 8) = ctemp9; *(b_offset1 + 9) = ctemp10; *(b_offset1 + 10) = ctemp11; *(b_offset1 + 11) = ctemp12; - + *(b_offset1 + 12) = ctemp13; *(b_offset1 + 13) = ctemp14; *(b_offset1 + 14) = ctemp15; *(b_offset1 + 15) = ctemp16; - + b_offset1 += m * 4; i --; }while(i > 0); @@ -127,28 +127,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp3 = *(a_offset2 + 0); ctemp4 = *(a_offset2 + 1); - + ctemp5 = *(a_offset3 + 0); ctemp6 = *(a_offset3 + 1); - + ctemp7 = *(a_offset4 + 0); ctemp8 = *(a_offset4 + 1); - + a_offset1 += 2; a_offset2 += 2; a_offset3 += 2; a_offset4 += 2; - + *(b_offset2 + 0) = ctemp1; *(b_offset2 + 1) = ctemp2; *(b_offset2 + 2) = ctemp3; *(b_offset2 + 3) = ctemp4; - + *(b_offset2 + 4) = ctemp5; *(b_offset2 + 5) = ctemp6; *(b_offset2 + 6) = ctemp7; *(b_offset2 + 7) = ctemp8; - + b_offset2 += 8; } @@ -157,12 +157,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp2 = *(a_offset2 + 0); ctemp3 = *(a_offset3 + 0); ctemp4 = *(a_offset4 + 0); - + *(b_offset3 + 0) = ctemp1; *(b_offset3 + 1) = ctemp2; *(b_offset3 + 2) = ctemp3; *(b_offset3 + 3) = ctemp4; - + b_offset3 += 4; } @@ -174,10 +174,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset += 2 * lda; - + b_offset1 = b_offset; b_offset += 8; - + i = (n >> 2); if (i > 0){ do{ @@ -185,20 +185,20 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); - + ctemp5 = *(a_offset2 + 0); ctemp6 = *(a_offset2 + 1); ctemp7 = *(a_offset2 + 2); ctemp8 = *(a_offset2 + 3); - + a_offset1 += 4; a_offset2 += 4; - + *(b_offset1 + 0) = ctemp1; *(b_offset1 + 1) = ctemp2; *(b_offset1 + 2) = ctemp3; *(b_offset1 + 3) = ctemp4; - + *(b_offset1 + 4) = ctemp5; *(b_offset1 + 5) = ctemp6; *(b_offset1 + 6) = ctemp7; @@ -212,25 +212,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ if (n & 2) { ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); - + ctemp3 = *(a_offset2 + 0); ctemp4 = *(a_offset2 + 1); - + a_offset1 += 2; a_offset2 += 2; - + *(b_offset2 + 0) = ctemp1; *(b_offset2 + 1) = ctemp2; *(b_offset2 + 2) = ctemp3; *(b_offset2 + 3) = ctemp4; - + b_offset2 += 4; } - + if (n & 1) { ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset2 + 0); - + *(b_offset3 + 0) = ctemp1; *(b_offset3 + 1) = ctemp2; b_offset3 += 2; @@ -240,7 +240,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ if (m & 1){ a_offset1 = a_offset; b_offset1 = b_offset; - + i = (n >> 2); if (i > 0){ do{ @@ -248,9 +248,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); - + a_offset1 += 4; - + *(b_offset1 + 0) = ctemp1; *(b_offset1 + 1) = ctemp2; *(b_offset1 + 2) = ctemp3; @@ -266,11 +266,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); a_offset1 += 2; - + *(b_offset2 + 0) = ctemp1; *(b_offset2 + 1) = ctemp2; } - + if (n & 1) { ctemp1 = *(a_offset1 + 0); *(b_offset3 + 0) = ctemp1; diff --git a/kernel/generic/gemm_tcopy_8.c b/kernel/generic/gemm_tcopy_8.c index 8f6e33c8a..9770d110d 100644 --- a/kernel/generic/gemm_tcopy_8.c +++ b/kernel/generic/gemm_tcopy_8.c @@ -105,7 +105,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); aoffset1 += 8; - + ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); @@ -115,7 +115,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); aoffset2 += 8; - + ctemp17 = *(aoffset3 + 0); ctemp18 = *(aoffset3 + 1); ctemp19 = *(aoffset3 + 2); @@ -125,7 +125,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp23 = *(aoffset3 + 6); ctemp24 = *(aoffset3 + 7); aoffset3 += 8; - + ctemp25 = *(aoffset4 + 0); ctemp26 = *(aoffset4 + 1); ctemp27 = *(aoffset4 + 2); @@ -145,7 +145,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp39 = *(aoffset5 + 6); ctemp40 = *(aoffset5 + 7); aoffset5 += 8; - + ctemp41 = *(aoffset6 + 0); ctemp42 = *(aoffset6 + 1); ctemp43 = *(aoffset6 + 2); @@ -155,7 +155,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp47 = *(aoffset6 + 6); ctemp48 = *(aoffset6 + 7); aoffset6 += 8; - + ctemp49 = *(aoffset7 + 0); ctemp50 = *(aoffset7 + 1); ctemp51 = *(aoffset7 + 2); @@ -165,7 +165,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp55 = *(aoffset7 + 6); ctemp56 = *(aoffset7 + 7); aoffset7 += 8; - + ctemp57 = *(aoffset8 + 0); ctemp58 = *(aoffset8 + 1); ctemp59 = *(aoffset8 + 2); @@ -175,7 +175,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp63 = *(aoffset8 + 6); ctemp64 = *(aoffset8 + 7); aoffset8 += 8; - + *(boffset1 + 0) = ctemp01; *(boffset1 + 1) = ctemp02; *(boffset1 + 2) = ctemp03; @@ -184,7 +184,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset1 + 5) = ctemp06; *(boffset1 + 6) = ctemp07; *(boffset1 + 7) = ctemp08; - + *(boffset1 + 8) = ctemp09; *(boffset1 + 9) = ctemp10; *(boffset1 + 10) = ctemp11; @@ -202,7 +202,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset1 + 21) = ctemp22; *(boffset1 + 22) = ctemp23; *(boffset1 + 23) = ctemp24; - + *(boffset1 + 24) = ctemp25; *(boffset1 + 25) = ctemp26; *(boffset1 + 26) = ctemp27; @@ -220,7 +220,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset1 + 37) = ctemp38; *(boffset1 + 38) = ctemp39; *(boffset1 + 39) = ctemp40; - + *(boffset1 + 40) = ctemp41; *(boffset1 + 41) = ctemp42; *(boffset1 + 42) = ctemp43; @@ -238,7 +238,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset1 + 53) = ctemp54; *(boffset1 + 54) = ctemp55; *(boffset1 + 55) = ctemp56; - + *(boffset1 + 56) = ctemp57; *(boffset1 + 57) = ctemp58; *(boffset1 + 58) = ctemp59; @@ -247,7 +247,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset1 + 61) = ctemp62; *(boffset1 + 62) = ctemp63; *(boffset1 + 63) = ctemp64; - + boffset1 += m * 8; i --; }while(i > 0); @@ -259,49 +259,49 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); aoffset1 += 4; - + ctemp05 = *(aoffset2 + 0); ctemp06 = *(aoffset2 + 1); ctemp07 = *(aoffset2 + 2); ctemp08 = *(aoffset2 + 3); aoffset2 += 4; - + ctemp09 = *(aoffset3 + 0); ctemp10 = *(aoffset3 + 1); ctemp11 = *(aoffset3 + 2); ctemp12 = *(aoffset3 + 3); aoffset3 += 4; - + ctemp13 = *(aoffset4 + 0); ctemp14 = *(aoffset4 + 1); ctemp15 = *(aoffset4 + 2); ctemp16 = *(aoffset4 + 3); aoffset4 += 4; - + ctemp17 = *(aoffset5 + 0); ctemp18 = *(aoffset5 + 1); ctemp19 = *(aoffset5 + 2); ctemp20 = *(aoffset5 + 3); aoffset5 += 4; - + ctemp21 = *(aoffset6 + 0); ctemp22 = *(aoffset6 + 1); ctemp23 = *(aoffset6 + 2); ctemp24 = *(aoffset6 + 3); aoffset6 += 4; - + ctemp25 = *(aoffset7 + 0); ctemp26 = *(aoffset7 + 1); ctemp27 = *(aoffset7 + 2); ctemp28 = *(aoffset7 + 3); aoffset7 += 4; - + ctemp29 = *(aoffset8 + 0); ctemp30 = *(aoffset8 + 1); ctemp31 = *(aoffset8 + 2); ctemp32 = *(aoffset8 + 3); aoffset8 += 4; - + *(boffset2 + 0) = ctemp01; *(boffset2 + 1) = ctemp02; *(boffset2 + 2) = ctemp03; @@ -343,35 +343,35 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); aoffset1 += 2; - + ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); aoffset2 += 2; - + ctemp05 = *(aoffset3 + 0); ctemp06 = *(aoffset3 + 1); aoffset3 += 2; - + ctemp07 = *(aoffset4 + 0); ctemp08 = *(aoffset4 + 1); aoffset4 += 2; - + ctemp09 = *(aoffset5 + 0); ctemp10 = *(aoffset5 + 1); aoffset5 += 2; - + ctemp11 = *(aoffset6 + 0); ctemp12 = *(aoffset6 + 1); aoffset6 += 2; - + ctemp13 = *(aoffset7 + 0); ctemp14 = *(aoffset7 + 1); aoffset7 += 2; - + ctemp15 = *(aoffset8 + 0); ctemp16 = *(aoffset8 + 1); aoffset8 += 2; - + *(boffset3 + 0) = ctemp01; *(boffset3 + 1) = ctemp02; *(boffset3 + 2) = ctemp03; @@ -408,7 +408,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset7 ++; ctemp08 = *(aoffset8 + 0); aoffset8 ++; - + *(boffset4 + 0) = ctemp01; *(boffset4 + 1) = ctemp02; *(boffset4 + 2) = ctemp03; @@ -431,10 +431,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset3 = aoffset2 + lda; aoffset4 = aoffset3 + lda; aoffset += 4 * lda; - + boffset1 = boffset; boffset += 32; - + i = (n >> 3); if (i > 0){ @@ -448,7 +448,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); aoffset1 += 8; - + ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); @@ -458,7 +458,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); aoffset2 += 8; - + ctemp17 = *(aoffset3 + 0); ctemp18 = *(aoffset3 + 1); ctemp19 = *(aoffset3 + 2); @@ -468,7 +468,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp23 = *(aoffset3 + 6); ctemp24 = *(aoffset3 + 7); aoffset3 += 8; - + ctemp25 = *(aoffset4 + 0); ctemp26 = *(aoffset4 + 1); ctemp27 = *(aoffset4 + 2); @@ -478,7 +478,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp31 = *(aoffset4 + 6); ctemp32 = *(aoffset4 + 7); aoffset4 += 8; - + *(boffset1 + 0) = ctemp01; *(boffset1 + 1) = ctemp02; *(boffset1 + 2) = ctemp03; @@ -487,7 +487,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset1 + 5) = ctemp06; *(boffset1 + 6) = ctemp07; *(boffset1 + 7) = ctemp08; - + *(boffset1 + 8) = ctemp09; *(boffset1 + 9) = ctemp10; *(boffset1 + 10) = ctemp11; @@ -496,7 +496,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset1 + 13) = ctemp14; *(boffset1 + 14) = ctemp15; *(boffset1 + 15) = ctemp16; - + *(boffset1 + 16) = ctemp17; *(boffset1 + 17) = ctemp18; *(boffset1 + 18) = ctemp19; @@ -526,25 +526,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); aoffset1 += 4; - + ctemp05 = *(aoffset2 + 0); ctemp06 = *(aoffset2 + 1); ctemp07 = *(aoffset2 + 2); ctemp08 = *(aoffset2 + 3); aoffset2 += 4; - + ctemp09 = *(aoffset3 + 0); ctemp10 = *(aoffset3 + 1); ctemp11 = *(aoffset3 + 2); ctemp12 = *(aoffset3 + 3); aoffset3 += 4; - + ctemp13 = *(aoffset4 + 0); ctemp14 = *(aoffset4 + 1); ctemp15 = *(aoffset4 + 2); ctemp16 = *(aoffset4 + 3); aoffset4 += 4; - + *(boffset2 + 0) = ctemp01; *(boffset2 + 1) = ctemp02; *(boffset2 + 2) = ctemp03; @@ -553,7 +553,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset2 + 5) = ctemp06; *(boffset2 + 6) = ctemp07; *(boffset2 + 7) = ctemp08; - + *(boffset2 + 8) = ctemp09; *(boffset2 + 9) = ctemp10; *(boffset2 + 10) = ctemp11; @@ -564,24 +564,24 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset2 + 15) = ctemp16; boffset2 += 16; } - + if (n & 2){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); aoffset1 += 2; - + ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); aoffset2 += 2; - + ctemp05 = *(aoffset3 + 0); ctemp06 = *(aoffset3 + 1); aoffset3 += 2; - + ctemp07 = *(aoffset4 + 0); ctemp08 = *(aoffset4 + 1); aoffset4 += 2; - + *(boffset3 + 0) = ctemp01; *(boffset3 + 1) = ctemp02; *(boffset3 + 2) = ctemp03; @@ -592,7 +592,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset3 + 7) = ctemp08; boffset3 += 8; } - + if (n & 1){ ctemp01 = *(aoffset1 + 0); aoffset1 ++; @@ -602,7 +602,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset3 ++; ctemp04 = *(aoffset4 + 0); aoffset4 ++; - + *(boffset4 + 0) = ctemp01; *(boffset4 + 1) = ctemp02; *(boffset4 + 2) = ctemp03; @@ -610,15 +610,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ boffset4 += 4; } } - + if (m & 2){ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset += 2 * lda; - + boffset1 = boffset; boffset += 16; - + i = (n >> 3); if (i > 0){ do{ @@ -631,7 +631,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); aoffset1 += 8; - + ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); @@ -641,7 +641,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); aoffset2 += 8; - + *(boffset1 + 0) = ctemp01; *(boffset1 + 1) = ctemp02; *(boffset1 + 2) = ctemp03; @@ -650,7 +650,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset1 + 5) = ctemp06; *(boffset1 + 6) = ctemp07; *(boffset1 + 7) = ctemp08; - + *(boffset1 + 8) = ctemp09; *(boffset1 + 9) = ctemp10; *(boffset1 + 10) = ctemp11; @@ -659,25 +659,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset1 + 13) = ctemp14; *(boffset1 + 14) = ctemp15; *(boffset1 + 15) = ctemp16; - + boffset1 += 8 * m; i --; }while(i > 0); } - + if (n & 4){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); aoffset1 += 4; - + ctemp05 = *(aoffset2 + 0); ctemp06 = *(aoffset2 + 1); ctemp07 = *(aoffset2 + 2); ctemp08 = *(aoffset2 + 3); aoffset2 += 4; - + *(boffset2 + 0) = ctemp01; *(boffset2 + 1) = ctemp02; *(boffset2 + 2) = ctemp03; @@ -688,29 +688,29 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset2 + 7) = ctemp08; boffset2 += 8; } - + if (n & 2){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); aoffset1 += 2; - + ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); aoffset2 += 2; - + *(boffset3 + 0) = ctemp01; *(boffset3 + 1) = ctemp02; *(boffset3 + 2) = ctemp03; *(boffset3 + 3) = ctemp04; boffset3 += 4; } - + if (n & 1){ ctemp01 = *(aoffset1 + 0); aoffset1 ++; ctemp02 = *(aoffset2 + 0); aoffset2 ++; - + *(boffset4 + 0) = ctemp01; *(boffset4 + 1) = ctemp02; boffset4 += 2; @@ -720,10 +720,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ if (m & 1){ aoffset1 = aoffset; aoffset += lda; - + boffset1 = boffset; boffset += 8; - + i = (n >> 3); if (i > 0){ do{ @@ -736,7 +736,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); aoffset1 += 8; - + *(boffset1 + 0) = ctemp01; *(boffset1 + 1) = ctemp02; *(boffset1 + 2) = ctemp03; @@ -745,7 +745,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset1 + 5) = ctemp06; *(boffset1 + 6) = ctemp07; *(boffset1 + 7) = ctemp08; - + boffset1 += 8 * m; i --; }while(i > 0); @@ -774,7 +774,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset3 + 1) = ctemp02; boffset3 += 2; } - + if (n & 1){ ctemp01 = *(aoffset1 + 0); aoffset1 ++; diff --git a/kernel/generic/gemmkernel_2x2.c b/kernel/generic/gemmkernel_2x2.c index 3645ef154..01f1c67b5 100644 --- a/kernel/generic/gemmkernel_2x2.c +++ b/kernel/generic/gemmkernel_2x2.c @@ -3,24 +3,24 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL #ifdef TRMMKERNEL ,BLASLONG offset #endif - ) + ) { BLASLONG i,j,k; FLOAT *C0,*C1,*ptrba,*ptrbb; FLOAT res0,res1,res2,res3,load0,load1,load2,load3,load4,load5,load6,load7; - for (j=0; j #include "common.h" -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer){ @@ -51,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, COPY_K(m, x, incx, X, 1); } - while (n > 0) { + while (n > 0) { AXPYU_K(m, 0, 0, alpha * *y, X, 1, a, 1, NULL, 0); a += lda; y += incy; diff --git a/kernel/generic/laswp_ncopy_1.c b/kernel/generic/laswp_ncopy_1.c index 4394474ed..90fe173a8 100644 --- a/kernel/generic/laswp_ncopy_1.c +++ b/kernel/generic/laswp_ncopy_1.c @@ -53,36 +53,36 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint k1 --; ipiv += k1; - + if (n <= 0) return 0; - - + + j = n; do { piv = ipiv; - + a1 = a + k1 + 1; - + ip1 = *(piv + 0); ip2 = *(piv + 1); piv += 2; - + b1 = a + ip1; b2 = a + ip2; - + i = ((k2 - k1) >> 1); - + if (i > 0) { do { A1 = *a1; A2 = *a2; B1 = *b1; B2 = *b2; - + ip1 = *(piv + 0); ip2 = *(piv + 1); piv += 2; - + if (b1 == a1) { if (b2 == a2) { *(buffer + 0) = A1; @@ -93,7 +93,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *b2 = A2; } - } else + } else if (b1 == a2) { if (b2 == a2) { *(buffer + 0) = A2; @@ -108,7 +108,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *(buffer + 0) = B1; *(buffer + 1) = A2; *b1 = A1; - } else + } else if (b2 == b1) { *(buffer + 0) = B1; *(buffer + 1) = A1; @@ -120,24 +120,24 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *b2 = A2; } } - + buffer += 2; - + b1 = a + ip1; b2 = a + ip2; - + a1 += 2; - + i --; } while (i > 0); } - + i = ((k2 - k1) & 1); - + if (i > 0) { A1 = *a1; B1 = *b1; - + if (a1 == b1) { *(buffer + 0) = A1; } else { @@ -150,5 +150,5 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint } while (j > 0); return 0; -} +} diff --git a/kernel/generic/laswp_ncopy_2.c b/kernel/generic/laswp_ncopy_2.c index 806a1e109..a29562df9 100644 --- a/kernel/generic/laswp_ncopy_2.c +++ b/kernel/generic/laswp_ncopy_2.c @@ -58,27 +58,27 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint ipiv += k1; if (n <= 0) return 0; - + j = (n >> 1); if (j > 0) { do { piv = ipiv; - + a1 = a + k1 + 1; a3 = a1 + 1 * lda; - + ip1 = *(piv + 0); ip2 = *(piv + 1); piv += 2; - + b1 = a + ip1; b2 = a + ip2; - + b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; - + i = ((k2 - k1) >> 1); - + if (i > 0) { do { @@ -91,16 +91,16 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint A2 = *a2; A3 = *a3; A4 = *a4; - + B1 = *b1; B2 = *b2; B3 = *b3; B4 = *b4; - + ip1 = *(piv + 0); ip2 = *(piv + 1); piv += 2; - + if (b1 == a1) { if (b2 == a2) { *(buffer + 0) = A1; @@ -112,11 +112,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *(buffer + 1) = A3; *(buffer + 2) = B2; *(buffer + 3) = B4; - + *b2 = A2; *b4 = A4; } - } else + } else if (b1 == a2) { if (b2 == a2) { *(buffer + 0) = A2; @@ -139,7 +139,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *(buffer + 3) = A4; *b1 = A1; *b3 = A3; - } else + } else if (b2 == b1) { *(buffer + 0) = B1; *(buffer + 1) = B3; @@ -158,30 +158,30 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *b4 = A4; } } - + buffer += 4; - + b1 = a + ip1; b2 = a + ip2; - + b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; - + a1 += 2; a3 += 2; - + i --; } while (i > 0); } - + i = ((k2 - k1) & 1); - + if (i > 0) { A1 = *a1; B1 = *b1; A3 = *a3; B3 = *b3; - + if (a1 == b1) { *(buffer + 0) = A1; *(buffer + 1) = A3; @@ -193,37 +193,37 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint } buffer += 2; } - + a += 2 * lda; j --; } while (j > 0); } - + if (n & 1) { piv = ipiv; a1 = a + k1 + 1; - + ip1 = *(piv + 0); ip2 = *(piv + 1); piv += 2; - + b1 = a + ip1; b2 = a + ip2; - + i = ((k2 - k1) >> 1); - + if (i > 0) { do { A1 = *a1; A2 = *a2; B1 = *b1; B2 = *b2; - + ip1 = *(piv + 0); ip2 = *(piv + 1); piv += 2; - + if (b1 == a1) { if (b2 == a2) { *(buffer + 0) = A1; @@ -234,7 +234,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *b2 = A2; } - } else + } else if (b1 == a2) { if (b2 == a2) { *(buffer + 0) = A2; @@ -249,7 +249,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *(buffer + 0) = B1; *(buffer + 1) = A2; *b1 = A1; - } else + } else if (b2 == b1) { *(buffer + 0) = B1; *(buffer + 1) = A1; @@ -261,20 +261,20 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *b2 = A2; } } - + buffer += 2; b1 = a + ip1; b2 = a + ip2; - + a1 += 2; i --; } while (i > 0); } - + i = ((k2 - k1) & 1); - + if (i > 0) { A1 = *a1; B1 = *b1; @@ -289,5 +289,5 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint } return 0; -} +} diff --git a/kernel/generic/laswp_ncopy_4.c b/kernel/generic/laswp_ncopy_4.c index 0736f0742..761d1584a 100644 --- a/kernel/generic/laswp_ncopy_4.c +++ b/kernel/generic/laswp_ncopy_4.c @@ -69,7 +69,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint piv = ipiv; a1 = a + k1 + 1; - + a3 = a1 + 1 * lda; a5 = a1 + 2 * lda; a7 = a1 + 3 * lda; @@ -77,10 +77,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint ip1 = *(piv + 0); ip2 = *(piv + 1); piv += 2; - + b1 = a + ip1; b2 = a + ip2; - + b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; b5 = b1 + 2 * lda; @@ -89,7 +89,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint b8 = b2 + 3 * lda; i = ((k2 - k1) >> 1); - + if (i > 0) { do { @@ -117,11 +117,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint B6 = *b6; B7 = *b7; B8 = *b8; - + ip1 = *(piv + 0); ip2 = *(piv + 1); piv += 2; - + if (b1 == a1) { if (b2 == a2) { *(buffer + 0) = A1; @@ -149,7 +149,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *b6 = A6; *b8 = A8; } - } else + } else if (b1 == a2) { if (b2 == a2) { *(buffer + 0) = A2; @@ -188,7 +188,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *b3 = A3; *b5 = A5; *b7 = A7; - } else + } else if (b2 == b1) { *(buffer + 0) = B1; *(buffer + 1) = B3; @@ -221,19 +221,19 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *b8 = A8; } } - + buffer += 8; b1 = a + ip1; b2 = a + ip2; - + b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; b5 = b1 + 2 * lda; b6 = b2 + 2 * lda; b7 = b1 + 3 * lda; b8 = b2 + 3 * lda; - + a1 += 2; a3 += 2; a5 += 2; @@ -242,9 +242,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint i --; } while (i > 0); } - + i = ((k2 - k1) & 1); - + if (i > 0) { A1 = *a1; B1 = *b1; @@ -274,7 +274,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint } a += 4 * lda; - + j --; } while (j > 0); } @@ -284,35 +284,35 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint a1 = a + k1 + 1; a3 = a1 + 1 * lda; - + ip1 = *(piv + 0); ip2 = *(piv + 1); piv += 2; - + b1 = a + ip1; b2 = a + ip2; - + b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; - + i = ((k2 - k1) >> 1); - + if (i > 0) { do { A1 = *a1; A2 = *a2; A3 = *a3; A4 = *a4; - + B1 = *b1; B2 = *b2; B3 = *b3; B4 = *b4; - + ip1 = *(piv + 0); ip2 = *(piv + 1); piv += 2; - + if (b1 == a1) { if (b2 == a2) { *(buffer + 0) = A1; @@ -328,7 +328,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *b2 = A2; *b4 = A4; } - } else + } else if (b1 == a2) { if (b2 == a2) { *(buffer + 0) = A2; @@ -351,7 +351,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *(buffer + 3) = A4; *b1 = A1; *b3 = A3; - } else + } else if (b2 == b1) { *(buffer + 0) = B1; *(buffer + 1) = B3; @@ -370,24 +370,24 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *b4 = A4; } } - + buffer += 4; b1 = a + ip1; b2 = a + ip2; - + b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; - + a1 += 2; a3 += 2; i --; } while (i > 0); } - + i = ((k2 - k1) & 1); - + if (i > 0) { A1 = *a1; B1 = *b1; @@ -405,7 +405,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint } buffer += 2; } - + a += 2 * lda; } @@ -413,27 +413,27 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint piv = ipiv; a1 = a + k1 + 1; - + ip1 = *(piv + 0); ip2 = *(piv + 1); piv += 2; - + b1 = a + ip1; b2 = a + ip2; - + i = ((k2 - k1) >> 1); - + if (i > 0) { do { A1 = *a1; A2 = *a2; B1 = *b1; B2 = *b2; - + ip1 = *(piv + 0); ip2 = *(piv + 1); piv += 2; - + if (b1 == a1) { if (b2 == a2) { *(buffer + 0) = A1; @@ -444,7 +444,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *b2 = A2; } - } else + } else if (b1 == a2) { if (b2 == a2) { *(buffer + 0) = A2; @@ -459,7 +459,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *(buffer + 0) = B1; *(buffer + 1) = A2; *b1 = A1; - } else + } else if (b2 == b1) { *(buffer + 0) = B1; *(buffer + 1) = A1; @@ -471,20 +471,20 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *b2 = A2; } } - + buffer += 2; b1 = a + ip1; b2 = a + ip2; - + a1 += 2; i --; } while (i > 0); } - + i = ((k2 - k1) & 1); - + if (i > 0) { A1 = *a1; B1 = *b1; @@ -499,5 +499,5 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint } return 0; -} +} diff --git a/kernel/generic/laswp_ncopy_8.c b/kernel/generic/laswp_ncopy_8.c index e08c8ceeb..bb7408c61 100644 --- a/kernel/generic/laswp_ncopy_8.c +++ b/kernel/generic/laswp_ncopy_8.c @@ -77,7 +77,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint do { ip = *piv; piv ++; - + dx1 = a + i; dy1 = a + ip; dx2 = a + i + lda * 1; @@ -123,7 +123,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint btemp7 = *dy7; atemp8 = *dx8; btemp8 = *dy8; - + if (ip != i) { *dy1 = atemp1; *dy2 = atemp2; @@ -151,12 +151,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *(buffer + 6) = atemp7; *(buffer + 7) = atemp8; } - + buffer += 8; i++; } while (i <= k2); - + a += 8 * lda; j --; } while (j > 0); @@ -164,10 +164,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint if (n & 4) { piv = ipiv; - + ip = *piv; piv ++; - + dx1 = a + k1; dy1 = a + ip; dx2 = a + k1 + lda * 1; @@ -178,7 +178,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint dy4 = a + ip + lda * 3; i = k1; - + do { atemp1 = *dx1; atemp2 = *dx2; @@ -189,7 +189,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint btemp2 = *dy2; btemp3 = *dy3; btemp4 = *dy4; - + if (ip != i) { *dy1 = atemp1; *dy2 = atemp2; @@ -205,10 +205,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *(buffer + 2) = atemp3; *(buffer + 3) = atemp4; } - + ip = *piv; piv ++; - + i++; dx1 = a + i; dy1 = a + ip; @@ -222,18 +222,18 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint buffer += 4; } while (i <= k2); - + a += 4 * lda; } if (n & 2) { piv = ipiv; - + i = k1; do { ip = *piv; piv ++; - + dx1 = a + i; dy1 = a + ip; dx2 = a + i + lda; @@ -243,7 +243,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint btemp1 = *dy1; atemp2 = *dx2; btemp2 = *dy2; - + if (ip != i) { *dy1 = atemp1; *dy2 = atemp2; @@ -253,44 +253,44 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *(buffer + 0) = atemp1; *(buffer + 1) = atemp2; } - + buffer += 2; i++; } while (i <= k2); - + a += 2 * lda; } if (n & 1) { piv = ipiv; - + i = k1; do { ip = *piv; piv ++; - + dx1 = a + i; dy1 = a + ip; atemp1 = *dx1; btemp1 = *dy1; - + if (ip != i) { *dy1 = atemp1; *buffer = btemp1; } else { *buffer = atemp1; } - + buffer ++; i++; } while (i <= k2); - + a += lda; } return 0; -} +} diff --git a/kernel/generic/neg_tcopy_1.c b/kernel/generic/neg_tcopy_1.c index 3845f0439..ff2339cb4 100644 --- a/kernel/generic/neg_tcopy_1.c +++ b/kernel/generic/neg_tcopy_1.c @@ -57,7 +57,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ b_offset1 = b_offset; b_offset ++; - + j = n; if (j > 0) { do { diff --git a/kernel/generic/neg_tcopy_16.c b/kernel/generic/neg_tcopy_16.c index 2d47b2764..a93372abe 100644 --- a/kernel/generic/neg_tcopy_16.c +++ b/kernel/generic/neg_tcopy_16.c @@ -69,7 +69,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 16; - + i = (m >> 1); if (i > 0){ do{ @@ -115,7 +115,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = -ctemp06; *(boffset + 6) = -ctemp07; *(boffset + 7) = -ctemp08; - + *(boffset + 8) = -ctemp09; *(boffset + 9) = -ctemp10; *(boffset + 10) = -ctemp11; @@ -124,7 +124,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 13) = -ctemp14; *(boffset + 14) = -ctemp15; *(boffset + 15) = -ctemp16; - + *(boffset + 16) = -ctemp17; *(boffset + 17) = -ctemp18; *(boffset + 18) = -ctemp19; @@ -168,7 +168,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp14 = *(aoffset1 + 13); ctemp15 = *(aoffset1 + 14); ctemp16 = *(aoffset1 + 15); - + *(boffset + 0) = -ctemp01; *(boffset + 1) = -ctemp02; *(boffset + 2) = -ctemp03; @@ -177,7 +177,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = -ctemp06; *(boffset + 6) = -ctemp07; *(boffset + 7) = -ctemp08; - + *(boffset + 8) = -ctemp09; *(boffset + 9) = -ctemp10; *(boffset + 10) = -ctemp11; @@ -198,7 +198,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 8; - + i = (m >> 1); if (i > 0){ do{ @@ -210,7 +210,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); - + ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); @@ -228,7 +228,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = -ctemp06; *(boffset + 6) = -ctemp07; *(boffset + 7) = -ctemp08; - + *(boffset + 8) = -ctemp09; *(boffset + 9) = -ctemp10; *(boffset + 10) = -ctemp11; @@ -237,15 +237,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 13) = -ctemp14; *(boffset + 14) = -ctemp15; *(boffset + 15) = -ctemp16; - + aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 16; - + i --; }while(i > 0); } - + if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); @@ -255,7 +255,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); - + *(boffset + 0) = -ctemp01; *(boffset + 1) = -ctemp02; *(boffset + 2) = -ctemp03; @@ -264,7 +264,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = -ctemp06; *(boffset + 6) = -ctemp07; *(boffset + 7) = -ctemp08; - + boffset += 8; } } @@ -273,7 +273,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 4; - + i = (m >> 1); if (i > 0){ do{ @@ -295,15 +295,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = -ctemp06; *(boffset + 6) = -ctemp07; *(boffset + 7) = -ctemp08; - + aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 8; - + i --; }while(i > 0); } - + if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); @@ -314,7 +314,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 1) = -ctemp02; *(boffset + 2) = -ctemp03; *(boffset + 3) = -ctemp04; - + boffset += 4; } } @@ -323,7 +323,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 2; - + i = (m >> 1); if (i > 0){ do{ @@ -336,15 +336,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 1) = -ctemp02; *(boffset + 2) = -ctemp03; *(boffset + 3) = -ctemp04; - + aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 4; - + i --; }while(i > 0); } - + if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); @@ -358,7 +358,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ if (n & 1){ aoffset1 = aoffset; aoffset2 = aoffset + lda; - + i = (m >> 1); if (i > 0){ do{ @@ -371,11 +371,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 2; - + i --; }while(i > 0); } - + if (m & 1){ ctemp01 = *(aoffset1 + 0); *(boffset + 0) = -ctemp01; diff --git a/kernel/generic/neg_tcopy_2.c b/kernel/generic/neg_tcopy_2.c index e4dfa0bce..572f6eb69 100644 --- a/kernel/generic/neg_tcopy_2.c +++ b/kernel/generic/neg_tcopy_2.c @@ -60,7 +60,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ b_offset1 = b_offset; b_offset += 4; - + j = (n >> 1); if (j > 0){ do { @@ -95,7 +95,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ j--; } while (j > 0); } - + if (n & 1){ *(b_offset2 + 0) = -*(a_offset + 0); } diff --git a/kernel/generic/neg_tcopy_4.c b/kernel/generic/neg_tcopy_4.c index 9fb1dc7f9..a080e0e06 100644 --- a/kernel/generic/neg_tcopy_4.c +++ b/kernel/generic/neg_tcopy_4.c @@ -64,7 +64,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ a_offset3 = a_offset2 + lda; a_offset4 = a_offset3 + lda; a_offset += 4 * lda; - + b_offset1 = b_offset; b_offset += 16; @@ -75,17 +75,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); - + ctemp5 = *(a_offset2 + 0); ctemp6 = *(a_offset2 + 1); ctemp7 = *(a_offset2 + 2); ctemp8 = *(a_offset2 + 3); - + ctemp9 = *(a_offset3 + 0); ctemp10 = *(a_offset3 + 1); ctemp11 = *(a_offset3 + 2); ctemp12 = *(a_offset3 + 3); - + ctemp13 = *(a_offset4 + 0); ctemp14 = *(a_offset4 + 1); ctemp15 = *(a_offset4 + 2); @@ -95,27 +95,27 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ a_offset2 += 4; a_offset3 += 4; a_offset4 += 4; - + *(b_offset1 + 0) = -ctemp1; *(b_offset1 + 1) = -ctemp2; *(b_offset1 + 2) = -ctemp3; *(b_offset1 + 3) = -ctemp4; - + *(b_offset1 + 4) = -ctemp5; *(b_offset1 + 5) = -ctemp6; *(b_offset1 + 6) = -ctemp7; *(b_offset1 + 7) = -ctemp8; - + *(b_offset1 + 8) = -ctemp9; *(b_offset1 + 9) = -ctemp10; *(b_offset1 + 10) = -ctemp11; *(b_offset1 + 11) = -ctemp12; - + *(b_offset1 + 12) = -ctemp13; *(b_offset1 + 13) = -ctemp14; *(b_offset1 + 14) = -ctemp15; *(b_offset1 + 15) = -ctemp16; - + b_offset1 += m * 4; i --; }while(i > 0); @@ -127,28 +127,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp3 = *(a_offset2 + 0); ctemp4 = *(a_offset2 + 1); - + ctemp5 = *(a_offset3 + 0); ctemp6 = *(a_offset3 + 1); - + ctemp7 = *(a_offset4 + 0); ctemp8 = *(a_offset4 + 1); - + a_offset1 += 2; a_offset2 += 2; a_offset3 += 2; a_offset4 += 2; - + *(b_offset2 + 0) = -ctemp1; *(b_offset2 + 1) = -ctemp2; *(b_offset2 + 2) = -ctemp3; *(b_offset2 + 3) = -ctemp4; - + *(b_offset2 + 4) = -ctemp5; *(b_offset2 + 5) = -ctemp6; *(b_offset2 + 6) = -ctemp7; *(b_offset2 + 7) = -ctemp8; - + b_offset2 += 8; } @@ -157,12 +157,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp2 = *(a_offset2 + 0); ctemp3 = *(a_offset3 + 0); ctemp4 = *(a_offset4 + 0); - + *(b_offset3 + 0) = -ctemp1; *(b_offset3 + 1) = -ctemp2; *(b_offset3 + 2) = -ctemp3; *(b_offset3 + 3) = -ctemp4; - + b_offset3 += 4; } @@ -174,10 +174,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset += 2 * lda; - + b_offset1 = b_offset; b_offset += 8; - + i = (n >> 2); if (i > 0){ do{ @@ -185,20 +185,20 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); - + ctemp5 = *(a_offset2 + 0); ctemp6 = *(a_offset2 + 1); ctemp7 = *(a_offset2 + 2); ctemp8 = *(a_offset2 + 3); - + a_offset1 += 4; a_offset2 += 4; - + *(b_offset1 + 0) = -ctemp1; *(b_offset1 + 1) = -ctemp2; *(b_offset1 + 2) = -ctemp3; *(b_offset1 + 3) = -ctemp4; - + *(b_offset1 + 4) = -ctemp5; *(b_offset1 + 5) = -ctemp6; *(b_offset1 + 6) = -ctemp7; @@ -212,25 +212,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ if (n & 2) { ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); - + ctemp3 = *(a_offset2 + 0); ctemp4 = *(a_offset2 + 1); - + a_offset1 += 2; a_offset2 += 2; - + *(b_offset2 + 0) = -ctemp1; *(b_offset2 + 1) = -ctemp2; *(b_offset2 + 2) = -ctemp3; *(b_offset2 + 3) = -ctemp4; - + b_offset2 += 4; } - + if (n & 1) { ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset2 + 0); - + *(b_offset3 + 0) = -ctemp1; *(b_offset3 + 1) = -ctemp2; b_offset3 += 2; @@ -240,7 +240,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ if (m & 1){ a_offset1 = a_offset; b_offset1 = b_offset; - + i = (n >> 2); if (i > 0){ do{ @@ -248,9 +248,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); - + a_offset1 += 4; - + *(b_offset1 + 0) = -ctemp1; *(b_offset1 + 1) = -ctemp2; *(b_offset1 + 2) = -ctemp3; @@ -266,11 +266,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); a_offset1 += 2; - + *(b_offset2 + 0) = -ctemp1; *(b_offset2 + 1) = -ctemp2; } - + if (n & 1) { ctemp1 = *(a_offset1 + 0); *(b_offset3 + 0) = -ctemp1; diff --git a/kernel/generic/neg_tcopy_8.c b/kernel/generic/neg_tcopy_8.c index 97fec3bd4..a45ecc7d4 100644 --- a/kernel/generic/neg_tcopy_8.c +++ b/kernel/generic/neg_tcopy_8.c @@ -105,7 +105,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); aoffset1 += 8; - + ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); @@ -115,7 +115,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); aoffset2 += 8; - + ctemp17 = *(aoffset3 + 0); ctemp18 = *(aoffset3 + 1); ctemp19 = *(aoffset3 + 2); @@ -125,7 +125,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp23 = *(aoffset3 + 6); ctemp24 = *(aoffset3 + 7); aoffset3 += 8; - + ctemp25 = *(aoffset4 + 0); ctemp26 = *(aoffset4 + 1); ctemp27 = *(aoffset4 + 2); @@ -145,7 +145,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp39 = *(aoffset5 + 6); ctemp40 = *(aoffset5 + 7); aoffset5 += 8; - + ctemp41 = *(aoffset6 + 0); ctemp42 = *(aoffset6 + 1); ctemp43 = *(aoffset6 + 2); @@ -155,7 +155,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp47 = *(aoffset6 + 6); ctemp48 = *(aoffset6 + 7); aoffset6 += 8; - + ctemp49 = *(aoffset7 + 0); ctemp50 = *(aoffset7 + 1); ctemp51 = *(aoffset7 + 2); @@ -165,7 +165,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp55 = *(aoffset7 + 6); ctemp56 = *(aoffset7 + 7); aoffset7 += 8; - + ctemp57 = *(aoffset8 + 0); ctemp58 = *(aoffset8 + 1); ctemp59 = *(aoffset8 + 2); @@ -175,7 +175,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp63 = *(aoffset8 + 6); ctemp64 = *(aoffset8 + 7); aoffset8 += 8; - + *(boffset1 + 0) = -ctemp01; *(boffset1 + 1) = -ctemp02; *(boffset1 + 2) = -ctemp03; @@ -184,7 +184,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset1 + 5) = -ctemp06; *(boffset1 + 6) = -ctemp07; *(boffset1 + 7) = -ctemp08; - + *(boffset1 + 8) = -ctemp09; *(boffset1 + 9) = -ctemp10; *(boffset1 + 10) = -ctemp11; @@ -202,7 +202,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset1 + 21) = -ctemp22; *(boffset1 + 22) = -ctemp23; *(boffset1 + 23) = -ctemp24; - + *(boffset1 + 24) = -ctemp25; *(boffset1 + 25) = -ctemp26; *(boffset1 + 26) = -ctemp27; @@ -220,7 +220,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset1 + 37) = -ctemp38; *(boffset1 + 38) = -ctemp39; *(boffset1 + 39) = -ctemp40; - + *(boffset1 + 40) = -ctemp41; *(boffset1 + 41) = -ctemp42; *(boffset1 + 42) = -ctemp43; @@ -238,7 +238,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset1 + 53) = -ctemp54; *(boffset1 + 54) = -ctemp55; *(boffset1 + 55) = -ctemp56; - + *(boffset1 + 56) = -ctemp57; *(boffset1 + 57) = -ctemp58; *(boffset1 + 58) = -ctemp59; @@ -247,7 +247,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset1 + 61) = -ctemp62; *(boffset1 + 62) = -ctemp63; *(boffset1 + 63) = -ctemp64; - + boffset1 += m * 8; i --; }while(i > 0); @@ -259,49 +259,49 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); aoffset1 += 4; - + ctemp05 = *(aoffset2 + 0); ctemp06 = *(aoffset2 + 1); ctemp07 = *(aoffset2 + 2); ctemp08 = *(aoffset2 + 3); aoffset2 += 4; - + ctemp09 = *(aoffset3 + 0); ctemp10 = *(aoffset3 + 1); ctemp11 = *(aoffset3 + 2); ctemp12 = *(aoffset3 + 3); aoffset3 += 4; - + ctemp13 = *(aoffset4 + 0); ctemp14 = *(aoffset4 + 1); ctemp15 = *(aoffset4 + 2); ctemp16 = *(aoffset4 + 3); aoffset4 += 4; - + ctemp17 = *(aoffset5 + 0); ctemp18 = *(aoffset5 + 1); ctemp19 = *(aoffset5 + 2); ctemp20 = *(aoffset5 + 3); aoffset5 += 4; - + ctemp21 = *(aoffset6 + 0); ctemp22 = *(aoffset6 + 1); ctemp23 = *(aoffset6 + 2); ctemp24 = *(aoffset6 + 3); aoffset6 += 4; - + ctemp25 = *(aoffset7 + 0); ctemp26 = *(aoffset7 + 1); ctemp27 = *(aoffset7 + 2); ctemp28 = *(aoffset7 + 3); aoffset7 += 4; - + ctemp29 = *(aoffset8 + 0); ctemp30 = *(aoffset8 + 1); ctemp31 = *(aoffset8 + 2); ctemp32 = *(aoffset8 + 3); aoffset8 += 4; - + *(boffset2 + 0) = -ctemp01; *(boffset2 + 1) = -ctemp02; *(boffset2 + 2) = -ctemp03; @@ -343,35 +343,35 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); aoffset1 += 2; - + ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); aoffset2 += 2; - + ctemp05 = *(aoffset3 + 0); ctemp06 = *(aoffset3 + 1); aoffset3 += 2; - + ctemp07 = *(aoffset4 + 0); ctemp08 = *(aoffset4 + 1); aoffset4 += 2; - + ctemp09 = *(aoffset5 + 0); ctemp10 = *(aoffset5 + 1); aoffset5 += 2; - + ctemp11 = *(aoffset6 + 0); ctemp12 = *(aoffset6 + 1); aoffset6 += 2; - + ctemp13 = *(aoffset7 + 0); ctemp14 = *(aoffset7 + 1); aoffset7 += 2; - + ctemp15 = *(aoffset8 + 0); ctemp16 = *(aoffset8 + 1); aoffset8 += 2; - + *(boffset3 + 0) = -ctemp01; *(boffset3 + 1) = -ctemp02; *(boffset3 + 2) = -ctemp03; @@ -408,7 +408,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset7 ++; ctemp08 = *(aoffset8 + 0); aoffset8 ++; - + *(boffset4 + 0) = -ctemp01; *(boffset4 + 1) = -ctemp02; *(boffset4 + 2) = -ctemp03; @@ -431,10 +431,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset3 = aoffset2 + lda; aoffset4 = aoffset3 + lda; aoffset += 4 * lda; - + boffset1 = boffset; boffset += 32; - + i = (n >> 3); if (i > 0){ @@ -448,7 +448,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); aoffset1 += 8; - + ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); @@ -458,7 +458,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); aoffset2 += 8; - + ctemp17 = *(aoffset3 + 0); ctemp18 = *(aoffset3 + 1); ctemp19 = *(aoffset3 + 2); @@ -468,7 +468,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp23 = *(aoffset3 + 6); ctemp24 = *(aoffset3 + 7); aoffset3 += 8; - + ctemp25 = *(aoffset4 + 0); ctemp26 = *(aoffset4 + 1); ctemp27 = *(aoffset4 + 2); @@ -478,7 +478,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp31 = *(aoffset4 + 6); ctemp32 = *(aoffset4 + 7); aoffset4 += 8; - + *(boffset1 + 0) = -ctemp01; *(boffset1 + 1) = -ctemp02; *(boffset1 + 2) = -ctemp03; @@ -487,7 +487,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset1 + 5) = -ctemp06; *(boffset1 + 6) = -ctemp07; *(boffset1 + 7) = -ctemp08; - + *(boffset1 + 8) = -ctemp09; *(boffset1 + 9) = -ctemp10; *(boffset1 + 10) = -ctemp11; @@ -496,7 +496,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset1 + 13) = -ctemp14; *(boffset1 + 14) = -ctemp15; *(boffset1 + 15) = -ctemp16; - + *(boffset1 + 16) = -ctemp17; *(boffset1 + 17) = -ctemp18; *(boffset1 + 18) = -ctemp19; @@ -526,25 +526,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); aoffset1 += 4; - + ctemp05 = *(aoffset2 + 0); ctemp06 = *(aoffset2 + 1); ctemp07 = *(aoffset2 + 2); ctemp08 = *(aoffset2 + 3); aoffset2 += 4; - + ctemp09 = *(aoffset3 + 0); ctemp10 = *(aoffset3 + 1); ctemp11 = *(aoffset3 + 2); ctemp12 = *(aoffset3 + 3); aoffset3 += 4; - + ctemp13 = *(aoffset4 + 0); ctemp14 = *(aoffset4 + 1); ctemp15 = *(aoffset4 + 2); ctemp16 = *(aoffset4 + 3); aoffset4 += 4; - + *(boffset2 + 0) = -ctemp01; *(boffset2 + 1) = -ctemp02; *(boffset2 + 2) = -ctemp03; @@ -553,7 +553,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset2 + 5) = -ctemp06; *(boffset2 + 6) = -ctemp07; *(boffset2 + 7) = -ctemp08; - + *(boffset2 + 8) = -ctemp09; *(boffset2 + 9) = -ctemp10; *(boffset2 + 10) = -ctemp11; @@ -564,24 +564,24 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset2 + 15) = -ctemp16; boffset2 += 16; } - + if (n & 2){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); aoffset1 += 2; - + ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); aoffset2 += 2; - + ctemp05 = *(aoffset3 + 0); ctemp06 = *(aoffset3 + 1); aoffset3 += 2; - + ctemp07 = *(aoffset4 + 0); ctemp08 = *(aoffset4 + 1); aoffset4 += 2; - + *(boffset3 + 0) = -ctemp01; *(boffset3 + 1) = -ctemp02; *(boffset3 + 2) = -ctemp03; @@ -592,7 +592,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset3 + 7) = -ctemp08; boffset3 += 8; } - + if (n & 1){ ctemp01 = *(aoffset1 + 0); aoffset1 ++; @@ -602,7 +602,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset3 ++; ctemp04 = *(aoffset4 + 0); aoffset4 ++; - + *(boffset4 + 0) = -ctemp01; *(boffset4 + 1) = -ctemp02; *(boffset4 + 2) = -ctemp03; @@ -610,15 +610,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ boffset4 += 4; } } - + if (m & 2){ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset += 2 * lda; - + boffset1 = boffset; boffset += 16; - + i = (n >> 3); if (i > 0){ do{ @@ -631,7 +631,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); aoffset1 += 8; - + ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); @@ -641,7 +641,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); aoffset2 += 8; - + *(boffset1 + 0) = -ctemp01; *(boffset1 + 1) = -ctemp02; *(boffset1 + 2) = -ctemp03; @@ -650,7 +650,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset1 + 5) = -ctemp06; *(boffset1 + 6) = -ctemp07; *(boffset1 + 7) = -ctemp08; - + *(boffset1 + 8) = -ctemp09; *(boffset1 + 9) = -ctemp10; *(boffset1 + 10) = -ctemp11; @@ -659,25 +659,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset1 + 13) = -ctemp14; *(boffset1 + 14) = -ctemp15; *(boffset1 + 15) = -ctemp16; - + boffset1 += 8 * m; i --; }while(i > 0); } - + if (n & 4){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); aoffset1 += 4; - + ctemp05 = *(aoffset2 + 0); ctemp06 = *(aoffset2 + 1); ctemp07 = *(aoffset2 + 2); ctemp08 = *(aoffset2 + 3); aoffset2 += 4; - + *(boffset2 + 0) = -ctemp01; *(boffset2 + 1) = -ctemp02; *(boffset2 + 2) = -ctemp03; @@ -688,29 +688,29 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset2 + 7) = -ctemp08; boffset2 += 8; } - + if (n & 2){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); aoffset1 += 2; - + ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); aoffset2 += 2; - + *(boffset3 + 0) = -ctemp01; *(boffset3 + 1) = -ctemp02; *(boffset3 + 2) = -ctemp03; *(boffset3 + 3) = -ctemp04; boffset3 += 4; } - + if (n & 1){ ctemp01 = *(aoffset1 + 0); aoffset1 ++; ctemp02 = *(aoffset2 + 0); aoffset2 ++; - + *(boffset4 + 0) = -ctemp01; *(boffset4 + 1) = -ctemp02; boffset4 += 2; @@ -720,10 +720,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ if (m & 1){ aoffset1 = aoffset; aoffset += lda; - + boffset1 = boffset; boffset += 8; - + i = (n >> 3); if (i > 0){ do{ @@ -736,7 +736,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); aoffset1 += 8; - + *(boffset1 + 0) = -ctemp01; *(boffset1 + 1) = -ctemp02; *(boffset1 + 2) = -ctemp03; @@ -745,7 +745,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset1 + 5) = -ctemp06; *(boffset1 + 6) = -ctemp07; *(boffset1 + 7) = -ctemp08; - + boffset1 += 8 * m; i --; }while(i > 0); @@ -774,7 +774,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset3 + 1) = -ctemp02; boffset3 += 2; } - + if (n & 1){ ctemp01 = *(aoffset1 + 0); aoffset1 ++; diff --git a/kernel/generic/symm_lcopy_1.c b/kernel/generic/symm_lcopy_1.c index 7b6cfea27..6ec51b835 100644 --- a/kernel/generic/symm_lcopy_1.c +++ b/kernel/generic/symm_lcopy_1.c @@ -50,14 +50,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); - + if (offset > 0) ao1 += lda; else ao1 ++; b[ 0] = data01; diff --git a/kernel/generic/symm_lcopy_16.c b/kernel/generic/symm_lcopy_16.c index 2c8ad81d0..477546f26 100644 --- a/kernel/generic/symm_lcopy_16.c +++ b/kernel/generic/symm_lcopy_16.c @@ -52,7 +52,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; @@ -106,7 +106,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (offset > -13) ao14 += lda; else ao14 ++; if (offset > -14) ao15 += lda; else ao15 ++; if (offset > -15) ao16 += lda; else ao16 ++; - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -137,7 +137,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 8) { offset = posX - posY; - + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; @@ -158,7 +158,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao6 + 0); data07 = *(ao7 + 0); data08 = *(ao8 + 0); - + if (offset > 0) ao1 += lda; else ao1 ++; if (offset > -1) ao2 += lda; else ao2 ++; if (offset > -2) ao3 += lda; else ao3 ++; @@ -188,7 +188,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 4) { offset = posX - posY; - + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; @@ -201,7 +201,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao2 + 0); data03 = *(ao3 + 0); data04 = *(ao4 + 0); - + if (offset > 0) ao1 += lda; else ao1 ++; if (offset > -1) ao2 += lda; else ao2 ++; if (offset > -2) ao3 += lda; else ao3 ++; @@ -223,7 +223,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 2) { offset = posX - posY; - + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; @@ -232,7 +232,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); - + if (offset > 0) ao1 += lda; else ao1 ++; if (offset > -1) ao2 += lda; else ao2 ++; @@ -250,14 +250,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); - + if (offset > 0) ao1 += lda; else ao1 ++; b[ 0] = data01; diff --git a/kernel/generic/symm_lcopy_2.c b/kernel/generic/symm_lcopy_2.c index e7944c447..2337d5ca6 100644 --- a/kernel/generic/symm_lcopy_2.c +++ b/kernel/generic/symm_lcopy_2.c @@ -50,7 +50,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; @@ -59,7 +59,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); - + if (offset > 0) ao1 += lda; else ao1 ++; if (offset > -1) ao2 += lda; else ao2 ++; @@ -79,14 +79,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); - + if (offset > 0) ao1 += lda; else ao1 ++; b[ 0] = data01; diff --git a/kernel/generic/symm_lcopy_4.c b/kernel/generic/symm_lcopy_4.c index ac04943e2..ca730e1ee 100644 --- a/kernel/generic/symm_lcopy_4.c +++ b/kernel/generic/symm_lcopy_4.c @@ -50,7 +50,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; @@ -63,7 +63,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao2 + 0); data03 = *(ao3 + 0); data04 = *(ao4 + 0); - + if (offset > 0) ao1 += lda; else ao1 ++; if (offset > -1) ao2 += lda; else ao2 ++; if (offset > -2) ao3 += lda; else ao3 ++; @@ -87,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 2) { offset = posX - posY; - + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; @@ -96,7 +96,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); - + if (offset > 0) ao1 += lda; else ao1 ++; if (offset > -1) ao2 += lda; else ao2 ++; @@ -115,14 +115,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); - + if (offset > 0) ao1 += lda; else ao1 ++; b[ 0] = data01; diff --git a/kernel/generic/symm_lcopy_6.c b/kernel/generic/symm_lcopy_6.c index ac04943e2..ca730e1ee 100644 --- a/kernel/generic/symm_lcopy_6.c +++ b/kernel/generic/symm_lcopy_6.c @@ -50,7 +50,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; @@ -63,7 +63,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao2 + 0); data03 = *(ao3 + 0); data04 = *(ao4 + 0); - + if (offset > 0) ao1 += lda; else ao1 ++; if (offset > -1) ao2 += lda; else ao2 ++; if (offset > -2) ao3 += lda; else ao3 ++; @@ -87,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 2) { offset = posX - posY; - + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; @@ -96,7 +96,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); - + if (offset > 0) ao1 += lda; else ao1 ++; if (offset > -1) ao2 += lda; else ao2 ++; @@ -115,14 +115,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); - + if (offset > 0) ao1 += lda; else ao1 ++; b[ 0] = data01; diff --git a/kernel/generic/symm_lcopy_8.c b/kernel/generic/symm_lcopy_8.c index c315574ea..11dae9ac1 100644 --- a/kernel/generic/symm_lcopy_8.c +++ b/kernel/generic/symm_lcopy_8.c @@ -50,7 +50,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; @@ -71,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao6 + 0); data07 = *(ao7 + 0); data08 = *(ao8 + 0); - + if (offset > 0) ao1 += lda; else ao1 ++; if (offset > -1) ao2 += lda; else ao2 ++; if (offset > -2) ao3 += lda; else ao3 ++; @@ -102,7 +102,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 4) { offset = posX - posY; - + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; @@ -115,7 +115,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao2 + 0); data03 = *(ao3 + 0); data04 = *(ao4 + 0); - + if (offset > 0) ao1 += lda; else ao1 ++; if (offset > -1) ao2 += lda; else ao2 ++; if (offset > -2) ao3 += lda; else ao3 ++; @@ -137,7 +137,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 2) { offset = posX - posY; - + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; @@ -146,7 +146,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); - + if (offset > 0) ao1 += lda; else ao1 ++; if (offset > -1) ao2 += lda; else ao2 ++; @@ -164,14 +164,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); - + if (offset > 0) ao1 += lda; else ao1 ++; b[ 0] = data01; diff --git a/kernel/generic/symm_ucopy_1.c b/kernel/generic/symm_ucopy_1.c index 4ab9bb422..d87500ffe 100644 --- a/kernel/generic/symm_ucopy_1.c +++ b/kernel/generic/symm_ucopy_1.c @@ -50,14 +50,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); - + if (offset > 0) ao1 ++; else ao1 += lda; b[ 0] = data01; diff --git a/kernel/generic/symm_ucopy_16.c b/kernel/generic/symm_ucopy_16.c index 094810b97..9b671db8d 100644 --- a/kernel/generic/symm_ucopy_16.c +++ b/kernel/generic/symm_ucopy_16.c @@ -52,7 +52,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; @@ -89,7 +89,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data14 = *(ao14 + 0); data15 = *(ao15 + 0); data16 = *(ao16 + 0); - + if (offset > 0) ao1 ++; else ao1 += lda; if (offset > -1) ao2 ++; else ao2 += lda; if (offset > -2) ao3 ++; else ao3 += lda; @@ -137,7 +137,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 8) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; @@ -158,7 +158,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao6 + 0); data07 = *(ao7 + 0); data08 = *(ao8 + 0); - + if (offset > 0) ao1 ++; else ao1 += lda; if (offset > -1) ao2 ++; else ao2 += lda; if (offset > -2) ao3 ++; else ao3 += lda; @@ -189,7 +189,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 4) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; @@ -202,7 +202,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao2 + 0); data03 = *(ao3 + 0); data04 = *(ao4 + 0); - + if (offset > 0) ao1 ++; else ao1 += lda; if (offset > -1) ao2 ++; else ao2 += lda; if (offset > -2) ao3 ++; else ao3 += lda; @@ -224,7 +224,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 2) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; @@ -233,7 +233,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); - + if (offset > 0) ao1 ++; else ao1 += lda; if (offset > -1) ao2 ++; else ao2 += lda; @@ -245,20 +245,20 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON offset --; i --; } - + posX += 2; } if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); - + if (offset > 0) ao1 ++; else ao1 += lda; b[ 0] = data01; diff --git a/kernel/generic/symm_ucopy_2.c b/kernel/generic/symm_ucopy_2.c index 6396b746b..56df894b1 100644 --- a/kernel/generic/symm_ucopy_2.c +++ b/kernel/generic/symm_ucopy_2.c @@ -50,7 +50,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; @@ -59,7 +59,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); - + if (offset > 0) ao1 ++; else ao1 += lda; if (offset > -1) ao2 ++; else ao2 += lda; @@ -78,14 +78,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); - + if (offset > 0) ao1 ++; else ao1 += lda; b[ 0] = data01; diff --git a/kernel/generic/symm_ucopy_4.c b/kernel/generic/symm_ucopy_4.c index 9b9cff820..6dbb861e9 100644 --- a/kernel/generic/symm_ucopy_4.c +++ b/kernel/generic/symm_ucopy_4.c @@ -50,7 +50,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; @@ -63,7 +63,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao2 + 0); data03 = *(ao3 + 0); data04 = *(ao4 + 0); - + if (offset > 0) ao1 ++; else ao1 += lda; if (offset > -1) ao2 ++; else ao2 += lda; if (offset > -2) ao3 ++; else ao3 += lda; @@ -86,7 +86,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 2) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; @@ -95,7 +95,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); - + if (offset > 0) ao1 ++; else ao1 += lda; if (offset > -1) ao2 ++; else ao2 += lda; @@ -107,20 +107,20 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON offset --; i --; } - + posX += 2; } if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); - + if (offset > 0) ao1 ++; else ao1 += lda; b[ 0] = data01; diff --git a/kernel/generic/symm_ucopy_6.c b/kernel/generic/symm_ucopy_6.c index 9b9cff820..6dbb861e9 100644 --- a/kernel/generic/symm_ucopy_6.c +++ b/kernel/generic/symm_ucopy_6.c @@ -50,7 +50,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; @@ -63,7 +63,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao2 + 0); data03 = *(ao3 + 0); data04 = *(ao4 + 0); - + if (offset > 0) ao1 ++; else ao1 += lda; if (offset > -1) ao2 ++; else ao2 += lda; if (offset > -2) ao3 ++; else ao3 += lda; @@ -86,7 +86,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 2) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; @@ -95,7 +95,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); - + if (offset > 0) ao1 ++; else ao1 += lda; if (offset > -1) ao2 ++; else ao2 += lda; @@ -107,20 +107,20 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON offset --; i --; } - + posX += 2; } if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); - + if (offset > 0) ao1 ++; else ao1 += lda; b[ 0] = data01; diff --git a/kernel/generic/symm_ucopy_8.c b/kernel/generic/symm_ucopy_8.c index 411768ba5..3da9385a4 100644 --- a/kernel/generic/symm_ucopy_8.c +++ b/kernel/generic/symm_ucopy_8.c @@ -50,7 +50,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; @@ -71,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao6 + 0); data07 = *(ao7 + 0); data08 = *(ao8 + 0); - + if (offset > 0) ao1 ++; else ao1 += lda; if (offset > -1) ao2 ++; else ao2 += lda; if (offset > -2) ao3 ++; else ao3 += lda; @@ -103,7 +103,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 4) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; @@ -116,7 +116,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao2 + 0); data03 = *(ao3 + 0); data04 = *(ao4 + 0); - + if (offset > 0) ao1 ++; else ao1 += lda; if (offset > -1) ao2 ++; else ao2 += lda; if (offset > -2) ao3 ++; else ao3 += lda; @@ -138,7 +138,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 2) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; @@ -147,7 +147,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); - + if (offset > 0) ao1 ++; else ao1 += lda; if (offset > -1) ao2 ++; else ao2 += lda; @@ -159,20 +159,20 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON offset --; i --; } - + posX += 2; } if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; i = m; while (i > 0) { data01 = *(ao1 + 0); - + if (offset > 0) ao1 ++; else ao1 += lda; b[ 0] = data01; diff --git a/kernel/generic/symv_k.c b/kernel/generic/symv_k.c index bd882fe85..c5817e7f9 100644 --- a/kernel/generic/symv_k.c +++ b/kernel/generic/symv_k.c @@ -72,15 +72,15 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, for(is = 0; is < offset; is += SYMV_P){ min_i = MIN(offset - is, SYMV_P); #endif - + #ifndef LOWER if (is >0){ - GEMV_T(is, min_i, 0, alpha, + GEMV_T(is, min_i, 0, alpha, a + is * lda, lda, X, 1, Y + is, 1, gemvbuffer); - GEMV_N(is, min_i, 0, alpha, + GEMV_N(is, min_i, 0, alpha, a + is * lda, lda, X + is, 1, Y, 1, gemvbuffer); @@ -92,20 +92,20 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, #else SYMCOPY_U(min_i, a + is + is * lda, lda, symbuffer); #endif - - GEMV_N(min_i, min_i, 0, alpha, + + GEMV_N(min_i, min_i, 0, alpha, symbuffer, min_i, - X + is, 1, + X + is, 1, Y + is, 1, gemvbuffer); #ifdef LOWER if (m - is > min_i){ - GEMV_T(m - is - min_i, min_i, 0, alpha, + GEMV_T(m - is - min_i, min_i, 0, alpha, a + (is + min_i) + is * lda, lda, X + (is + min_i), 1, Y + is, 1, gemvbuffer); - - GEMV_N(m - is - min_i, min_i, 0, alpha, + + GEMV_N(m - is - min_i, min_i, 0, alpha, a + (is + min_i) + is * lda, lda, X + is, 1, Y + (is + min_i), 1, gemvbuffer); diff --git a/kernel/generic/trmm_lncopy_1.c b/kernel/generic/trmm_lncopy_1.c index 66e407f80..542c4c361 100644 --- a/kernel/generic/trmm_lncopy_1.c +++ b/kernel/generic/trmm_lncopy_1.c @@ -64,7 +64,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 0] = data01; ao1 += 1; b += 1; - } else + } else if (X < posY) { ao1 += lda; b += 1; diff --git a/kernel/generic/trmm_lncopy_16.c b/kernel/generic/trmm_lncopy_16.c index a18340297..0795a8386 100644 --- a/kernel/generic/trmm_lncopy_16.c +++ b/kernel/generic/trmm_lncopy_16.c @@ -88,13 +88,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a15 = a + posX + (posY + 14) * lda; a16 = a + posX + (posY + 15) * lda; } - + i = (m >> 4); if (i > 0) { do { if (X > posY) { for (ii = 0; ii < 16; ii++){ - + b[ 0] = *(a01 + 0); b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); @@ -103,7 +103,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(a06 + 0); b[ 6] = *(a07 + 0); b[ 7] = *(a08 + 0); - + b[ 8] = *(a09 + 0); b[ 9] = *(a10 + 0); b[ 10] = *(a11 + 0); @@ -112,7 +112,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 13] = *(a14 + 0); b[ 14] = *(a15 + 0); b[ 15] = *(a16 + 0); - + a01 ++; a02 ++; a03 ++; @@ -131,7 +131,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a16 ++; b += 16; } - } else + } else if (X < posY) { a01 += 16 * lda; a02 += 16 * lda; @@ -171,7 +171,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 13] = ZERO; b[ 14] = ZERO; b[ 15] = ZERO; - + b[ 16] = *(a01 + 1); #ifdef UNIT b[ 17] = ONE; @@ -504,7 +504,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a15 += 16; a16 += 16; b += 256; - + } X += 16; @@ -514,10 +514,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 15); if (i) { - + if (X > posY) { for (ii = 0; ii < i; ii++){ - + b[ 0] = *(a01 + 0); b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); @@ -526,7 +526,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(a06 + 0); b[ 6] = *(a07 + 0); b[ 7] = *(a08 + 0); - + b[ 8] = *(a09 + 0); b[ 9] = *(a10 + 0); b[ 10] = *(a11 + 0); @@ -535,7 +535,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 13] = *(a14 + 0); b[ 14] = *(a15 + 0); b[ 15] = *(a16 + 0); - + a01 ++; a02 ++; a03 ++; @@ -554,7 +554,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a16 ++; b += 16; } - } else + } else if (X < posY) { a01 += i * lda; a02 += i * lda; @@ -968,7 +968,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON do { if (X > posY) { for (ii = 0; ii < 8; ii++){ - + b[ 0] = *(a01 + 0); b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); @@ -977,7 +977,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(a06 + 0); b[ 6] = *(a07 + 0); b[ 7] = *(a08 + 0); - + a01 ++; a02 ++; a03 ++; @@ -988,7 +988,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a08 ++; b += 8; } - } else + } else if (X < posY) { a01 += 8 * lda; a02 += 8 * lda; @@ -1012,7 +1012,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; - + b[ 8] = *(a01 + 1); #ifdef UNIT b[ 9] = ONE; @@ -1122,10 +1122,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 7); if (i) { - + if (X > posY) { for (ii = 0; ii < i; ii++){ - + b[ 0] = *(a01 + 0); b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); @@ -1134,7 +1134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(a06 + 0); b[ 6] = *(a07 + 0); b[ 7] = *(a08 + 0); - + a01 ++; a02 ++; a03 ++; @@ -1145,7 +1145,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a08 ++; b += 8; } - } else + } else if (X < posY) { a01 += i * lda; a02 += i * lda; @@ -1293,19 +1293,19 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON do { if (X > posY) { for (ii = 0; ii < 4; ii++){ - + b[ 0] = *(a01 + 0); b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); b[ 3] = *(a04 + 0); - + a01 ++; a02 ++; a03 ++; a04 ++; b += 4; } - } else + } else if (X < posY) { a01 += 4 * lda; a02 += 4 * lda; @@ -1321,7 +1321,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; - + b[ 4] = *(a01 + 1); #ifdef UNIT b[ 5] = ONE; @@ -1363,22 +1363,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 3); if (i) { - + if (X > posY) { for (ii = 0; ii < i; ii++){ - + b[ 0] = *(a01 + 0); b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); b[ 3] = *(a04 + 0); - + a01 ++; a02 ++; a03 ++; a04 ++; b += 4; } - } else + } else if (X < posY) { a01 += i * lda; a02 += i * lda; @@ -1447,7 +1447,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a01 += 2; a02 += 2; b += 4; - } else + } else if (X < posY) { a01 += 2 * lda; a02 += 2 * lda; @@ -1478,7 +1478,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } if (m & 1) { - + if (X > posY) { b[ 0] = *(a01 + 0); b[ 1] = *(a02 + 0); @@ -1486,7 +1486,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a01 ++; a02 ++; b += 2; - } else + } else if (X < posY) { a01 += lda; a02 += lda; @@ -1520,7 +1520,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 0] = *(a01 + 0); a01 += 1; b += 1; - } else + } else if (X < posY) { a01 += lda; b += 1; diff --git a/kernel/generic/trmm_lncopy_2.c b/kernel/generic/trmm_lncopy_2.c index f7fefaaad..ed28b661b 100644 --- a/kernel/generic/trmm_lncopy_2.c +++ b/kernel/generic/trmm_lncopy_2.c @@ -69,7 +69,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); - + b[ 0] = data01; b[ 1] = data03; b[ 2] = data02; @@ -78,7 +78,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao1 += 2; ao2 += 2; b += 4; - } else + } else if (X < posY) { ao1 += 2 * lda; ao2 += 2 * lda; @@ -113,31 +113,31 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } if (m & 1) { - + if (X > posY) { data01 = *(ao1 + 0); data03 = *(ao2 + 0); - + b[ 0] = data01; b[ 1] = data03; ao1 += 1; ao2 += 1; b += 2; - } else + } else if (X < posY) { ao1 += lda; b += 2; } else { #ifdef UNIT data03 = *(ao2 + 0); - + b[ 0] = ONE; b[ 1] = data03; #else data01 = *(ao1 + 0); data03 = *(ao2 + 0); - + b[ 0] = data01; b[ 1] = data03; #endif @@ -171,7 +171,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 0] = data01; ao1 += 1; b += 1; - } else + } else if (X < posY) { ao1 += lda; b += 1; diff --git a/kernel/generic/trmm_lncopy_4.c b/kernel/generic/trmm_lncopy_4.c index 6cd16673a..0dcfb965a 100644 --- a/kernel/generic/trmm_lncopy_4.c +++ b/kernel/generic/trmm_lncopy_4.c @@ -74,22 +74,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + data05 = *(ao2 + 0); data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); - + data09 = *(ao3 + 0); data10 = *(ao3 + 1); data11 = *(ao3 + 2); data12 = *(ao3 + 3); - + data13 = *(ao4 + 0); data14 = *(ao4 + 1); data15 = *(ao4 + 2); data16 = *(ao4 + 3); - + b[ 0] = data01; b[ 1] = data05; b[ 2] = data09; @@ -98,7 +98,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data10; b[ 7] = data14; - + b[ 8] = data03; b[ 9] = data07; b[10] = data11; @@ -107,14 +107,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[13] = data08; b[14] = data12; b[15] = data16; - + ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; b += 16; - } else + } else if (X < posY) { ao1 += 4 * lda; ao2 += 4 * lda; @@ -127,10 +127,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + data07 = *(ao2 + 2); data08 = *(ao2 + 3); - + data12 = *(ao3 + 3); b[ 0] = ONE; @@ -141,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = ONE; b[ 6] = ZERO; b[ 7] = ZERO; - + b[ 8] = data03; b[ 9] = data07; b[10] = ONE; @@ -155,16 +155,16 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); - + data11 = *(ao3 + 2); data12 = *(ao3 + 3); - + data16 = *(ao4 + 3); - + b[ 0] = data01; b[ 1] = ZERO; b[ 2] = ZERO; @@ -173,7 +173,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = ZERO; b[ 7] = ZERO; - + b[ 8] = data03; b[ 9] = data07; b[10] = data11; @@ -197,7 +197,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 3); if (i) { - + if (X > posY) { if (m & 2) { @@ -225,7 +225,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao4 += 2; b += 8; } - + if (m & 1) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); @@ -236,28 +236,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; - + ao1 += 1; ao2 += 1; ao3 += 1; ao4 += 1; b += 4; } - - } else + + } else if (X < posY) { if (m & 2) { ao1 += 2 * lda; ao2 += 2 * lda; - + b += 8; } - + if (m & 1) { ao1 += lda; b += 4; } - + } else { #ifdef UNIT data05 = *(ao2 + 0); @@ -272,13 +272,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (i >= 3) { data15 = *(ao4 + 2); } - + b[ 0] = ONE; b[ 1] = data05; b[ 2] = data09; b[ 3] = data13; b += 4; - + if(i >= 2) { b[ 0] = ZERO; b[ 1] = ONE; @@ -286,7 +286,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 3] = data14; b += 4; } - + if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; @@ -310,13 +310,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data11 = *(ao3 + 2); data15 = *(ao4 + 2); } - + b[ 0] = data01; b[ 1] = data05; b[ 2] = data09; b[ 3] = data13; b += 4; - + if(i >= 2) { b[ 0] = ZERO; b[ 1] = data06; @@ -324,7 +324,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 3] = data14; b += 4; } - + if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; @@ -361,7 +361,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data05 = *(ao2 + 0); data06 = *(ao2 + 1); - + b[ 0] = data01; b[ 1] = data05; b[ 2] = data02; @@ -371,7 +371,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 2; b += 4; - } else + } else if (X < posY) { ao1 += 2 * lda; ao2 += 2 * lda; @@ -396,7 +396,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #endif ao1 += 2; ao2 += 2; - + b += 4; } @@ -407,7 +407,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 1); if (i) { - + if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); @@ -417,7 +417,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao1 += 1; ao2 += 1; b += 2; - } else + } else if (X < posY) { ao1 += lda; b += 2; diff --git a/kernel/generic/trmm_lncopy_6.c b/kernel/generic/trmm_lncopy_6.c index 6cd16673a..0dcfb965a 100644 --- a/kernel/generic/trmm_lncopy_6.c +++ b/kernel/generic/trmm_lncopy_6.c @@ -74,22 +74,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + data05 = *(ao2 + 0); data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); - + data09 = *(ao3 + 0); data10 = *(ao3 + 1); data11 = *(ao3 + 2); data12 = *(ao3 + 3); - + data13 = *(ao4 + 0); data14 = *(ao4 + 1); data15 = *(ao4 + 2); data16 = *(ao4 + 3); - + b[ 0] = data01; b[ 1] = data05; b[ 2] = data09; @@ -98,7 +98,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data10; b[ 7] = data14; - + b[ 8] = data03; b[ 9] = data07; b[10] = data11; @@ -107,14 +107,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[13] = data08; b[14] = data12; b[15] = data16; - + ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; b += 16; - } else + } else if (X < posY) { ao1 += 4 * lda; ao2 += 4 * lda; @@ -127,10 +127,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + data07 = *(ao2 + 2); data08 = *(ao2 + 3); - + data12 = *(ao3 + 3); b[ 0] = ONE; @@ -141,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = ONE; b[ 6] = ZERO; b[ 7] = ZERO; - + b[ 8] = data03; b[ 9] = data07; b[10] = ONE; @@ -155,16 +155,16 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); - + data11 = *(ao3 + 2); data12 = *(ao3 + 3); - + data16 = *(ao4 + 3); - + b[ 0] = data01; b[ 1] = ZERO; b[ 2] = ZERO; @@ -173,7 +173,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = ZERO; b[ 7] = ZERO; - + b[ 8] = data03; b[ 9] = data07; b[10] = data11; @@ -197,7 +197,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 3); if (i) { - + if (X > posY) { if (m & 2) { @@ -225,7 +225,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao4 += 2; b += 8; } - + if (m & 1) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); @@ -236,28 +236,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; - + ao1 += 1; ao2 += 1; ao3 += 1; ao4 += 1; b += 4; } - - } else + + } else if (X < posY) { if (m & 2) { ao1 += 2 * lda; ao2 += 2 * lda; - + b += 8; } - + if (m & 1) { ao1 += lda; b += 4; } - + } else { #ifdef UNIT data05 = *(ao2 + 0); @@ -272,13 +272,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (i >= 3) { data15 = *(ao4 + 2); } - + b[ 0] = ONE; b[ 1] = data05; b[ 2] = data09; b[ 3] = data13; b += 4; - + if(i >= 2) { b[ 0] = ZERO; b[ 1] = ONE; @@ -286,7 +286,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 3] = data14; b += 4; } - + if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; @@ -310,13 +310,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data11 = *(ao3 + 2); data15 = *(ao4 + 2); } - + b[ 0] = data01; b[ 1] = data05; b[ 2] = data09; b[ 3] = data13; b += 4; - + if(i >= 2) { b[ 0] = ZERO; b[ 1] = data06; @@ -324,7 +324,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 3] = data14; b += 4; } - + if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; @@ -361,7 +361,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data05 = *(ao2 + 0); data06 = *(ao2 + 1); - + b[ 0] = data01; b[ 1] = data05; b[ 2] = data02; @@ -371,7 +371,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 2; b += 4; - } else + } else if (X < posY) { ao1 += 2 * lda; ao2 += 2 * lda; @@ -396,7 +396,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #endif ao1 += 2; ao2 += 2; - + b += 4; } @@ -407,7 +407,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 1); if (i) { - + if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao2 + 0); @@ -417,7 +417,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao1 += 1; ao2 += 1; b += 2; - } else + } else if (X < posY) { ao1 += lda; b += 2; diff --git a/kernel/generic/trmm_lncopy_8.c b/kernel/generic/trmm_lncopy_8.c index 4a1964bd7..8f5fbce87 100644 --- a/kernel/generic/trmm_lncopy_8.c +++ b/kernel/generic/trmm_lncopy_8.c @@ -102,7 +102,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); - + data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); @@ -111,7 +111,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); - + data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); @@ -120,7 +120,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data30 = *(ao4 + 5); data31 = *(ao4 + 6); data32 = *(ao4 + 7); - + data33 = *(ao5 + 0); data34 = *(ao5 + 1); data35 = *(ao5 + 2); @@ -129,7 +129,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data38 = *(ao5 + 5); data39 = *(ao5 + 6); data40 = *(ao5 + 7); - + data41 = *(ao6 + 0); data42 = *(ao6 + 1); data43 = *(ao6 + 2); @@ -138,7 +138,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data46 = *(ao6 + 5); data47 = *(ao6 + 6); data48 = *(ao6 + 7); - + data49 = *(ao7 + 0); data50 = *(ao7 + 1); data51 = *(ao7 + 2); @@ -147,7 +147,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data54 = *(ao7 + 5); data55 = *(ao7 + 6); data56 = *(ao7 + 7); - + data57 = *(ao8 + 0); data58 = *(ao8 + 1); data59 = *(ao8 + 2); @@ -240,7 +240,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b += 64; - } else + } else if (X < posY) { ao1 += 8 * lda; ao2 += 8 * lda; @@ -250,7 +250,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao6 += 8 * lda; ao7 += 8 * lda; ao8 += 8 * lda; - + b += 64; } else { @@ -264,7 +264,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); - + #ifndef UNIT data10 = *(ao2 + 1); #endif @@ -274,7 +274,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); - + #ifndef UNIT data19 = *(ao3 + 2); #endif @@ -283,7 +283,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); - + #ifndef UNIT data28 = *(ao4 + 3); #endif @@ -291,25 +291,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data30 = *(ao4 + 5); data31 = *(ao4 + 6); data32 = *(ao4 + 7); - + #ifndef UNIT data37 = *(ao5 + 4); #endif data38 = *(ao5 + 5); data39 = *(ao5 + 6); data40 = *(ao5 + 7); - + #ifndef UNIT data46 = *(ao6 + 5); #endif data47 = *(ao6 + 6); data48 = *(ao6 + 7); - + #ifndef UNIT data55 = *(ao7 + 6); #endif data56 = *(ao7 + 7); - + #ifndef UNIT data64 = *(ao8 + 7); #endif @@ -326,7 +326,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; - + b[ 8] = data02; #ifdef UNIT b[ 9] = ONE; @@ -352,7 +352,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[21] = ZERO; b[22] = ZERO; b[23] = ZERO; - + b[24] = data04; b[25] = data12; b[26] = data20; @@ -378,7 +378,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[37] = ZERO; b[38] = ZERO; b[39] = ZERO; - + b[40] = data06; b[41] = data14; b[42] = data22; @@ -417,7 +417,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #else b[63] = data64; #endif - + ao1 += 8; ao2 += 8; ao3 += 8; @@ -426,7 +426,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao6 += 8; ao7 += 8; ao8 += 8; - + b += 64; } @@ -437,7 +437,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 7); if (i) { - + if (X > posY) { if (m & 4) { @@ -445,42 +445,42 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); - + data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); - + data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); - + data33 = *(ao5 + 0); data34 = *(ao5 + 1); data35 = *(ao5 + 2); data36 = *(ao5 + 3); - + data41 = *(ao6 + 0); data42 = *(ao6 + 1); data43 = *(ao6 + 2); data44 = *(ao6 + 3); - + data49 = *(ao7 + 0); data50 = *(ao7 + 1); data51 = *(ao7 + 2); data52 = *(ao7 + 3); - + data57 = *(ao8 + 0); data58 = *(ao8 + 1); data59 = *(ao8 + 2); data60 = *(ao8 + 3); - + b[ 0] = data01; b[ 1] = data09; b[ 2] = data17; @@ -489,7 +489,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data41; b[ 6] = data49; b[ 7] = data57; - + b[ 8] = data02; b[ 9] = data10; b[10] = data18; @@ -498,7 +498,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[13] = data42; b[14] = data50; b[15] = data58; - + b[16] = data03; b[17] = data11; b[18] = data19; @@ -507,7 +507,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[21] = data43; b[22] = data51; b[23] = data59; - + b[24] = data04; b[25] = data12; b[26] = data20; @@ -525,35 +525,35 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao6 += 4; ao7 += 4; ao8 += 4; - + b += 32; } - + if (m & 2) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + data09 = *(ao2 + 0); data10 = *(ao2 + 1); - + data17 = *(ao3 + 0); data18 = *(ao3 + 1); - + data25 = *(ao4 + 0); data26 = *(ao4 + 1); - + data33 = *(ao5 + 0); data34 = *(ao5 + 1); - + data41 = *(ao6 + 0); data42 = *(ao6 + 1); - + data49 = *(ao7 + 0); data50 = *(ao7 + 1); - + data57 = *(ao8 + 0); data58 = *(ao8 + 1); - + b[ 0] = data01; b[ 1] = data09; b[ 2] = data17; @@ -562,7 +562,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data41; b[ 6] = data49; b[ 7] = data57; - + b[ 8] = data02; b[ 9] = data10; b[10] = data18; @@ -571,7 +571,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[13] = data42; b[14] = data50; b[15] = data58; - + ao1 += 2; ao2 += 2; ao3 += 2; @@ -580,10 +580,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao6 += 2; ao7 += 2; ao8 += 2; - + b += 16; } - + if (m & 1) { data01 = *(ao1 + 0); data09 = *(ao2 + 0); @@ -593,7 +593,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data41 = *(ao6 + 0); data49 = *(ao7 + 0); data57 = *(ao8 + 0); - + b[ 0] = data01; b[ 1] = data09; b[ 2] = data17; @@ -602,25 +602,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data41; b[ 6] = data49; b[ 7] = data57; - + b += 8; } - } else + } else if (X < posY) { if (m & 4) { ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; - + b += 32; } - + if (m & 2) { ao1 += 2 * lda; b += 16; } - + if (m & 1) { b += 8; } @@ -659,7 +659,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data23 = *(ao3 + 6); data24 = *(ao3 + 7); } - + if (i >= 4) { #ifndef UNIT data28 = *(ao4 + 3); @@ -707,7 +707,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 6] = ZERO; b[ 7] = ZERO; b += 8; - + if(i >= 2) { b[ 0] = data02; #ifdef UNIT @@ -723,7 +723,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 7] = ZERO; b += 8; } - + if (i >= 3) { b[ 0] = data03; b[ 1] = data11; @@ -739,8 +739,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 7] = ZERO; b += 8; } - - if (i >= 4) { + + if (i >= 4) { b[ 0] = data04; b[ 1] = data12; b[ 2] = data20; @@ -771,7 +771,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 7] = ZERO; b += 8; } - + if (i >= 6) { b[ 0] = data06; b[ 1] = data14; @@ -835,37 +835,37 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); - + data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); - + data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); - + b[ 0] = data01; b[ 1] = data09; b[ 2] = data17; b[ 3] = data25; - + b[ 4] = data02; b[ 5] = data10; b[ 6] = data18; b[ 7] = data26; - + b[ 8] = data03; b[ 9] = data11; b[10] = data19; b[11] = data27; - + b[12] = data04; b[13] = data12; b[14] = data20; @@ -878,7 +878,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b += 16; - } else + } else if (X < posY) { ao1 += 4 * lda; ao2 += 4 * lda; @@ -957,7 +957,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 4; ao3 += 4; ao4 += 4; - + b += 16; } @@ -968,60 +968,60 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 3); if (i) { - + if (X > posY) { if (m & 2) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + data09 = *(ao2 + 0); data10 = *(ao2 + 1); - + data17 = *(ao3 + 0); data18 = *(ao3 + 1); - + data25 = *(ao4 + 0); data26 = *(ao4 + 1); - + b[ 0] = data01; b[ 1] = data09; b[ 2] = data17; b[ 3] = data25; - + b[ 4] = data02; b[ 5] = data10; b[ 6] = data18; b[ 7] = data26; - + ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; - + b += 8; } - + if (m & 1) { data01 = *(ao1 + 0); data09 = *(ao2 + 0); data17 = *(ao3 + 0); data25 = *(ao4 + 0); - + b[ 0] = data01; b[ 1] = data09; b[ 2] = data17; b[ 3] = data25; - + b += 4; } - } else + } else if (X < posY) { if (m & 2) { ao1 += 2 * lda; b += 8; } - + if (m & 1) { b += 4; } @@ -1049,7 +1049,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #endif data20 = *(ao3 + 3); } - + #ifdef UNIT b[ 0] = ONE; #else @@ -1059,7 +1059,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 2] = ZERO; b[ 3] = ZERO; b += 4; - + if(i >= 2) { b[ 0] = data02; #ifdef UNIT @@ -1071,7 +1071,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 3] = ZERO; b += 4; } - + if (i >= 3) { b[ 0] = data03; b[ 1] = data11; @@ -1109,7 +1109,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data09 = *(ao2 + 0); data10 = *(ao2 + 1); - + b[ 0] = data01; b[ 1] = data09; b[ 2] = data02; @@ -1119,7 +1119,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 2; b += 4; - } else + } else if (X < posY) { ao1 += 2 * lda; ao2 += 2 * lda; @@ -1156,15 +1156,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } if (m & 1) { - + if (X > posY) { data01 = *(ao1 + 0); data09 = *(ao2 + 0); - + b[ 0] = data01; b[ 1] = data09; b += 2; - } else + } else if (X < posY) { b += 2; } else { @@ -1201,7 +1201,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 0] = data01; ao1 += 1; b += 1; - } else + } else if (X < posY) { ao1 += lda; b += 1; diff --git a/kernel/generic/trmm_ltcopy_1.c b/kernel/generic/trmm_ltcopy_1.c index ab5e9d8e9..d79f1a78c 100644 --- a/kernel/generic/trmm_ltcopy_1.c +++ b/kernel/generic/trmm_ltcopy_1.c @@ -58,11 +58,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = m; if (i > 0) { do { - + if (X > posY) { ao1 += 1; b += 1; - } else + } else if (X < posY) { data01 = *(ao1 + 0); b[ 0] = data01; diff --git a/kernel/generic/trmm_ltcopy_16.c b/kernel/generic/trmm_ltcopy_16.c index 0598de896..b8469d00a 100644 --- a/kernel/generic/trmm_ltcopy_16.c +++ b/kernel/generic/trmm_ltcopy_16.c @@ -110,11 +110,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a15 += 16; a16 += 16; b += 256; - } else + } else if (X < posY) { for (ii = 0; ii < 16; ii++){ - + b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); @@ -123,7 +123,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); - + b[ 8] = *(a01 + 8); b[ 9] = *(a01 + 9); b[ 10] = *(a01 + 10); @@ -132,7 +132,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 13] = *(a01 + 13); b[ 14] = *(a01 + 14); b[ 15] = *(a01 + 15); - + a01 += lda; b += 16; } @@ -152,7 +152,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a14 += 16 * lda; a15 += 16 * lda; a16 += 16 * lda; - + } else { #ifdef UNIT b[ 0] = ONE; @@ -174,7 +174,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 13] = *(a01 + 13); b[ 14] = *(a01 + 14); b[ 15] = *(a01 + 15); - + b[ 16] = ZERO; #ifdef UNIT b[ 17] = ONE; @@ -506,7 +506,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a14 += 16; a15 += 16; a16 += 16; - + b += 256; } @@ -535,11 +535,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a15 += i; a16 += i; b += 16 * i; - } else + } else if (X < posY) { - + for (ii = 0; ii < i; ii++){ - + b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); @@ -548,7 +548,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); - + b[ 8] = *(a01 + 8); b[ 9] = *(a01 + 9); b[ 10] = *(a01 + 10); @@ -557,7 +557,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 13] = *(a01 + 13); b[ 14] = *(a01 + 14); b[ 15] = *(a01 + 15); - + a01 += lda; a02 += lda; a03 += lda; @@ -598,7 +598,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 14] = *(a01 + 14); b[ 15] = *(a01 + 15); b += 16; - + if (i >= 2) { b[ 0] = ZERO; #ifdef UNIT @@ -622,7 +622,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[15] = *(a02 + 15); b += 16; } - + if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; @@ -965,7 +965,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a07 = a + posX + (posY + 6) * lda; a08 = a + posX + (posY + 7) * lda; } - + i = (m >> 3); if (i > 0) { do { @@ -979,9 +979,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a07 += 8; a08 += 8; b += 64; - } else + } else if (X < posY) { - + for (ii = 0; ii < 8; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); @@ -1042,7 +1042,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 21] = *(a03 + 5); b[ 22] = *(a03 + 6); b[ 23] = *(a03 + 7); - + b[ 24] = ZERO; b[ 25] = ZERO; b[ 26] = ZERO; @@ -1081,7 +1081,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #endif b[ 46] = *(a06 + 6); b[ 47] = *(a06 + 7); - + b[ 48] = ZERO; b[ 49] = ZERO; b[ 50] = ZERO; @@ -1094,7 +1094,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 54] = *(a07 + 6); #endif b[ 55] = *(a07 + 7); - + b[ 56] = ZERO; b[ 57] = ZERO; b[ 58] = ZERO; @@ -1117,7 +1117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a07 += 8; a08 += 8; b += 64; - + } X += 8; @@ -1137,11 +1137,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a07 += i; a08 += i; b += 8 * i; - } else + } else if (X < posY) { - + for (ii = 0; ii < i; ii++){ - + b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); @@ -1150,7 +1150,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); - + a01 += lda; a02 += lda; a03 += lda; @@ -1175,7 +1175,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); b += 8; - + if (i >= 2) { b[ 0] = ZERO; #ifdef UNIT @@ -1191,7 +1191,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 7] = *(a02 + 7); b += 8; } - + if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; @@ -1290,7 +1290,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a03 = a + posX + (posY + 2) * lda; a04 = a + posX + (posY + 3) * lda; } - + i = (m >> 2); if (i > 0) { do { @@ -1300,9 +1300,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a03 += 4; a04 += 4; b += 16; - } else + } else if (X < posY) { - + for (ii = 0; ii < 4; ii++){ b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); @@ -1343,7 +1343,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 10] = *(a03 + 2); #endif b[ 11] = *(a03 + 3); - + b[ 12] = ZERO; b[ 13] = ZERO; b[ 14] = ZERO; @@ -1359,12 +1359,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a04 += 4; b += 16; } - + X += 4; i --; } while (i > 0); } - + i = (m & 3); if (i > 0) { if (X > posY) { @@ -1373,11 +1373,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a03 += i; a04 += i; b += 4 * i; - } else + } else if (X < posY) { - + for (ii = 0; ii < i; ii++){ - + b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); @@ -1390,7 +1390,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b += 4; } } else { - + #ifdef UNIT b[ 0] = ONE; #else @@ -1400,7 +1400,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); b += 4; - + if (i >= 2) { b[ 0] = ZERO; #ifdef UNIT @@ -1412,7 +1412,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 3] = *(a02 + 3); b += 4; } - + if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; @@ -1439,7 +1439,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a01 = a + posX + (posY + 0) * lda; a02 = a + posX + (posY + 1) * lda; } - + i = (m >> 1); if (i > 0) { do { @@ -1447,7 +1447,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a01 += 2; a02 += 2; b += 4; - } else + } else if (X < posY) { b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); @@ -1475,18 +1475,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a02 += 2; b += 4; } - + X += 2; i --; } while (i > 0); } - + if (m & 1) { if (X > posY) { a01 ++; a02 ++; b += 2; - } else + } else if (X < posY) { b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); @@ -1514,15 +1514,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } else { a01 = a + posX + (posY + 0) * lda; } - + i = m; if (i > 0) { do { - + if (X > posY) { b ++; a01 ++; - } else + } else if (X < posY) { b[ 0] = *(a01 + 0); a01 += lda; diff --git a/kernel/generic/trmm_ltcopy_2.c b/kernel/generic/trmm_ltcopy_2.c index 098e16f96..e9ad45fa0 100644 --- a/kernel/generic/trmm_ltcopy_2.c +++ b/kernel/generic/trmm_ltcopy_2.c @@ -68,13 +68,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao1 += 2; ao2 += 2; b += 4; - } else + } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -114,16 +114,16 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } if (m & 1) { - + if (X > posY) { ao1 += 1; ao2 += 1; b += 2; - } else + } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + b[ 0] = data01; b[ 1] = data02; ao1 += lda; @@ -137,7 +137,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + b[ 0] = data01; b[ 1] = data02; #endif @@ -164,11 +164,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = m; if (i > 0) { do { - + if (X > posY) { ao1 += 1; b += 1; - } else + } else if (X < posY) { data01 = *(ao1 + 0); b[ 0] = data01; diff --git a/kernel/generic/trmm_ltcopy_4.c b/kernel/generic/trmm_ltcopy_4.c index 69a233be6..66a7325bb 100644 --- a/kernel/generic/trmm_ltcopy_4.c +++ b/kernel/generic/trmm_ltcopy_4.c @@ -76,28 +76,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao4 += 4; b += 16; - } else + } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + data05 = *(ao2 + 0); data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); - + data09 = *(ao3 + 0); data10 = *(ao3 + 1); data11 = *(ao3 + 2); data12 = *(ao3 + 3); - + data13 = *(ao4 + 0); data14 = *(ao4 + 1); data15 = *(ao4 + 2); data16 = *(ao4 + 3); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -106,7 +106,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; - + b[ 8] = data09; b[ 9] = data10; b[10] = data11; @@ -128,12 +128,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + data07 = *(ao2 + 2); data08 = *(ao2 + 3); - + data12 = *(ao3 + 3); - + b[ 0] = ONE; b[ 1] = data02; b[ 2] = data03; @@ -143,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = ONE; b[ 6] = data07; b[ 7] = data08; - + b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ONE; @@ -158,14 +158,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); - + data11 = *(ao3 + 2); data12 = *(ao3 + 3); - + data16 = *(ao4 + 3); b[ 0] = data01; @@ -176,7 +176,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; - + b[ 8] = ZERO; b[ 9] = ZERO; b[10] = data11; @@ -200,7 +200,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 3); if (i) { - + if (X > posY) { if (m & 2) { @@ -210,7 +210,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao4 += 2; b += 8; } - + if (m & 1) { ao1 += 1; ao2 += 1; @@ -218,8 +218,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao4 += 1; b += 4; } - - } else + + } else if (X < posY) { if (m & 2) { data01 = *(ao1 + 0); @@ -230,7 +230,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -239,28 +239,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; - + ao1 += 2 * lda; ao2 += 2 * lda; - + b += 8; } - + if (m & 1) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; - + ao1 += lda; b += 4; } - + } else { #ifdef UNIT @@ -276,13 +276,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (i >= 3) { data12 = *(ao3 + 3); } - + b[ 0] = ONE; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; - + if(i >= 2) { b[ 0] = ZERO; b[ 1] = ONE; @@ -290,7 +290,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 3] = data08; b += 4; } - + if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; @@ -314,13 +314,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data11 = *(ao3 + 2); data12 = *(ao3 + 3); } - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; - + if(i >= 2) { b[ 0] = ZERO; b[ 1] = data06; @@ -328,7 +328,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 3] = data08; b += 4; } - + if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; @@ -365,7 +365,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 2; b += 4; - } else + } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); @@ -410,17 +410,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 1); if (i) { - + if (X > posY) { ao1 += 1; ao2 += 1; - + b += 2; - } else + } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + b[ 0] = data01; b[ 1] = data02; ao1 += lda; diff --git a/kernel/generic/trmm_ltcopy_6.c b/kernel/generic/trmm_ltcopy_6.c index 69a233be6..66a7325bb 100644 --- a/kernel/generic/trmm_ltcopy_6.c +++ b/kernel/generic/trmm_ltcopy_6.c @@ -76,28 +76,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao4 += 4; b += 16; - } else + } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + data05 = *(ao2 + 0); data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); - + data09 = *(ao3 + 0); data10 = *(ao3 + 1); data11 = *(ao3 + 2); data12 = *(ao3 + 3); - + data13 = *(ao4 + 0); data14 = *(ao4 + 1); data15 = *(ao4 + 2); data16 = *(ao4 + 3); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -106,7 +106,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; - + b[ 8] = data09; b[ 9] = data10; b[10] = data11; @@ -128,12 +128,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + data07 = *(ao2 + 2); data08 = *(ao2 + 3); - + data12 = *(ao3 + 3); - + b[ 0] = ONE; b[ 1] = data02; b[ 2] = data03; @@ -143,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = ONE; b[ 6] = data07; b[ 7] = data08; - + b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ONE; @@ -158,14 +158,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); - + data11 = *(ao3 + 2); data12 = *(ao3 + 3); - + data16 = *(ao4 + 3); b[ 0] = data01; @@ -176,7 +176,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; - + b[ 8] = ZERO; b[ 9] = ZERO; b[10] = data11; @@ -200,7 +200,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 3); if (i) { - + if (X > posY) { if (m & 2) { @@ -210,7 +210,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao4 += 2; b += 8; } - + if (m & 1) { ao1 += 1; ao2 += 1; @@ -218,8 +218,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao4 += 1; b += 4; } - - } else + + } else if (X < posY) { if (m & 2) { data01 = *(ao1 + 0); @@ -230,7 +230,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -239,28 +239,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; - + ao1 += 2 * lda; ao2 += 2 * lda; - + b += 8; } - + if (m & 1) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; - + ao1 += lda; b += 4; } - + } else { #ifdef UNIT @@ -276,13 +276,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (i >= 3) { data12 = *(ao3 + 3); } - + b[ 0] = ONE; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; - + if(i >= 2) { b[ 0] = ZERO; b[ 1] = ONE; @@ -290,7 +290,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 3] = data08; b += 4; } - + if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; @@ -314,13 +314,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data11 = *(ao3 + 2); data12 = *(ao3 + 3); } - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; b += 4; - + if(i >= 2) { b[ 0] = ZERO; b[ 1] = data06; @@ -328,7 +328,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 3] = data08; b += 4; } - + if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; @@ -365,7 +365,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 2; b += 4; - } else + } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); @@ -410,17 +410,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 1); if (i) { - + if (X > posY) { ao1 += 1; ao2 += 1; - + b += 2; - } else + } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + b[ 0] = data01; b[ 1] = data02; ao1 += lda; diff --git a/kernel/generic/trmm_ltcopy_8.c b/kernel/generic/trmm_ltcopy_8.c index 64954da40..101272829 100644 --- a/kernel/generic/trmm_ltcopy_8.c +++ b/kernel/generic/trmm_ltcopy_8.c @@ -96,7 +96,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b += 64; - } else + } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); @@ -106,7 +106,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); - + data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); @@ -124,7 +124,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); - + data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); @@ -133,7 +133,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data30 = *(ao4 + 5); data31 = *(ao4 + 6); data32 = *(ao4 + 7); - + data33 = *(ao5 + 0); data34 = *(ao5 + 1); data35 = *(ao5 + 2); @@ -142,7 +142,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data38 = *(ao5 + 5); data39 = *(ao5 + 6); data40 = *(ao5 + 7); - + data41 = *(ao6 + 0); data42 = *(ao6 + 1); data43 = *(ao6 + 2); @@ -160,7 +160,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data54 = *(ao7 + 5); data55 = *(ao7 + 6); data56 = *(ao7 + 7); - + data57 = *(ao8 + 0); data58 = *(ao8 + 1); data59 = *(ao8 + 2); @@ -169,7 +169,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data62 = *(ao8 + 5); data63 = *(ao8 + 6); data64 = *(ao8 + 7); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -178,7 +178,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; - + b[ 8] = data09; b[ 9] = data10; b[10] = data11; @@ -196,7 +196,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[21] = data22; b[22] = data23; b[23] = data24; - + b[24] = data25; b[25] = data26; b[26] = data27; @@ -214,7 +214,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[37] = data38; b[38] = data39; b[39] = data40; - + b[40] = data41; b[41] = data42; b[42] = data43; @@ -241,7 +241,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[61] = data62; b[62] = data63; b[63] = data64; - + ao1 += 8 * lda; ao2 += 8 * lda; ao3 += 8 * lda; @@ -250,7 +250,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao6 += 8 * lda; ao7 += 8 * lda; ao8 += 8 * lda; - + b += 64; } else { @@ -265,7 +265,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); - + #ifndef UNIT data10 = *(ao2 + 1); #endif @@ -284,7 +284,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); - + #ifndef UNIT data28 = *(ao4 + 3); #endif @@ -292,14 +292,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data30 = *(ao4 + 5); data31 = *(ao4 + 6); data32 = *(ao4 + 7); - + #ifndef UNIT data37 = *(ao5 + 4); #endif data38 = *(ao5 + 5); data39 = *(ao5 + 6); data40 = *(ao5 + 7); - + #ifndef UNIT data46 = *(ao6 + 5); #endif @@ -310,7 +310,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data55 = *(ao7 + 6); #endif data56 = *(ao7 + 7); - + #ifndef UNIT data64 = *(ao8 + 7); #endif @@ -328,7 +328,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; - + b[ 8] = ZERO; #ifdef UNIT b[ 9] = ONE; @@ -354,7 +354,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[21] = data22; b[22] = data23; b[23] = data24; - + b[24] = ZERO; b[25] = ZERO; b[26] = ZERO; @@ -380,7 +380,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[37] = data38; b[38] = data39; b[39] = data40; - + b[40] = ZERO; b[41] = ZERO; b[42] = ZERO; @@ -419,7 +419,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #else b[63] = data64; #endif - + ao1 += 8; ao2 += 8; ao3 += 8; @@ -428,7 +428,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao6 += 8; ao7 += 8; ao8 += 8; - + b += 64; } @@ -439,7 +439,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 7); if (i) { - + if (X > posY) { if (m & 4) { @@ -451,10 +451,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao6 += 4; ao7 += 4; ao8 += 4; - + b += 32; } - + if (m & 2) { ao1 += 2; ao2 += 2; @@ -464,14 +464,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao6 += 2; ao7 += 2; ao8 += 2; - + b += 16; } - + if (m & 1) { b += 8; } - } else + } else if (X < posY) { if (m & 4) { data01 = *(ao1 + 0); @@ -482,7 +482,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); - + data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); @@ -491,7 +491,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); - + data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); @@ -500,7 +500,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); - + data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); @@ -509,7 +509,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data30 = *(ao4 + 5); data31 = *(ao4 + 6); data32 = *(ao4 + 7); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -518,7 +518,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; - + b[ 8] = data09; b[ 9] = data10; b[10] = data11; @@ -527,7 +527,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[13] = data14; b[14] = data15; b[15] = data16; - + b[16] = data17; b[17] = data18; b[18] = data19; @@ -536,7 +536,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[21] = data22; b[22] = data23; b[23] = data24; - + b[24] = data25; b[25] = data26; b[26] = data27; @@ -545,15 +545,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[29] = data30; b[30] = data31; b[31] = data32; - + ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; - + b += 32; } - + if (m & 2) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); @@ -563,7 +563,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); - + data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); @@ -572,7 +572,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -581,7 +581,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; - + b[ 8] = data09; b[ 9] = data10; b[10] = data11; @@ -590,11 +590,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[13] = data14; b[14] = data15; b[15] = data16; - + ao1 += 2 * lda; b += 16; } - + if (m & 1) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); @@ -613,7 +613,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; - + b += 8; } } else { @@ -650,7 +650,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data23 = *(ao3 + 6); data24 = *(ao3 + 7); } - + if (i >= 4) { #ifndef UNIT data28 = *(ao4 + 3); @@ -698,7 +698,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 6] = data07; b[ 7] = data08; b += 8; - + if(i >= 2) { b[ 0] = ZERO; #ifdef UNIT @@ -714,7 +714,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 7] = data16; b += 8; } - + if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; @@ -730,8 +730,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 7] = data24; b += 8; } - - if (i >= 4) { + + if (i >= 4) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; @@ -762,7 +762,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 7] = data40; b += 8; } - + if (i >= 6) { b[ 0] = ZERO; b[ 1] = ZERO; @@ -829,7 +829,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b += 16; - } else + } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); @@ -855,7 +855,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -949,7 +949,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 4; ao3 += 4; ao4 += 4; - + b += 16; } @@ -960,7 +960,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 3); if (i) { - + if (X > posY) { if (m & 2) { @@ -968,14 +968,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 2; ao3 += 2; ao4 += 2; - + b += 8; } - + if (m & 1) { b += 4; } - } else + } else if (X < posY) { if (m & 2) { data01 = *(ao1 + 0); @@ -987,7 +987,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -996,11 +996,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data10; b[ 6] = data11; b[ 7] = data12; - + ao1 += 2 * lda; b += 8; } - + if (m & 1) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); @@ -1011,7 +1011,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; - + b += 4; } } else { @@ -1038,7 +1038,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #endif data20 = *(ao3 + 3); } - + #ifdef UNIT b[ 0] = ONE; #else @@ -1048,7 +1048,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 2] = data03; b[ 3] = data04; b += 4; - + if(i >= 2) { b[ 0] = ZERO; #ifdef UNIT @@ -1060,7 +1060,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 3] = data12; b += 4; } - + if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; @@ -1097,7 +1097,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 2; b += 4; - } else + } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); @@ -1106,7 +1106,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao1 += 2 * lda; ao2 += 2 * lda; - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; @@ -1147,10 +1147,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } if (m & 1) { - + if (X > posY) { b += 2; - } else + } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); @@ -1190,11 +1190,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (X > posY) { ao1 += 1; b += 1; - } else + } else if (X < posY) { data01 = *(ao1 + 0); ao1 += lda; - + b[ 0] = data01; b += 1; diff --git a/kernel/generic/trmm_uncopy_1.c b/kernel/generic/trmm_uncopy_1.c index 6e75c2fa5..f77c310c8 100644 --- a/kernel/generic/trmm_uncopy_1.c +++ b/kernel/generic/trmm_uncopy_1.c @@ -48,17 +48,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (n > 0) { X = posX; - + if (posX <= posY) { ao1 = a + posX + (posY + 0) * lda; } else { ao1 = a + posY + (posX + 0) * lda; } - + i = m; if (m > 0) { do { - + if (X < posY) { data01 = *(ao1 + 0); b[ 0] = data01; @@ -78,7 +78,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b += 1; ao1 += lda; } - + X += 1; i --; } while (i > 0); diff --git a/kernel/generic/trmm_uncopy_16.c b/kernel/generic/trmm_uncopy_16.c index 6325a26a0..19b2fdd68 100644 --- a/kernel/generic/trmm_uncopy_16.c +++ b/kernel/generic/trmm_uncopy_16.c @@ -88,13 +88,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a15 = a + posY + (posX + 14) * lda; a16 = a + posY + (posX + 15) * lda; } - + i = (m >> 4); if (i > 0) { do { if (X < posY) { for (ii = 0; ii < 16; ii++){ - + b[ 0] = *(a01 + 0); b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); @@ -103,7 +103,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(a06 + 0); b[ 6] = *(a07 + 0); b[ 7] = *(a08 + 0); - + b[ 8] = *(a09 + 0); b[ 9] = *(a10 + 0); b[ 10] = *(a11 + 0); @@ -112,7 +112,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 13] = *(a14 + 0); b[ 14] = *(a15 + 0); b[ 15] = *(a16 + 0); - + a01 ++; a02 ++; a03 ++; @@ -131,7 +131,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a16 ++; b += 16; } - } else + } else if (X > posY) { a01 += 16 * lda; a02 += 16 * lda; @@ -171,7 +171,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 13] = *(a14 + 0); b[ 14] = *(a15 + 0); b[ 15] = *(a16 + 0); - + b[ 16] = ZERO; #ifdef UNIT b[ 17] = ONE; @@ -503,7 +503,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a14 += 16 * lda; a15 += 16 * lda; a16 += 16 * lda; - + b += 256; } @@ -514,10 +514,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 15); if (i) { - + if (X < posY) { for (ii = 0; ii < i; ii++){ - + b[ 0] = *(a01 + 0); b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); @@ -526,7 +526,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(a06 + 0); b[ 6] = *(a07 + 0); b[ 7] = *(a08 + 0); - + b[ 8] = *(a09 + 0); b[ 9] = *(a10 + 0); b[ 10] = *(a11 + 0); @@ -535,7 +535,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 13] = *(a14 + 0); b[ 14] = *(a15 + 0); b[ 15] = *(a16 + 0); - + a01 ++; a02 ++; a03 ++; @@ -554,7 +554,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a16 ++; b += 16; } - } else + } else if (X > posY) { a01 += i * lda; a02 += i * lda; @@ -595,7 +595,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 14] = *(a15 + 0); b[ 15] = *(a16 + 0); b += 16; - + if (i >= 2) { b[ 0] = ZERO; #ifdef UNIT @@ -739,7 +739,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 15] = *(a16 + 6); b += 16; } - + if (i >= 8) { b[ 0] = ZERO; b[ 1] = ZERO; @@ -968,7 +968,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON do { if (X < posY) { for (ii = 0; ii < 8; ii++){ - + b[ 0] = *(a01 + 0); b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); @@ -977,7 +977,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(a06 + 0); b[ 6] = *(a07 + 0); b[ 7] = *(a08 + 0); - + a01 ++; a02 ++; a03 ++; @@ -988,7 +988,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a08 ++; b += 8; } - } else + } else if (X > posY) { a01 += 8 * lda; a02 += 8 * lda; @@ -1012,7 +1012,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(a06 + 0); b[ 6] = *(a07 + 0); b[ 7] = *(a08 + 0); - + b[ 8] = ZERO; #ifdef UNIT b[ 9] = ONE; @@ -1122,10 +1122,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 7); if (i) { - + if (X < posY) { for (ii = 0; ii < i; ii++){ - + b[ 0] = *(a01 + 0); b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); @@ -1134,7 +1134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(a06 + 0); b[ 6] = *(a07 + 0); b[ 7] = *(a08 + 0); - + a01 ++; a02 ++; a03 ++; @@ -1145,7 +1145,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a08 ++; b += 8; } - } else + } else if (X > posY) { a01 += i * lda; a02 += i * lda; @@ -1170,7 +1170,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 6] = *(a07 + 0); b[ 7] = *(a08 + 0); b += 8; - + if (i >= 2) { b[ 0] = ZERO; #ifdef UNIT @@ -1292,7 +1292,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON do { if (X < posY) { for (ii = 0; ii < 4; ii++){ - + b[ 0] = *(a01 + 0); b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); @@ -1304,7 +1304,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a04 ++; b += 4; } - } else + } else if (X > posY) { a01 += 4 * lda; a02 += 4 * lda; @@ -1320,7 +1320,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); b[ 3] = *(a04 + 0); - + b[ 4] = ZERO; #ifdef UNIT b[ 5] = ONE; @@ -1362,22 +1362,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 3); if (i) { - + if (X < posY) { for (ii = 0; ii < i; ii++){ - + b[ 0] = *(a01 + 0); b[ 1] = *(a02 + 0); b[ 2] = *(a03 + 0); b[ 3] = *(a04 + 0); - + a01 ++; a02 ++; a03 ++; a04 ++; b += 4; } - } else + } else if (X > posY) { a01 += i * lda; a02 += i * lda; @@ -1394,7 +1394,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 2] = *(a03 + 0); b[ 3] = *(a04 + 0); b += 4; - + if (i >= 2) { b[ 0] = ZERO; #ifdef UNIT @@ -1443,11 +1443,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 1] = *(a02 + 0); b[ 2] = *(a01 + 1); b[ 3] = *(a02 + 1); - + a01 += 2; a02 += 2; b += 4; - } else + } else if (X > posY) { a01 += 2 * lda; a02 += 2 * lda; @@ -1459,7 +1459,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 0] = *(a01 + 0); #endif b[ 1] = *(a02 + 0); - + b[ 2] = ZERO; #ifdef UNIT b[ 3] = ONE; @@ -1478,15 +1478,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } if (m & 1) { - + if (X < posY) { b[ 0] = *(a01 + 0); b[ 1] = *(a02 + 0); - + a01 ++; a02 ++; b += 2; - } else + } else if (X > posY) { a01 += lda; a02 += lda; @@ -1520,7 +1520,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 0] = *(a01 + 0); a01 += 1; b += 1; - } else + } else if (X > posY) { a01 += lda; b += 1; diff --git a/kernel/generic/trmm_uncopy_2.c b/kernel/generic/trmm_uncopy_2.c index 1b6d2356a..61303a2ba 100644 --- a/kernel/generic/trmm_uncopy_2.c +++ b/kernel/generic/trmm_uncopy_2.c @@ -69,7 +69,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); - + b[ 0] = data01; b[ 1] = data03; b[ 2] = data02; @@ -79,7 +79,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 2; b += 4; - } else + } else if (X > posY) { ao1 += 2 * lda; ao2 += 2 * lda; @@ -114,18 +114,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } if (m & 1) { - + if (X < posY) { data01 = *(ao1 + 0); data03 = *(ao2 + 0); - + b[ 0] = data01; b[ 1] = data03; - + ao1 += 1; ao2 += 1; b += 2; - } else + } else if (X > posY) { ao1 += lda; b += 2; @@ -138,7 +138,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #else data01 = *(ao1 + 0); data03 = *(ao2 + 0); - + b[ 0] = data01; b[ 1] = data03; #endif @@ -154,17 +154,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 1){ X = posX; - + if (posX <= posY) { ao1 = a + posX + (posY + 0) * lda; } else { ao1 = a + posY + (posX + 0) * lda; } - + i = m; if (m > 0) { do { - + if (X < posY) { data01 = *(ao1 + 0); b[ 0] = data01; @@ -184,7 +184,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b += 1; ao1 += lda; } - + X += 1; i --; } while (i > 0); diff --git a/kernel/generic/trmm_uncopy_4.c b/kernel/generic/trmm_uncopy_4.c index 4ff694839..0218a0e31 100644 --- a/kernel/generic/trmm_uncopy_4.c +++ b/kernel/generic/trmm_uncopy_4.c @@ -74,22 +74,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + data05 = *(ao2 + 0); data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); - + data09 = *(ao3 + 0); data10 = *(ao3 + 1); data11 = *(ao3 + 2); data12 = *(ao3 + 3); - + data13 = *(ao4 + 0); data14 = *(ao4 + 1); data15 = *(ao4 + 2); data16 = *(ao4 + 3); - + b[ 0] = data01; b[ 1] = data05; b[ 2] = data09; @@ -98,7 +98,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data10; b[ 7] = data14; - + b[ 8] = data03; b[ 9] = data07; b[10] = data11; @@ -107,13 +107,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[13] = data08; b[14] = data12; b[15] = data16; - + ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; b += 16; - } else + } else if (X > posY) { ao1 += 4 * lda; ao2 += 4 * lda; @@ -124,14 +124,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } else { #ifdef UNIT data05 = *(ao2 + 0); - + data09 = *(ao3 + 0); data10 = *(ao3 + 1); - + data13 = *(ao4 + 0); data14 = *(ao4 + 1); data15 = *(ao4 + 2); - + b[ 0] = ONE; b[ 1] = data05; b[ 2] = data09; @@ -141,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = ONE; b[ 6] = data10; b[ 7] = data14; - + b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ONE; @@ -153,19 +153,19 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[15] = ONE; #else data01 = *(ao1 + 0); - + data05 = *(ao2 + 0); data06 = *(ao2 + 1); - + data09 = *(ao3 + 0); data10 = *(ao3 + 1); data11 = *(ao3 + 2); - + data13 = *(ao4 + 0); data14 = *(ao4 + 1); data15 = *(ao4 + 2); data16 = *(ao4 + 3); - + b[ 0] = data01; b[ 1] = data05; b[ 2] = data09; @@ -175,7 +175,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data10; b[ 7] = data14; - + b[ 8] = ZERO; b[ 9] = ZERO; b[10] = data11; @@ -190,7 +190,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 4; ao3 += 4; ao4 += 4; - + b += 16; } @@ -201,7 +201,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 3); if (i) { - + if (X < posY) { if (m & 2) { @@ -222,14 +222,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data04; b[ 6] = data06; b[ 7] = data08; - + ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; b += 8; } - + if (m & 1) { data01 = *(ao1 + 0); data03 = *(ao2 + 0); @@ -247,20 +247,20 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao4 += 1; b += 4; } - - } else + + } else if (X > posY) { if (m & 2) { ao1 += 2 * lda; ao2 += 2 * lda; b += 8; } - + if (m & 1) { ao1 += lda; b += 4; } - + } else { #ifdef UNIT data05 = *(ao2 + 0); @@ -275,13 +275,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (i >= 3) { data15 = *(ao4 + 2); } - + b[ 0] = ONE; b[ 1] = data05; b[ 2] = data09; b[ 3] = data13; b += 4; - + if(i >= 2) { b[ 0] = ZERO; b[ 1] = ONE; @@ -289,7 +289,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 3] = data14; b += 4; } - + if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; @@ -313,13 +313,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data11 = *(ao3 + 2); data15 = *(ao4 + 2); } - + b[ 0] = data01; b[ 1] = data05; b[ 2] = data09; b[ 3] = data13; b += 4; - + if(i >= 2) { b[ 0] = ZERO; b[ 1] = data06; @@ -327,7 +327,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 3] = data14; b += 4; } - + if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; @@ -363,17 +363,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data05 = *(ao2 + 0); data06 = *(ao2 + 1); - + b[ 0] = data01; b[ 1] = data05; b[ 2] = data02; b[ 3] = data06; - + ao1 += 2; ao2 += 2; b += 4; - } else + } else if (X > posY) { ao1 += 2 * lda; ao2 += 2 * lda; @@ -400,7 +400,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao1 += 2 * lda; ao2 += 2 * lda; - + b += 4; } @@ -411,7 +411,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 1); if (i) { - + if (X < posY) { data01 = *(ao1 + 0); data05 = *(ao2 + 0); @@ -421,7 +421,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao1 += 1; ao2 += 1; b += 2; - } else + } else if (X > posY) { ao1 += lda; ao2 += lda; @@ -478,7 +478,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao1 += lda; b += 1; } - + X += 1; i --; } while (i > 0); diff --git a/kernel/generic/trmm_uncopy_6.c b/kernel/generic/trmm_uncopy_6.c index 70945a246..4878f3f53 100644 --- a/kernel/generic/trmm_uncopy_6.c +++ b/kernel/generic/trmm_uncopy_6.c @@ -170,7 +170,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao5 += 6; ao6 += 6; b += 36; - } else + } else if (X > posY) { b[ 0] = ZERO; b[ 1] = ZERO; @@ -287,7 +287,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[33] = ZERO; b[34] = ZERO; b[35] = ONE; -#else +#else b[ 0] = data01; b[ 1] = data07; b[ 2] = data13; @@ -390,7 +390,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao3 += 4; ao4 += 4; b += 16; - } else + } else if (X > posY) { b[ 0] = ZERO; b[ 1] = ZERO; @@ -544,7 +544,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b += 4; } - } else + } else if (X > posY) { if (m & 2) { ao1 += 2 * lda; @@ -669,7 +669,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 2; b += 4; - } else + } else if (X > posY) { ao1 += 2 * lda; ao2 += 2 * lda; @@ -717,7 +717,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao1 += 1; ao2 += 1; b += 2; - } else + } else if (X > posY) { ao1 += lda; ao2 += lda; diff --git a/kernel/generic/trmm_uncopy_8.c b/kernel/generic/trmm_uncopy_8.c index 4e23ffc69..ecfefd041 100644 --- a/kernel/generic/trmm_uncopy_8.c +++ b/kernel/generic/trmm_uncopy_8.c @@ -80,7 +80,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao7 = a + posY + (posX + 6) * lda; ao8 = a + posY + (posX + 7) * lda; } - + i = (m >> 3); if (i > 0) { do { @@ -93,7 +93,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); - + data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); @@ -111,7 +111,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); - + data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); @@ -120,7 +120,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data30 = *(ao4 + 5); data31 = *(ao4 + 6); data32 = *(ao4 + 7); - + data33 = *(ao5 + 0); data34 = *(ao5 + 1); data35 = *(ao5 + 2); @@ -129,7 +129,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data38 = *(ao5 + 5); data39 = *(ao5 + 6); data40 = *(ao5 + 7); - + data41 = *(ao6 + 0); data42 = *(ao6 + 1); data43 = *(ao6 + 2); @@ -147,7 +147,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data54 = *(ao7 + 5); data55 = *(ao7 + 6); data56 = *(ao7 + 7); - + data57 = *(ao8 + 0); data58 = *(ao8 + 1); data59 = *(ao8 + 2); @@ -156,7 +156,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data62 = *(ao8 + 5); data63 = *(ao8 + 6); data64 = *(ao8 + 7); - + b[ 0] = data01; b[ 1] = data09; b[ 2] = data17; @@ -240,7 +240,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b += 64; - } else + } else if (X > posY) { ao1 += 8 * lda; ao2 += 8 * lda; @@ -250,7 +250,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao6 += 8 * lda; ao7 += 8 * lda; ao8 += 8 * lda; - + b += 64; } else { @@ -258,12 +258,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #ifndef UNIT data01 = *(ao1 + 0); #endif - + data09 = *(ao2 + 0); #ifndef UNIT data10 = *(ao2 + 1); #endif - + data17 = *(ao3 + 0); data18 = *(ao3 + 1); #ifndef UNIT @@ -276,7 +276,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #ifndef UNIT data28 = *(ao4 + 3); #endif - + data33 = *(ao5 + 0); data34 = *(ao5 + 1); data35 = *(ao5 + 2); @@ -284,7 +284,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #ifndef UNIT data37 = *(ao5 + 4); #endif - + data41 = *(ao6 + 0); data42 = *(ao6 + 1); data43 = *(ao6 + 2); @@ -293,7 +293,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #ifndef UNIT data46 = *(ao6 + 5); #endif - + data49 = *(ao7 + 0); data50 = *(ao7 + 1); data51 = *(ao7 + 2); @@ -303,7 +303,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #ifndef UNIT data55 = *(ao7 + 6); #endif - + data57 = *(ao8 + 0); data58 = *(ao8 + 1); data59 = *(ao8 + 2); @@ -314,7 +314,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #ifndef UNIT data64 = *(ao8 + 7); #endif - + #ifdef UNIT b[ 0] = ONE; @@ -328,7 +328,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data41; b[ 6] = data49; b[ 7] = data57; - + b[ 8] = ZERO; #ifdef UNIT b[ 9] = ONE; @@ -354,7 +354,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[21] = data43; b[22] = data51; b[23] = data59; - + b[24] = ZERO; b[25] = ZERO; b[26] = ZERO; @@ -380,7 +380,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[37] = data45; b[38] = data53; b[39] = data61; - + b[40] = ZERO; b[41] = ZERO; b[42] = ZERO; @@ -419,7 +419,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #else b[63] = data64; #endif - + ao1 += 8 * lda; ao2 += 8 * lda; ao3 += 8 * lda; @@ -428,7 +428,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao6 += 8 * lda; ao7 += 8 * lda; ao8 += 8 * lda; - + b += 64; } @@ -439,7 +439,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 7); if (i) { - + if (X < posY) { if (m & 4) { @@ -447,42 +447,42 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); - + data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); - + data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); - + data33 = *(ao5 + 0); data34 = *(ao5 + 1); data35 = *(ao5 + 2); data36 = *(ao5 + 3); - + data41 = *(ao6 + 0); data42 = *(ao6 + 1); data43 = *(ao6 + 2); data44 = *(ao6 + 3); - + data49 = *(ao7 + 0); data50 = *(ao7 + 1); data51 = *(ao7 + 2); data52 = *(ao7 + 3); - + data57 = *(ao8 + 0); data58 = *(ao8 + 1); data59 = *(ao8 + 2); data60 = *(ao8 + 3); - + b[ 0] = data01; b[ 1] = data09; b[ 2] = data17; @@ -491,7 +491,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data41; b[ 6] = data49; b[ 7] = data57; - + b[ 8] = data02; b[ 9] = data10; b[10] = data18; @@ -500,7 +500,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[13] = data42; b[14] = data50; b[15] = data58; - + b[16] = data03; b[17] = data11; b[18] = data19; @@ -509,7 +509,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[21] = data43; b[22] = data51; b[23] = data59; - + b[24] = data04; b[25] = data12; b[26] = data20; @@ -527,35 +527,35 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao6 += 4; ao7 += 4; ao8 += 4; - + b += 32; } - + if (m & 2) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + data09 = *(ao2 + 0); data10 = *(ao2 + 1); - + data17 = *(ao3 + 0); data18 = *(ao3 + 1); - + data25 = *(ao4 + 0); data26 = *(ao4 + 1); - + data33 = *(ao5 + 0); data34 = *(ao5 + 1); - + data41 = *(ao6 + 0); data42 = *(ao6 + 1); - + data49 = *(ao7 + 0); data50 = *(ao7 + 1); - + data57 = *(ao8 + 0); data58 = *(ao8 + 1); - + b[ 0] = data01; b[ 1] = data09; b[ 2] = data17; @@ -564,7 +564,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data41; b[ 6] = data49; b[ 7] = data57; - + b[ 8] = data02; b[ 9] = data10; b[10] = data18; @@ -573,7 +573,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[13] = data42; b[14] = data50; b[15] = data58; - + ao1 += 2; ao2 += 2; ao3 += 2; @@ -582,10 +582,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao6 += 2; ao7 += 2; ao8 += 2; - + b += 16; } - + if (m & 1) { data01 = *(ao1 + 0); data09 = *(ao2 + 0); @@ -595,7 +595,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data41 = *(ao6 + 0); data49 = *(ao7 + 0); data57 = *(ao8 + 0); - + b[ 0] = data01; b[ 1] = data09; b[ 2] = data17; @@ -604,25 +604,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data41; b[ 6] = data49; b[ 7] = data57; - + b += 8; } - } else + } else if (X > posY) { if (m & 4) { ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; - + b += 32; } - + if (m & 2) { ao1 += 2 * lda; b += 16; } - + if (m & 1) { b += 8; } @@ -661,7 +661,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data51 = *(ao7 + 2); data59 = *(ao8 + 2); } - + if (i >= 4) { #ifndef UNIT data28 = *(ao4 + 3); @@ -709,7 +709,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 6] = data49; b[ 7] = data57; b += 8; - + if(i >= 2) { b[ 0] = ZERO; #ifdef UNIT @@ -725,7 +725,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 7] = data58; b += 8; } - + if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; @@ -741,8 +741,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 7] = data59; b += 8; } - - if (i >= 4) { + + if (i >= 4) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; @@ -773,7 +773,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 7] = data61; b += 8; } - + if (i >= 6) { b[ 0] = ZERO; b[ 1] = ZERO; @@ -837,37 +837,37 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); - + data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); - + data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); - + b[ 0] = data01; b[ 1] = data09; b[ 2] = data17; b[ 3] = data25; - + b[ 4] = data02; b[ 5] = data10; b[ 6] = data18; b[ 7] = data26; - + b[ 8] = data03; b[ 9] = data11; b[10] = data19; b[11] = data27; - + b[12] = data04; b[13] = data12; b[14] = data20; @@ -880,7 +880,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b += 16; - } else + } else if (X > posY) { ao1 += 4 * lda; ao2 += 4 * lda; @@ -906,7 +906,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 3] = data25; b[ 4] = ZERO; - b[ 5] = ONE; + b[ 5] = ONE; b[ 6] = data18; b[ 7] = data26; @@ -958,7 +958,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; - + b += 16; } @@ -969,60 +969,60 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 3); if (i) { - + if (X < posY) { if (m & 2) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + data09 = *(ao2 + 0); data10 = *(ao2 + 1); - + data17 = *(ao3 + 0); data18 = *(ao3 + 1); - + data25 = *(ao4 + 0); data26 = *(ao4 + 1); - + b[ 0] = data01; b[ 1] = data09; b[ 2] = data17; b[ 3] = data25; - + b[ 4] = data02; b[ 5] = data10; b[ 6] = data18; b[ 7] = data26; - + ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; - + b += 8; } - + if (m & 1) { data01 = *(ao1 + 0); data09 = *(ao2 + 0); data17 = *(ao3 + 0); data25 = *(ao4 + 0); - + b[ 0] = data01; b[ 1] = data09; b[ 2] = data17; b[ 3] = data25; - + b += 4; } - } else + } else if (X > posY) { if (m & 2) { ao1 += 2 * lda; b += 8; } - + if (m & 1) { b += 4; } @@ -1049,7 +1049,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #endif data27 = *(ao4 + 2); } - + #ifdef UNIT b[ 0] = ONE; #else @@ -1059,7 +1059,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 2] = data17; b[ 3] = data25; b += 4; - + if(i >= 2) { b[ 0] = ZERO; #ifdef UNIT @@ -1071,7 +1071,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 3] = data26; b += 4; } - + if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; @@ -1109,7 +1109,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data09 = *(ao2 + 0); data10 = *(ao2 + 1); - + b[ 0] = data01; b[ 1] = data09; b[ 2] = data02; @@ -1119,7 +1119,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 2; b += 4; - } else + } else if (X > posY) { ao1 += 2 * lda; ao2 += 2 * lda; @@ -1156,15 +1156,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } if (m & 1) { - + if (X < posY) { data01 = *(ao1 + 0); data09 = *(ao2 + 0); - + b[ 0] = data01; b[ 1] = data09; b += 2; - } else + } else if (X > posY) { b += 2; } else { @@ -1201,7 +1201,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 0] = data01; ao1 += 1; b += 1; - } else + } else if (X > posY) { ao1 += lda; b += 1; diff --git a/kernel/generic/trmm_utcopy_1.c b/kernel/generic/trmm_utcopy_1.c index 92f2da3da..86665e828 100644 --- a/kernel/generic/trmm_utcopy_1.c +++ b/kernel/generic/trmm_utcopy_1.c @@ -48,13 +48,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (n > 0) { X = posX; - + if (posX <= posY) { ao1 = a + posX + (posY + 0) * lda; } else { ao1 = a + posY + (posX + 0) * lda; } - + i = m; if (m > 0) { do { @@ -77,7 +77,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b += 1; ao1 += lda; } - + X += 1; i --; } while (i > 0); diff --git a/kernel/generic/trmm_utcopy_16.c b/kernel/generic/trmm_utcopy_16.c index a964cd354..b83989f55 100644 --- a/kernel/generic/trmm_utcopy_16.c +++ b/kernel/generic/trmm_utcopy_16.c @@ -88,7 +88,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a15 = a + posY + (posX + 14) * lda; a16 = a + posY + (posX + 15) * lda; } - + i = (m >> 4); if (i > 0) { do { @@ -110,11 +110,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a15 += 16; a16 += 16; b += 256; - } else + } else if (X > posY) { for (ii = 0; ii < 16; ii++){ - + b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); @@ -123,7 +123,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); - + b[ 8] = *(a01 + 8); b[ 9] = *(a01 + 9); b[ 10] = *(a01 + 10); @@ -132,7 +132,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 13] = *(a01 + 13); b[ 14] = *(a01 + 14); b[ 15] = *(a01 + 15); - + a01 += lda; b += 16; } @@ -174,7 +174,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 13] = ZERO; b[ 14] = ZERO; b[ 15] = ZERO; - + b[ 16] = *(a02 + 0); #ifdef UNIT b[ 17] = ONE; @@ -506,7 +506,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a14 += 16 * lda; a15 += 16 * lda; a16 += 16 * lda; - + b += 256; } @@ -535,11 +535,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a15 += i; a16 += i; b += 16 * i; - } else + } else if (X > posY) { - + for (ii = 0; ii < i; ii++){ - + b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); @@ -548,7 +548,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); - + b[ 8] = *(a01 + 8); b[ 9] = *(a01 + 9); b[ 10] = *(a01 + 10); @@ -557,7 +557,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 13] = *(a01 + 13); b[ 14] = *(a01 + 14); b[ 15] = *(a01 + 15); - + a01 += lda; a02 += lda; a03 += lda; @@ -576,7 +576,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a16 += lda; b += 16; } - + } else { #ifdef UNIT b[ 0] = ONE; @@ -598,7 +598,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; - + if (i >= 2) { b[ 0] = *(a02 + 0); #ifdef UNIT @@ -942,7 +942,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON js --; } while (js > 0); } /* End of main loop */ - + if (n & 8){ X = posX; @@ -966,7 +966,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a07 = a + posY + (posX + 6) * lda; a08 = a + posY + (posX + 7) * lda; } - + i = (m >> 3); if (i > 0) { do { @@ -980,11 +980,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a07 += 8; a08 += 8; b += 64; - } else + } else if (X > posY) { - + for (ii = 0; ii < 8; ii++){ - + b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); @@ -993,7 +993,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); - + a01 += lda; b += 8; } @@ -1018,7 +1018,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; - + b[ 8] = *(a02 + 0); #ifdef UNIT b[ 9] = ONE; @@ -1121,7 +1121,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b += 64; } - + X += 8; i --; } while (i > 0); @@ -1139,10 +1139,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a07 += i; a08 += i; b += 8 * i; - } else + } else if (X > posY) { for (ii = 0; ii < i; ii++){ - + b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); @@ -1151,11 +1151,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); - + a01 += lda; b += 8; } - + a02 += i * lda; a03 += i * lda; a04 += i * lda; @@ -1177,7 +1177,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 6] = ZERO; b[ 7] = ZERO; b += 8; - + if (i >= 2) { b[ 0] = *(a02 + 0); #ifdef UNIT @@ -1292,7 +1292,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a03 = a + posY + (posX + 2) * lda; a04 = a + posY + (posX + 3) * lda; } - + i = (m >> 2); if (i > 0) { do { @@ -1302,11 +1302,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a03 += 4; a04 += 4; b += 16; - } else + } else if (X > posY) { - + for (ii = 0; ii < 4; ii++){ - + b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); @@ -1328,7 +1328,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 1] = ZERO; b[ 2] = ZERO; b[ 3] = ZERO; - + b[ 4] = *(a02 + 0); #ifdef UNIT b[ 5] = ONE; @@ -1346,7 +1346,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 10] = *(a03 + 2); #endif b[ 11] = ZERO; - + b[ 12] = *(a04 + 0); b[ 13] = *(a04 + 1); b[ 14] = *(a04 + 2); @@ -1362,7 +1362,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a04 += 4 * lda; b += 16; } - + X += 4; i --; } while (i > 0); @@ -1376,10 +1376,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a03 += i; a04 += i; b += 4 * i; - } else + } else if (X > posY) { for (ii = 0; ii < i; ii++){ - + b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); @@ -1391,7 +1391,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a03 += lda; a04 += lda; } else { - + #ifdef UNIT b[ 0] = ONE; #else @@ -1401,7 +1401,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 2] = ZERO; b[ 3] = ZERO; b += 4; - + if (i >= 2) { b[ 0] = *(a02 + 0); #ifdef UNIT @@ -1440,7 +1440,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a01 = a + posY + (posX + 0) * lda; a02 = a + posY + (posX + 1) * lda; } - + i = (m >> 1); if (i > 0) { do { @@ -1448,7 +1448,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a01 += 2; a02 += 2; b += 4; - } else + } else if (X > posY) { b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); @@ -1465,7 +1465,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 0] = *(a01 + 0); #endif b[ 1] = ZERO; - + b[ 2] = *(a02 + 0); #ifdef UNIT b[ 3] = ONE; @@ -1477,7 +1477,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a02 += 2 * lda; b += 4; } - + X += 2; i --; } while (i > 0); @@ -1488,7 +1488,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a01 ++; a02 ++; b += 2; - } else + } else if (X > posY) { b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); @@ -1507,7 +1507,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } posY += 2; } - + if (n & 1){ X = posX; @@ -1517,14 +1517,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } else { a01 = a + posY + (posX + 0) * lda; } - + i = m; if (i > 0) { do { if (X < posY) { a01 += 1; b ++; - } else + } else if (X > posY) { b[ 0] = *(a01 + 0); a01 += lda; @@ -1538,7 +1538,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a01 += lda; b ++; } - + X += 1; i --; } while (i > 0); diff --git a/kernel/generic/trmm_utcopy_2.c b/kernel/generic/trmm_utcopy_2.c index 620b06a4f..ae4a19e32 100644 --- a/kernel/generic/trmm_utcopy_2.c +++ b/kernel/generic/trmm_utcopy_2.c @@ -69,13 +69,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 2; b += 4; - } else + } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -86,9 +86,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b += 4; } else { -#ifdef UNIT +#ifdef UNIT data03 = *(ao2 + 0); - + b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = data03; @@ -97,7 +97,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data01 = *(ao1 + 0); data03 = *(ao2 + 0); data04 = *(ao2 + 1); - + b[ 0] = data01; b[ 1] = ZERO; b[ 2] = data03; @@ -115,27 +115,27 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } if (m & 1) { - + if (X < posY) { ao1 += 1; ao2 += 1; b += 2; - } else + } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + b[ 0] = data01; b[ 1] = data02; ao1 += lda; b += 2; } else { -#ifdef UNIT +#ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else data01 = *(ao1 + 0); - + b[ 0] = data01; b[ 1] = ZERO; #endif @@ -151,13 +151,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 1){ X = posX; - + if (posX <= posY) { ao1 = a + posX + (posY + 0) * lda; } else { ao1 = a + posY + (posX + 0) * lda; } - + i = m; if (m > 0) { do { @@ -180,7 +180,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b += 1; ao1 += lda; } - + X += 1; i --; } while (i > 0); diff --git a/kernel/generic/trmm_utcopy_4.c b/kernel/generic/trmm_utcopy_4.c index 7d4dba34b..441f7338b 100644 --- a/kernel/generic/trmm_utcopy_4.c +++ b/kernel/generic/trmm_utcopy_4.c @@ -75,28 +75,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao3 += 4; ao4 += 4; b += 16; - } else + } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + data05 = *(ao2 + 0); data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); - + data09 = *(ao3 + 0); data10 = *(ao3 + 1); data11 = *(ao3 + 2); data12 = *(ao3 + 3); - + data13 = *(ao4 + 0); data14 = *(ao4 + 1); data15 = *(ao4 + 2); data16 = *(ao4 + 3); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -105,7 +105,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; - + b[ 8] = data09; b[ 9] = data10; b[10] = data11; @@ -122,14 +122,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b += 16; } else { -#ifdef UNIT +#ifdef UNIT data05 = *(ao2 + 0); data09 = *(ao3 + 0); data10 = *(ao3 + 1); data13 = *(ao4 + 0); data14 = *(ao4 + 1); data15 = *(ao4 + 2); - + b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = ZERO; @@ -139,7 +139,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = ONE; b[ 6] = ZERO; b[ 7] = ZERO; - + b[ 8] = data09; b[ 9] = data10; b[10] = ONE; @@ -160,7 +160,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data14 = *(ao4 + 1); data15 = *(ao4 + 2); data16 = *(ao4 + 3); - + b[ 0] = data01; b[ 1] = ZERO; b[ 2] = ZERO; @@ -170,7 +170,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = ZERO; b[ 7] = ZERO; - + b[ 8] = data09; b[ 9] = data10; b[10] = data11; @@ -186,7 +186,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; - + b += 16; } @@ -197,7 +197,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 3); if (i) { - + if (X < posY) { if (m & 2) { @@ -207,7 +207,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao4 += 2; b += 8; } - + if (m & 1) { ao1 += 1; ao2 += 1; @@ -215,8 +215,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao4 += 1; b += 4; } - - } else + + } else if (X > posY) { if (m & 2) { data01 = *(ao1 + 0); @@ -227,7 +227,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -236,30 +236,30 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; - + ao1 += 2 * lda; ao2 += 2 * lda; b += 8; } - + if (m & 1) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; - + ao1 += lda; b += 4; } - + } else { -#ifdef UNIT +#ifdef UNIT if (i >= 2) { data05 = *(ao2 + 0); } @@ -274,7 +274,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 2] = ZERO; b[ 3] = ZERO; b += 4; - + if(i >= 2) { b[ 0] = data05; b[ 1] = ONE; @@ -282,7 +282,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 3] = ZERO; b += 4; } - + if (i >= 3) { b[ 0] = data09; b[ 1] = data10; @@ -309,7 +309,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 2] = ZERO; b[ 3] = ZERO; b += 4; - + if(i >= 2) { b[ 0] = data05; b[ 1] = data06; @@ -317,7 +317,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 3] = ZERO; b += 4; } - + if (i >= 3) { b[ 0] = data09; b[ 1] = data10; @@ -353,7 +353,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 2; b += 4; - } else + } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); @@ -399,18 +399,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 1); if (i) { - + if (X < posY) { ao1 += 2; b += 2; - } else + } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + b[ 0] = data01; b[ 1] = data02; - + ao1 += lda; b += 2; } else { diff --git a/kernel/generic/trmm_utcopy_6.c b/kernel/generic/trmm_utcopy_6.c index 7d4dba34b..441f7338b 100644 --- a/kernel/generic/trmm_utcopy_6.c +++ b/kernel/generic/trmm_utcopy_6.c @@ -75,28 +75,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao3 += 4; ao4 += 4; b += 16; - } else + } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + data05 = *(ao2 + 0); data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); - + data09 = *(ao3 + 0); data10 = *(ao3 + 1); data11 = *(ao3 + 2); data12 = *(ao3 + 3); - + data13 = *(ao4 + 0); data14 = *(ao4 + 1); data15 = *(ao4 + 2); data16 = *(ao4 + 3); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -105,7 +105,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; - + b[ 8] = data09; b[ 9] = data10; b[10] = data11; @@ -122,14 +122,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b += 16; } else { -#ifdef UNIT +#ifdef UNIT data05 = *(ao2 + 0); data09 = *(ao3 + 0); data10 = *(ao3 + 1); data13 = *(ao4 + 0); data14 = *(ao4 + 1); data15 = *(ao4 + 2); - + b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = ZERO; @@ -139,7 +139,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = ONE; b[ 6] = ZERO; b[ 7] = ZERO; - + b[ 8] = data09; b[ 9] = data10; b[10] = ONE; @@ -160,7 +160,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data14 = *(ao4 + 1); data15 = *(ao4 + 2); data16 = *(ao4 + 3); - + b[ 0] = data01; b[ 1] = ZERO; b[ 2] = ZERO; @@ -170,7 +170,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = ZERO; b[ 7] = ZERO; - + b[ 8] = data09; b[ 9] = data10; b[10] = data11; @@ -186,7 +186,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; - + b += 16; } @@ -197,7 +197,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 3); if (i) { - + if (X < posY) { if (m & 2) { @@ -207,7 +207,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao4 += 2; b += 8; } - + if (m & 1) { ao1 += 1; ao2 += 1; @@ -215,8 +215,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao4 += 1; b += 4; } - - } else + + } else if (X > posY) { if (m & 2) { data01 = *(ao1 + 0); @@ -227,7 +227,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -236,30 +236,30 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; - + ao1 += 2 * lda; ao2 += 2 * lda; b += 8; } - + if (m & 1) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; - + ao1 += lda; b += 4; } - + } else { -#ifdef UNIT +#ifdef UNIT if (i >= 2) { data05 = *(ao2 + 0); } @@ -274,7 +274,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 2] = ZERO; b[ 3] = ZERO; b += 4; - + if(i >= 2) { b[ 0] = data05; b[ 1] = ONE; @@ -282,7 +282,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 3] = ZERO; b += 4; } - + if (i >= 3) { b[ 0] = data09; b[ 1] = data10; @@ -309,7 +309,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 2] = ZERO; b[ 3] = ZERO; b += 4; - + if(i >= 2) { b[ 0] = data05; b[ 1] = data06; @@ -317,7 +317,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 3] = ZERO; b += 4; } - + if (i >= 3) { b[ 0] = data09; b[ 1] = data10; @@ -353,7 +353,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 2; b += 4; - } else + } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); @@ -399,18 +399,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 1); if (i) { - + if (X < posY) { ao1 += 2; b += 2; - } else + } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + b[ 0] = data01; b[ 1] = data02; - + ao1 += lda; b += 2; } else { diff --git a/kernel/generic/trmm_utcopy_8.c b/kernel/generic/trmm_utcopy_8.c index 6dbf8bd28..65fee357b 100644 --- a/kernel/generic/trmm_utcopy_8.c +++ b/kernel/generic/trmm_utcopy_8.c @@ -80,7 +80,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao7 = a + posY + (posX + 6) * lda; ao8 = a + posY + (posX + 7) * lda; } - + i = (m >> 3); if (i > 0) { do { @@ -95,7 +95,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao8 += 8; b += 64; - } else + } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); @@ -105,7 +105,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); - + data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); @@ -114,7 +114,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); - + data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); @@ -123,7 +123,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); - + data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); @@ -132,7 +132,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data30 = *(ao4 + 5); data31 = *(ao4 + 6); data32 = *(ao4 + 7); - + data33 = *(ao5 + 0); data34 = *(ao5 + 1); data35 = *(ao5 + 2); @@ -141,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data38 = *(ao5 + 5); data39 = *(ao5 + 6); data40 = *(ao5 + 7); - + data41 = *(ao6 + 0); data42 = *(ao6 + 1); data43 = *(ao6 + 2); @@ -159,7 +159,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data54 = *(ao7 + 5); data55 = *(ao7 + 6); data56 = *(ao7 + 7); - + data57 = *(ao8 + 0); data58 = *(ao8 + 1); data59 = *(ao8 + 2); @@ -168,7 +168,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data62 = *(ao8 + 5); data63 = *(ao8 + 6); data64 = *(ao8 + 7); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -177,7 +177,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; - + b[ 8] = data09; b[ 9] = data10; b[10] = data11; @@ -195,7 +195,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[21] = data22; b[22] = data23; b[23] = data24; - + b[24] = data25; b[25] = data26; b[26] = data27; @@ -213,7 +213,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[37] = data38; b[38] = data39; b[39] = data40; - + b[40] = data41; b[41] = data42; b[42] = data43; @@ -240,7 +240,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[61] = data62; b[62] = data63; b[63] = data64; - + ao1 += 8 * lda; ao2 += 8 * lda; ao3 += 8 * lda; @@ -249,25 +249,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao6 += 8 * lda; ao7 += 8 * lda; ao8 += 8 * lda; - + b += 64; } else { #ifdef UNIT data09 = *(ao2 + 0); - + data17 = *(ao3 + 0); data18 = *(ao3 + 1); - + data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); - + data33 = *(ao5 + 0); data34 = *(ao5 + 1); data35 = *(ao5 + 2); data36 = *(ao5 + 3); - + data41 = *(ao6 + 0); data42 = *(ao6 + 1); data43 = *(ao6 + 2); @@ -280,7 +280,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data52 = *(ao7 + 3); data53 = *(ao7 + 4); data54 = *(ao7 + 5); - + data57 = *(ao8 + 0); data58 = *(ao8 + 1); data59 = *(ao8 + 2); @@ -297,7 +297,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; - + b[ 8] = data09; b[ 9] = ONE; b[10] = ZERO; @@ -315,7 +315,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[21] = ZERO; b[22] = ZERO; b[23] = ZERO; - + b[24] = data25; b[25] = data26; b[26] = data27; @@ -333,7 +333,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[37] = ZERO; b[38] = ZERO; b[39] = ZERO; - + b[40] = data41; b[41] = data42; b[42] = data43; @@ -365,22 +365,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data09 = *(ao2 + 0); data10 = *(ao2 + 1); - + data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); - + data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); - + data33 = *(ao5 + 0); data34 = *(ao5 + 1); data35 = *(ao5 + 2); data36 = *(ao5 + 3); data37 = *(ao5 + 4); - + data41 = *(ao6 + 0); data42 = *(ao6 + 1); data43 = *(ao6 + 2); @@ -395,7 +395,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data53 = *(ao7 + 4); data54 = *(ao7 + 5); data55 = *(ao7 + 6); - + data57 = *(ao8 + 0); data58 = *(ao8 + 1); data59 = *(ao8 + 2); @@ -413,7 +413,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; - + b[ 8] = data09; b[ 9] = data10; b[10] = ZERO; @@ -431,7 +431,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[21] = ZERO; b[22] = ZERO; b[23] = ZERO; - + b[24] = data25; b[25] = data26; b[26] = data27; @@ -449,7 +449,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[37] = ZERO; b[38] = ZERO; b[39] = ZERO; - + b[40] = data41; b[41] = data42; b[42] = data43; @@ -486,7 +486,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao6 += 8 * lda; ao7 += 8 * lda; ao8 += 8 * lda; - + b += 64; } @@ -497,7 +497,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 7); if (i) { - + if (X < posY) { if (m & 4) { @@ -509,10 +509,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao6 += 4; ao7 += 4; ao8 += 4; - + b += 32; } - + if (m & 2) { ao1 += 2; ao2 += 2; @@ -522,14 +522,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao6 += 2; ao7 += 2; ao8 += 2; - + b += 16; } - + if (m & 1) { b += 8; } - } else + } else if (X > posY) { if (m & 4) { data01 = *(ao1 + 0); @@ -540,7 +540,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); - + data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); @@ -549,7 +549,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); - + data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); @@ -558,7 +558,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); - + data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); @@ -567,7 +567,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data30 = *(ao4 + 5); data31 = *(ao4 + 6); data32 = *(ao4 + 7); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -576,7 +576,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; - + b[ 8] = data09; b[ 9] = data10; b[10] = data11; @@ -585,7 +585,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[13] = data14; b[14] = data15; b[15] = data16; - + b[16] = data17; b[17] = data18; b[18] = data19; @@ -594,7 +594,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[21] = data22; b[22] = data23; b[23] = data24; - + b[24] = data25; b[25] = data26; b[26] = data27; @@ -603,15 +603,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[29] = data30; b[30] = data31; b[31] = data32; - + ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; - + b += 32; } - + if (m & 2) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); @@ -621,7 +621,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); - + data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); @@ -630,7 +630,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -639,7 +639,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; - + b[ 8] = data09; b[ 9] = data10; b[10] = data11; @@ -648,11 +648,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[13] = data14; b[14] = data15; b[15] = data16; - + ao1 += 2 * lda; b += 16; } - + if (m & 1) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); @@ -671,7 +671,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; - + b += 8; } } else { @@ -709,7 +709,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data51 = *(ao7 + 2); data59 = *(ao8 + 2); } - + if (i >= 4) { #ifndef UNIT data28 = *(ao4 + 3); @@ -757,7 +757,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 6] = ZERO; b[ 7] = ZERO; b += 8; - + if(i >= 2) { b[ 0] = data09; #ifdef UNIT @@ -773,7 +773,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 7] = ZERO; b += 8; } - + if (i >= 3) { b[ 0] = data17; b[ 1] = data18; @@ -789,8 +789,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 7] = ZERO; b += 8; } - - if (i >= 4) { + + if (i >= 4) { b[ 0] = data25; b[ 1] = data26; b[ 2] = data27; @@ -821,7 +821,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 7] = ZERO; b += 8; } - + if (i >= 6) { b[ 0] = data41; b[ 1] = data42; @@ -888,7 +888,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b += 16; - } else + } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); @@ -914,7 +914,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -1007,7 +1007,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; - + b += 16; } @@ -1018,7 +1018,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 3); if (i) { - + if (X < posY) { if (m & 2) { @@ -1026,14 +1026,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 2; ao3 += 2; ao4 += 2; - + b += 8; } - + if (m & 1) { b += 4; } - } else + } else if (X > posY) { if (m & 2) { data01 = *(ao1 + 0); @@ -1045,7 +1045,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -1054,11 +1054,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data10; b[ 6] = data11; b[ 7] = data12; - + ao1 += 2 * lda; b += 8; } - + if (m & 1) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); @@ -1069,7 +1069,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; - + b += 4; } } else { @@ -1095,7 +1095,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #endif data27 = *(ao4 + 2); } - + #ifndef UNIT b[ 0] = ONE; #else @@ -1105,7 +1105,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 2] = ZERO; b[ 3] = ZERO; b += 4; - + if(i >= 2) { b[ 0] = data09; #ifndef UNIT @@ -1117,10 +1117,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 3] = ZERO; b += 4; } - + if (i >= 3) { b[ 0] = data17; - + b[ 1] = data18; #ifndef UNIT b[ 2] = ONE; @@ -1155,7 +1155,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 2; b += 4; - } else + } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); @@ -1164,7 +1164,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao1 += 2 * lda; ao2 += 2 * lda; - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; @@ -1204,10 +1204,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } if (m & 1) { - + if (X < posY) { b += 2; - } else + } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); @@ -1247,11 +1247,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (X < posY) { ao1 += 1; b += 1; - } else + } else if (X > posY) { data01 = *(ao1 + 0); ao1 += lda; - + b[ 0] = data01; b += 1; diff --git a/kernel/generic/trmmkernel_16x2.c b/kernel/generic/trmmkernel_16x2.c index 437fa0950..078a91dd5 100644 --- a/kernel/generic/trmmkernel_16x2.c +++ b/kernel/generic/trmmkernel_16x2.c @@ -1,6 +1,6 @@ #include "common.h" -int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) { BLASLONG i,j,k; @@ -51,12 +51,12 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL BLASLONG off, temp; #if !defined(LEFT) - off = -offset; + off = -offset; #endif - for (j=0; j= 0; i--) { aa = *(a + i); - + for (j = 0; j < n; j ++) { bb = *(c + i + j * ldc); bb *= aa; @@ -141,7 +141,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B aa1 = *(a + i * 2 + 0); aa2 = *(a + i * 2 + 1); - + for (j = 0; j < n; j ++) { bb1 = *(c + i * 2 + 0 + j * ldc); bb2 = *(c + i * 2 + 1 + j * ldc); @@ -181,7 +181,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B #endif -int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, #ifdef COMPLEX FLOAT dummy2, #endif @@ -197,33 +197,33 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, #endif j = (n >> GEMM_UNROLL_N_SHIFT); - + while (j > 0) { kk = m + offset; - + if (m & (GEMM_UNROLL_M - 1)) { for (i = 1; i < GEMM_UNROLL_M; i *= 2){ if (m & i) { aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; - + if (k - kk > 0) { - GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + i * kk * COMPSIZE, - b + GEMM_UNROLL_N * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, cc, - ldc); + ldc); } - solve(i, GEMM_UNROLL_N, - aa + (kk - i) * i * COMPSIZE, + solve(i, GEMM_UNROLL_N, + aa + (kk - i) * i * COMPSIZE, b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); - + kk -= i; } } @@ -236,102 +236,102 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, do { if (k - kk > 0) { - GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + GEMM_UNROLL_M * kk * COMPSIZE, - b + GEMM_UNROLL_N * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, cc, - ldc); + ldc); } - solve(GEMM_UNROLL_M, GEMM_UNROLL_N, - aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); - + aa -= GEMM_UNROLL_M * k * COMPSIZE; cc -= GEMM_UNROLL_M * COMPSIZE; kk -= GEMM_UNROLL_M; i --; } while (i > 0); } - + b += GEMM_UNROLL_N * k * COMPSIZE; c += GEMM_UNROLL_N * ldc * COMPSIZE; j --; } - + if (n & (GEMM_UNROLL_N - 1)) { j = (GEMM_UNROLL_N >> 1); while (j > 0) { if (n & j) { - + kk = m + offset; - + if (m & (GEMM_UNROLL_M - 1)) { for (i = 1; i < GEMM_UNROLL_M; i *= 2){ if (m & i) { aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; - + if (k - kk > 0) { - GEMM_KERNEL(i, j, k - kk, dm1, + GEMM_KERNEL(i, j, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + i * kk * COMPSIZE, - b + j * kk * COMPSIZE, - cc, ldc); + b + j * kk * COMPSIZE, + cc, ldc); } - solve(i, j, + solve(i, j, aa + (kk - i) * i * COMPSIZE, b + (kk - i) * j * COMPSIZE, cc, ldc); - + kk -= i; } } } - + i = (m >> GEMM_UNROLL_M_SHIFT); if (i > 0) { aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; - + do { if (k - kk > 0) { - GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + GEMM_UNROLL_M * kk * COMPSIZE, - b + j * kk * COMPSIZE, + b + j * kk * COMPSIZE, cc, - ldc); + ldc); } - solve(GEMM_UNROLL_M, j, + solve(GEMM_UNROLL_M, j, aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, b + (kk - GEMM_UNROLL_M) * j * COMPSIZE, cc, ldc); - + aa -= GEMM_UNROLL_M * k * COMPSIZE; cc -= GEMM_UNROLL_M * COMPSIZE; kk -= GEMM_UNROLL_M; i --; } while (i > 0); } - + b += j * k * COMPSIZE; c += j * ldc * COMPSIZE; } j >>= 1; } } - + return 0; } diff --git a/kernel/generic/trsm_kernel_LT.c b/kernel/generic/trsm_kernel_LT.c index 099624252..07b33467e 100644 --- a/kernel/generic/trsm_kernel_LT.c +++ b/kernel/generic/trsm_kernel_LT.c @@ -101,7 +101,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B for (i = 0; i < m; i++) { aa = *(a + i); - + for (j = 0; j < n; j ++) { bb = *(c + i + j * ldc); bb *= aa; @@ -134,7 +134,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B aa1 = *(a + i * 2 + 0); aa2 = *(a + i * 2 + 1); - + for (j = 0; j < n; j ++) { bb1 = *(c + i * 2 + 0 + j * ldc); bb2 = *(c + i * 2 + 1 + j * ldc); @@ -191,24 +191,24 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, j = (n >> GEMM_UNROLL_N_SHIFT); while (j > 0) { - + kk = offset; aa = a; cc = c; - + i = (m >> GEMM_UNROLL_M_SHIFT); - + while (i > 0) { if (kk > 0) { - GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, #ifdef COMPLEX ZERO, #endif - aa, b, cc, ldc); + aa, b, cc, ldc); } - solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); @@ -218,19 +218,19 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, kk += GEMM_UNROLL_M; i --; } - + if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); while (i > 0) { if (m & i) { if (kk > 0) { - GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, #ifdef COMPLEX ZERO, #endif - aa, b, cc, ldc); + aa, b, cc, ldc); } - solve(i, GEMM_UNROLL_N, + solve(i, GEMM_UNROLL_N, aa + kk * i * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); @@ -242,39 +242,39 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, i >>= 1; } } - + b += GEMM_UNROLL_N * k * COMPSIZE; c += GEMM_UNROLL_N * ldc * COMPSIZE; j --; jj += GEMM_UNROLL_M; } - + if (n & (GEMM_UNROLL_N - 1)) { j = (GEMM_UNROLL_N >> 1); while (j > 0) { if (n & j) { - + kk = offset; aa = a; cc = c; - + i = (m >> GEMM_UNROLL_M_SHIFT); - + while (i > 0) { if (kk > 0) { - GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, - b, + b, cc, - ldc); + ldc); } - solve(GEMM_UNROLL_M, j, - aa + kk * GEMM_UNROLL_M * COMPSIZE, + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * j * COMPSIZE, cc, ldc); aa += GEMM_UNROLL_M * k * COMPSIZE; @@ -282,24 +282,24 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, kk += GEMM_UNROLL_M; i --; } - + if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); while (i > 0) { if (m & i) { if (kk > 0) { - GEMM_KERNEL(i, j, kk, dm1, + GEMM_KERNEL(i, j, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, - b, + b, cc, - ldc); + ldc); } - solve(i, j, - aa + kk * i * COMPSIZE, + solve(i, j, + aa + kk * i * COMPSIZE, b + kk * j * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; @@ -309,7 +309,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, i >>= 1; } } - + b += j * k * COMPSIZE; c += j * ldc * COMPSIZE; } diff --git a/kernel/generic/trsm_kernel_RN.c b/kernel/generic/trsm_kernel_RN.c index d7e650e0c..07a4f3b40 100644 --- a/kernel/generic/trsm_kernel_RN.c +++ b/kernel/generic/trsm_kernel_RN.c @@ -101,7 +101,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B for (i = 0; i < n; i++) { bb = *(b + i); - + for (j = 0; j < m; j ++) { aa = *(c + j + i * ldc); aa *= bb; @@ -134,7 +134,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B bb1 = *(b + i * 2 + 0); bb2 = *(b + i * 2 + 1); - + for (j = 0; j < m; j ++) { aa1 = *(c + j * 2 + 0 + i * ldc); aa2 = *(c + j * 2 + 1 + i * ldc); @@ -171,7 +171,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B #endif -int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, #ifdef COMPLEX FLOAT dummy2, #endif @@ -191,46 +191,46 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, kk = -offset; while (j > 0) { - + aa = a; cc = c; - + i = (m >> GEMM_UNROLL_M_SHIFT); - + if (i > 0) { do { if (kk > 0) { - GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, #ifdef COMPLEX ZERO, #endif - aa, b, cc, ldc); + aa, b, cc, ldc); } - solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); - + aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; i --; } while (i > 0); } - + if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); while (i > 0) { if (m & i) { if (kk > 0) { - GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, #ifdef COMPLEX ZERO, #endif - aa, b, cc, ldc); + aa, b, cc, ldc); } - solve(i, GEMM_UNROLL_N, + solve(i, GEMM_UNROLL_N, aa + kk * i * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); @@ -241,63 +241,63 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, i >>= 1; } } - + kk += GEMM_UNROLL_N; b += GEMM_UNROLL_N * k * COMPSIZE; c += GEMM_UNROLL_N * ldc * COMPSIZE; j --; jj += GEMM_UNROLL_M; } - + if (n & (GEMM_UNROLL_N - 1)) { j = (GEMM_UNROLL_N >> 1); while (j > 0) { if (n & j) { - + aa = a; cc = c; - + i = (m >> GEMM_UNROLL_M_SHIFT); - + while (i > 0) { if (kk > 0) { - GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, - b, + b, cc, - ldc); + ldc); } - solve(GEMM_UNROLL_M, j, - aa + kk * GEMM_UNROLL_M * COMPSIZE, + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * j * COMPSIZE, cc, ldc); aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; i --; } - + if (m & (GEMM_UNROLL_M - 1)) { i = (GEMM_UNROLL_M >> 1); while (i > 0) { if (m & i) { if (kk > 0) { - GEMM_KERNEL(i, j, kk, dm1, + GEMM_KERNEL(i, j, kk, dm1, #ifdef COMPLEX ZERO, #endif aa, - b, + b, cc, - ldc); + ldc); } - solve(i, j, - aa + kk * i * COMPSIZE, + solve(i, j, + aa + kk * i * COMPSIZE, b + kk * j * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; @@ -306,7 +306,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, i >>= 1; } } - + b += j * k * COMPSIZE; c += j * ldc * COMPSIZE; kk += j; diff --git a/kernel/generic/trsm_kernel_RT.c b/kernel/generic/trsm_kernel_RT.c index a46945330..0c4db335c 100644 --- a/kernel/generic/trsm_kernel_RT.c +++ b/kernel/generic/trsm_kernel_RT.c @@ -106,7 +106,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B for (i = n - 1; i >= 0; i--) { bb = *(b + i); - + for (j = 0; j < m; j ++) { aa = *(c + j + i * ldc); aa *= bb; @@ -144,7 +144,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B bb1 = *(b + i * 2 + 0); bb2 = *(b + i * 2 + 1); - + for (j = 0; j < m; j ++) { aa1 = *(c + j * 2 + 0 + i * ldc); @@ -193,7 +193,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, BLASLONG i, j; FLOAT *aa, *cc; BLASLONG kk; - + #if 0 fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", m, n, k, offset); @@ -208,32 +208,32 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, j = 1; while (j < GEMM_UNROLL_N) { if (n & j) { - + aa = a; b -= j * k * COMPSIZE; c -= j * ldc* COMPSIZE; cc = c; - + i = (m >> GEMM_UNROLL_M_SHIFT); if (i > 0) { do { if (k - kk > 0) { - GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + GEMM_UNROLL_M * kk * COMPSIZE, - b + j * kk * COMPSIZE, + b + j * kk * COMPSIZE, cc, - ldc); + ldc); } - solve(GEMM_UNROLL_M, j, + solve(GEMM_UNROLL_M, j, aa + (kk - j) * GEMM_UNROLL_M * COMPSIZE, b + (kk - j) * j * COMPSIZE, cc, ldc); - + aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; i --; @@ -246,23 +246,23 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, if (m & i) { if (k - kk > 0) { - GEMM_KERNEL(i, j, k - kk, dm1, + GEMM_KERNEL(i, j, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + i * kk * COMPSIZE, - b + j * kk * COMPSIZE, - cc, ldc); + b + j * kk * COMPSIZE, + cc, ldc); } - solve(i, j, + solve(i, j, aa + (kk - j) * i * COMPSIZE, b + (kk - j) * j * COMPSIZE, cc, ldc); aa += i * k * COMPSIZE; cc += i * COMPSIZE; - + } i >>= 1; } while (i > 0); @@ -287,21 +287,21 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, if (i > 0) { do { if (k - kk > 0) { - GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + GEMM_UNROLL_M * kk * COMPSIZE, - b + GEMM_UNROLL_N * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, cc, - ldc); + ldc); } - - solve(GEMM_UNROLL_M, GEMM_UNROLL_N, - aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); - + aa += GEMM_UNROLL_M * k * COMPSIZE; cc += GEMM_UNROLL_M * COMPSIZE; i --; @@ -313,28 +313,28 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, do { if (m & i) { if (k - kk > 0) { - GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, #ifdef COMPLEX ZERO, #endif aa + i * kk * COMPSIZE, - b + GEMM_UNROLL_N * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, cc, - ldc); + ldc); } - - solve(i, GEMM_UNROLL_N, - aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, + + solve(i, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); - + aa += i * k * COMPSIZE; cc += i * COMPSIZE; } i >>= 1; } while (i > 0); } - + kk -= GEMM_UNROLL_N; j --; } while (j > 0); diff --git a/kernel/generic/trsm_lncopy_1.c b/kernel/generic/trsm_lncopy_1.c index abad971a6..13c88adad 100644 --- a/kernel/generic/trsm_lncopy_1.c +++ b/kernel/generic/trsm_lncopy_1.c @@ -73,7 +73,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT } if (ii > jj) *(b + 0) = *(a1 + 0); - + a1 ++; b ++; diff --git a/kernel/generic/trsm_lncopy_16.c b/kernel/generic/trsm_lncopy_16.c index a7f9cb0b3..9754e677b 100644 --- a/kernel/generic/trsm_lncopy_16.c +++ b/kernel/generic/trsm_lncopy_16.c @@ -51,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; FLOAT *a9, *a10, *a11, *a12, *a13, *a14, *a15, *a16; - + jj = offset; j = (n >> 4); @@ -78,14 +78,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 16)) { for (k = 0; k < ii - jj; k ++) { *(b + k) = *(a1 + k * lda); } *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); } - + if (ii - jj >= 16) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a2 + 0); @@ -143,14 +143,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 8)) { for (k = 0; k < ii - jj; k ++) { *(b + k) = *(a1 + k * lda); } *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); } - + if (ii - jj >= 8) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a2 + 0); @@ -187,14 +187,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 4)) { for (k = 0; k < ii - jj; k ++) { *(b + k) = *(a1 + k * lda); } *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); } - + if (ii - jj >= 4) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a2 + 0); @@ -221,14 +221,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 2)) { for (k = 0; k < ii - jj; k ++) { *(b + k) = *(a1 + k * lda); } *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); } - + if (ii - jj >= 2) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a2 + 0); @@ -249,14 +249,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 1)) { for (k = 0; k < ii - jj; k ++) { *(b + k) = *(a1 + k * lda); } *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); } - + if (ii - jj >= 1) { *(b + 0) = *(a1 + 0); } diff --git a/kernel/generic/trsm_lncopy_2.c b/kernel/generic/trsm_lncopy_2.c index 20cc64253..69bfbea15 100644 --- a/kernel/generic/trsm_lncopy_2.c +++ b/kernel/generic/trsm_lncopy_2.c @@ -91,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 2) = data02; *(b + 3) = data04; } - + a1 += 2; a2 += 2; b += 4; @@ -142,7 +142,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data01 = *(a1 + 0); *(b + 0) = data01; } - + a1+= 1; b += 1; i --; diff --git a/kernel/generic/trsm_lncopy_4.c b/kernel/generic/trsm_lncopy_4.c index 9f7bcc2dd..a37c50d1f 100644 --- a/kernel/generic/trsm_lncopy_4.c +++ b/kernel/generic/trsm_lncopy_4.c @@ -145,7 +145,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 14) = data12; *(b + 15) = data16; } - + a1 += 4; a2 += 4; a3 += 4; @@ -193,7 +193,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 6) = data06; *(b + 7) = data08; } - + a1 += 2; a2 += 2; a3 += 2; @@ -266,7 +266,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 2) = data02; *(b + 3) = data04; } - + a1 += 2; a2 += 2; b += 4; @@ -314,7 +314,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data01 = *(a1 + 0); *(b + 0) = data01; } - + a1+= 1; b += 1; i --; diff --git a/kernel/generic/trsm_lncopy_6.c b/kernel/generic/trsm_lncopy_6.c index 9f7bcc2dd..a37c50d1f 100644 --- a/kernel/generic/trsm_lncopy_6.c +++ b/kernel/generic/trsm_lncopy_6.c @@ -145,7 +145,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 14) = data12; *(b + 15) = data16; } - + a1 += 4; a2 += 4; a3 += 4; @@ -193,7 +193,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 6) = data06; *(b + 7) = data08; } - + a1 += 2; a2 += 2; a3 += 2; @@ -266,7 +266,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 2) = data02; *(b + 3) = data04; } - + a1 += 2; a2 += 2; b += 4; @@ -314,7 +314,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data01 = *(a1 + 0); *(b + 0) = data01; } - + a1+= 1; b += 1; i --; diff --git a/kernel/generic/trsm_lncopy_8.c b/kernel/generic/trsm_lncopy_8.c index 40feb810f..ca019fc0c 100644 --- a/kernel/generic/trsm_lncopy_8.c +++ b/kernel/generic/trsm_lncopy_8.c @@ -140,7 +140,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT #endif *(b + 0) = INV(data01); - + *(b + 8) = data02; *(b + 9) = INV(data10); @@ -152,7 +152,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 25) = data12; *(b + 26) = data20; *(b + 27) = INV(data28); - + *(b + 32) = data05; *(b + 33) = data13; *(b + 34) = data21; @@ -165,7 +165,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 43) = data30; *(b + 44) = data38; *(b + 45) = INV(data46); - + *(b + 48) = data07; *(b + 49) = data15; *(b + 50) = data23; @@ -265,7 +265,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 5) = data41; *(b + 6) = data49; *(b + 7) = data57; - + *(b + 8) = data02; *(b + 9) = data10; *(b + 10) = data18; @@ -274,7 +274,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 13) = data42; *(b + 14) = data50; *(b + 15) = data58; - + *(b + 16) = data03; *(b + 17) = data11; *(b + 18) = data19; @@ -283,7 +283,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 21) = data43; *(b + 22) = data51; *(b + 23) = data59; - + *(b + 24) = data04; *(b + 25) = data12; *(b + 26) = data20; @@ -292,7 +292,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 29) = data44; *(b + 30) = data52; *(b + 31) = data60; - + *(b + 32) = data05; *(b + 33) = data13; *(b + 34) = data21; @@ -310,7 +310,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 45) = data46; *(b + 46) = data54; *(b + 47) = data62; - + *(b + 48) = data07; *(b + 49) = data15; *(b + 50) = data23; @@ -329,7 +329,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 62) = data56; *(b + 63) = data64; } - + a1 += 8; a2 += 8; a3 += 8; @@ -370,7 +370,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT #endif *(b + 0) = INV(data01); - + *(b + 8) = data02; *(b + 9) = INV(data10); @@ -429,7 +429,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 5) = data41; *(b + 6) = data49; *(b + 7) = data57; - + *(b + 8) = data02; *(b + 9) = data10; *(b + 10) = data18; @@ -438,7 +438,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 13) = data42; *(b + 14) = data50; *(b + 15) = data58; - + *(b + 16) = data03; *(b + 17) = data11; *(b + 18) = data19; @@ -447,7 +447,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 21) = data43; *(b + 22) = data51; *(b + 23) = data59; - + *(b + 24) = data04; *(b + 25) = data12; *(b + 26) = data20; @@ -456,9 +456,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 29) = data44; *(b + 30) = data52; *(b + 31) = data60; - + } - + a1 += 4; a2 += 4; a3 += 4; @@ -483,7 +483,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT #endif *(b + 0) = INV(data01); - + *(b + 8) = data02; *(b + 9) = INV(data10); } @@ -515,7 +515,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 5) = data41; *(b + 6) = data49; *(b + 7) = data57; - + *(b + 8) = data02; *(b + 9) = data10; *(b + 10) = data18; @@ -525,7 +525,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 14) = data50; *(b + 15) = data58; } - + a1 += 2; a2 += 2; a3 += 2; @@ -608,7 +608,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT #endif *(b + 0) = INV(data01); - + *(b + 4) = data02; *(b + 5) = INV(data10); @@ -652,18 +652,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 5) = data10; *(b + 6) = data18; *(b + 7) = data26; - + *(b + 8) = data03; *(b + 9) = data11; *(b + 10) = data19; *(b + 11) = data27; - + *(b + 12) = data04; *(b + 13) = data12; *(b + 14) = data20; *(b + 15) = data28; } - + a1 += 4; a2 += 4; a3 += 4; @@ -686,7 +686,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT #endif *(b + 0) = INV(data01); - + *(b + 4) = data02; *(b + 5) = INV(data10); } @@ -710,7 +710,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 6) = data18; *(b + 7) = data26; } - + a1 += 2; a2 += 2; a3 += 2; @@ -779,7 +779,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 2) = data02; *(b + 3) = data10; } - + a1 += 2; a2 += 2; b += 4; @@ -828,7 +828,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data01 = *(a1 + 0); *(b + 0) = data01; } - + a1 += 1; b += 1; diff --git a/kernel/generic/trsm_ltcopy_16.c b/kernel/generic/trsm_ltcopy_16.c index 1203f1bfa..42618c215 100644 --- a/kernel/generic/trsm_ltcopy_16.c +++ b/kernel/generic/trsm_ltcopy_16.c @@ -61,7 +61,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 16)) { *(b + ii - jj) = INV(*(a1 + ii - jj)); @@ -71,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT } } - + if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); @@ -107,17 +107,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 8)) { *(b + ii - jj) = INV(*(a1 + ii - jj)); - + for (k = ii - jj + 1; k < 8; k ++) { *(b + k) = *(a1 + k); } } - + if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); @@ -145,7 +145,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 4)) { *(b + ii - jj) = INV(*(a1 + ii - jj)); @@ -154,7 +154,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT } } - + if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); @@ -178,7 +178,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 2)) { *(b + ii - jj) = INV(*(a1 + ii - jj)); @@ -188,7 +188,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT } } - + if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); @@ -209,11 +209,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 1)) { *(b + ii - jj) = INV(*(a1 + ii - jj)); } - + if (ii - jj < 0) { *(b + 0) = *(a1 + 0); } diff --git a/kernel/generic/trsm_ltcopy_2.c b/kernel/generic/trsm_ltcopy_2.c index 470563517..9f48e8414 100644 --- a/kernel/generic/trsm_ltcopy_2.c +++ b/kernel/generic/trsm_ltcopy_2.c @@ -92,7 +92,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 2) = data03; *(b + 3) = data04; } - + a1 += 2 * lda; a2 += 2 * lda; b += 4; @@ -109,7 +109,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); - + *(b + 0) = INV(data01); *(b + 1) = data02; } @@ -147,7 +147,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data01 = *(a1 + 0); *(b + 0) = data01; } - + a1 += 1 * lda; b += 1; diff --git a/kernel/generic/trsm_ltcopy_4.c b/kernel/generic/trsm_ltcopy_4.c index d891468a4..12043eb33 100644 --- a/kernel/generic/trsm_ltcopy_4.c +++ b/kernel/generic/trsm_ltcopy_4.c @@ -145,7 +145,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 14) = data15; *(b + 15) = data16; } - + a1 += 4 * lda; a2 += 4 * lda; a3 += 4 * lda; @@ -159,25 +159,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT if ((m & 2) != 0) { if (ii== jj) { - + #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); - + #ifndef UNIT data06 = *(a2 + 1); #endif data07 = *(a2 + 2); data08 = *(a2 + 3); - + *(b + 0) = INV(data01); *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; - + *(b + 5) = INV(data06); *(b + 6) = data07; *(b + 7) = data08; @@ -204,7 +204,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 6) = data07; *(b + 7) = data08; } - + a1 += 2 * lda; a2 += 2 * lda; b += 8; @@ -222,7 +222,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); - + *(b + 0) = INV(data01); *(b + 1) = data02; *(b + 2) = data03; @@ -284,7 +284,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 2) = data03; *(b + 3) = data04; } - + a1 += 2 * lda; a2 += 2 * lda; b += 4; @@ -333,7 +333,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data01 = *(a1 + 0); *(b + 0) = data01; } - + a1 += 1 * lda; b += 1; diff --git a/kernel/generic/trsm_ltcopy_6.c b/kernel/generic/trsm_ltcopy_6.c index d891468a4..12043eb33 100644 --- a/kernel/generic/trsm_ltcopy_6.c +++ b/kernel/generic/trsm_ltcopy_6.c @@ -145,7 +145,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 14) = data15; *(b + 15) = data16; } - + a1 += 4 * lda; a2 += 4 * lda; a3 += 4 * lda; @@ -159,25 +159,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT if ((m & 2) != 0) { if (ii== jj) { - + #ifndef UNIT data01 = *(a1 + 0); #endif data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); - + #ifndef UNIT data06 = *(a2 + 1); #endif data07 = *(a2 + 2); data08 = *(a2 + 3); - + *(b + 0) = INV(data01); *(b + 1) = data02; *(b + 2) = data03; *(b + 3) = data04; - + *(b + 5) = INV(data06); *(b + 6) = data07; *(b + 7) = data08; @@ -204,7 +204,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 6) = data07; *(b + 7) = data08; } - + a1 += 2 * lda; a2 += 2 * lda; b += 8; @@ -222,7 +222,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data02 = *(a1 + 1); data03 = *(a1 + 2); data04 = *(a1 + 3); - + *(b + 0) = INV(data01); *(b + 1) = data02; *(b + 2) = data03; @@ -284,7 +284,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 2) = data03; *(b + 3) = data04; } - + a1 += 2 * lda; a2 += 2 * lda; b += 4; @@ -333,7 +333,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data01 = *(a1 + 0); *(b + 0) = data01; } - + a1 += 1 * lda; b += 1; diff --git a/kernel/generic/trsm_ltcopy_8.c b/kernel/generic/trsm_ltcopy_8.c index 0925dccd5..9d64e263c 100644 --- a/kernel/generic/trsm_ltcopy_8.c +++ b/kernel/generic/trsm_ltcopy_8.c @@ -326,7 +326,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 62) = data63; *(b + 63) = data64; } - + a1 += 8 * lda; a2 += 8 * lda; a3 += 8 * lda; @@ -484,7 +484,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 30) = data31; *(b + 31) = data32; } - + a1 += 4 * lda; a2 += 4 * lda; a3 += 4 * lda; @@ -572,7 +572,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 14) = data15; *(b + 15) = data16; } - + a1 += 2 * lda; a2 += 2 * lda; b += 16; @@ -720,7 +720,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 14) = data27; *(b + 15) = data28; } - + a1 += 4 * lda; a2 += 4 * lda; a3 += 4 * lda; @@ -777,7 +777,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 6) = data11; *(b + 7) = data12; } - + a1 += 2 * lda; a2 += 2 * lda; b += 8; @@ -854,7 +854,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 2) = data09; *(b + 3) = data10; } - + a1 += 2 * lda; a2 += 2 * lda; b += 4; @@ -907,7 +907,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data01 = *(a1 + 0); *(b + 0) = data01; } - + a1 += lda; b += 1; diff --git a/kernel/generic/trsm_uncopy_1.c b/kernel/generic/trsm_uncopy_1.c index 3a258609e..ee06c4eac 100644 --- a/kernel/generic/trsm_uncopy_1.c +++ b/kernel/generic/trsm_uncopy_1.c @@ -74,7 +74,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT } if (ii < jj) *(b + 0) = *(a1 + 0); - + a1 ++; b ++; i --; diff --git a/kernel/generic/trsm_uncopy_16.c b/kernel/generic/trsm_uncopy_16.c index e2b8ce49c..b0480ce06 100644 --- a/kernel/generic/trsm_uncopy_16.c +++ b/kernel/generic/trsm_uncopy_16.c @@ -51,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; FLOAT *a9, *a10, *a11, *a12, *a13, *a14, *a15, *a16; - + jj = offset; j = (n >> 4); @@ -78,14 +78,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 16)) { *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); for (k = ii - jj + 1; k < 16; k ++) { *(b + k) = *(a1 + k * lda); } } - + if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a2 + 0); @@ -143,14 +143,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 8)) { *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); for (k = ii - jj + 1; k < 8; k ++) { *(b + k) = *(a1 + k * lda); } } - + if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a2 + 0); @@ -187,14 +187,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 4)) { *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); for (k = ii - jj + 1; k < 4; k ++) { *(b + k) = *(a1 + k * lda); } } - + if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a2 + 0); @@ -221,14 +221,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 2)) { *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); for (k = ii - jj + 1; k < 2; k ++) { *(b + k) = *(a1 + k * lda); } } - + if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a2 + 0); @@ -249,14 +249,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 1)) { *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); for (k = ii - jj + 1; k < 1; k ++) { *(b + k) = *(a1 + k * lda); } } - + if (ii - jj < 0) { *(b + 0) = *(a1 + 0); } diff --git a/kernel/generic/trsm_uncopy_2.c b/kernel/generic/trsm_uncopy_2.c index f7f3435f9..6c257eeb1 100644 --- a/kernel/generic/trsm_uncopy_2.c +++ b/kernel/generic/trsm_uncopy_2.c @@ -94,7 +94,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 2) = data02; *(b + 3) = data04; } - + a1 += 2; a2 += 2; b += 4; @@ -148,7 +148,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data01 = *(a1 + 0); *(b + 0) = data01; } - + a1+= 1; b += 1; i --; diff --git a/kernel/generic/trsm_uncopy_4.c b/kernel/generic/trsm_uncopy_4.c index 837a25019..a1bb1e203 100644 --- a/kernel/generic/trsm_uncopy_4.c +++ b/kernel/generic/trsm_uncopy_4.c @@ -146,7 +146,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 14) = data12; *(b + 15) = data16; } - + a1 += 4; a2 += 4; a3 += 4; @@ -160,7 +160,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT if ((m & 2) != 0) { if (ii== jj) { - + #ifndef UNIT data01 = *(a1 + 0); #endif @@ -205,7 +205,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 6) = data07; *(b + 7) = data08; } - + a1 += 2; a2 += 2; b += 8; @@ -284,7 +284,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 2) = data02; *(b + 3) = data04; } - + a1 += 2; a2 += 2; b += 4; @@ -338,7 +338,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data01 = *(a1 + 0); *(b + 0) = data01; } - + a1+= 1; b += 1; i --; diff --git a/kernel/generic/trsm_uncopy_6.c b/kernel/generic/trsm_uncopy_6.c index 837a25019..a1bb1e203 100644 --- a/kernel/generic/trsm_uncopy_6.c +++ b/kernel/generic/trsm_uncopy_6.c @@ -146,7 +146,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 14) = data12; *(b + 15) = data16; } - + a1 += 4; a2 += 4; a3 += 4; @@ -160,7 +160,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT if ((m & 2) != 0) { if (ii== jj) { - + #ifndef UNIT data01 = *(a1 + 0); #endif @@ -205,7 +205,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 6) = data07; *(b + 7) = data08; } - + a1 += 2; a2 += 2; b += 8; @@ -284,7 +284,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 2) = data02; *(b + 3) = data04; } - + a1 += 2; a2 += 2; b += 4; @@ -338,7 +338,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data01 = *(a1 + 0); *(b + 0) = data01; } - + a1+= 1; b += 1; i --; diff --git a/kernel/generic/trsm_uncopy_8.c b/kernel/generic/trsm_uncopy_8.c index 8c5623dff..40903d44f 100644 --- a/kernel/generic/trsm_uncopy_8.c +++ b/kernel/generic/trsm_uncopy_8.c @@ -266,7 +266,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 5) = data41; *(b + 6) = data49; *(b + 7) = data57; - + *(b + 8) = data02; *(b + 9) = data10; *(b + 10) = data18; @@ -275,7 +275,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 13) = data42; *(b + 14) = data50; *(b + 15) = data58; - + *(b + 16) = data03; *(b + 17) = data11; *(b + 18) = data19; @@ -284,7 +284,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 21) = data43; *(b + 22) = data51; *(b + 23) = data59; - + *(b + 24) = data04; *(b + 25) = data12; *(b + 26) = data20; @@ -293,7 +293,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 29) = data44; *(b + 30) = data52; *(b + 31) = data60; - + *(b + 32) = data05; *(b + 33) = data13; *(b + 34) = data21; @@ -311,7 +311,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 45) = data46; *(b + 46) = data54; *(b + 47) = data62; - + *(b + 48) = data07; *(b + 49) = data15; *(b + 50) = data23; @@ -330,7 +330,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 62) = data56; *(b + 63) = data64; } - + a1 += 8; a2 += 8; a3 += 8; @@ -467,7 +467,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 5) = data41; *(b + 6) = data49; *(b + 7) = data57; - + *(b + 8) = data02; *(b + 9) = data10; *(b + 10) = data18; @@ -476,7 +476,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 13) = data42; *(b + 14) = data50; *(b + 15) = data58; - + *(b + 16) = data03; *(b + 17) = data11; *(b + 18) = data19; @@ -485,7 +485,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 21) = data43; *(b + 22) = data51; *(b + 23) = data59; - + *(b + 24) = data04; *(b + 25) = data12; *(b + 26) = data20; @@ -495,7 +495,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 30) = data52; *(b + 31) = data60; } - + a1 += 4; a2 += 4; a3 += 4; @@ -579,7 +579,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 5) = data41; *(b + 6) = data49; *(b + 7) = data57; - + *(b + 8) = data02; *(b + 9) = data10; *(b + 10) = data18; @@ -589,7 +589,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 14) = data50; *(b + 15) = data58; } - + a1 += 2; a2 += 2; a3 += 2; @@ -732,7 +732,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 5) = data10; *(b + 6) = data18; *(b + 7) = data26; - + *(b + 8) = data03; *(b + 9) = data11; *(b + 10) = data19; @@ -742,7 +742,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 14) = data20; *(b + 15) = data28; } - + a1 += 4; a2 += 4; a3 += 4; @@ -798,7 +798,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 6) = data18; *(b + 7) = data26; } - + a1 += 2; a2 += 2; a3 += 2; @@ -879,7 +879,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 2) = data02; *(b + 3) = data10; } - + a1 += 2; a2 += 2; b += 4; @@ -934,7 +934,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data01 = *(a1 + 0); *(b + 0) = data01; } - + a1 += 1; b += 1; i --; diff --git a/kernel/generic/trsm_utcopy_1.c b/kernel/generic/trsm_utcopy_1.c index ea490d531..cad8180e7 100644 --- a/kernel/generic/trsm_utcopy_1.c +++ b/kernel/generic/trsm_utcopy_1.c @@ -73,7 +73,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT } if (ii > jj) *(b + 0) = *(a1 + 0); - + a1 += lda; b ++; i --; diff --git a/kernel/generic/trsm_utcopy_16.c b/kernel/generic/trsm_utcopy_16.c index 546641242..741fcde72 100644 --- a/kernel/generic/trsm_utcopy_16.c +++ b/kernel/generic/trsm_utcopy_16.c @@ -61,7 +61,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 16)) { for (k = 0; k < ii - jj; k ++) { *(b + k) = *(a1 + k); @@ -69,7 +69,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + ii - jj) = INV(*(a1 + ii - jj)); } - + if (ii - jj >= 16) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); @@ -105,7 +105,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 8)) { for (k = 0; k < ii - jj; k ++) { *(b + k) = *(a1 + k); @@ -113,7 +113,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + ii - jj) = INV(*(a1 + ii - jj)); } - + if (ii - jj >= 8) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); @@ -141,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 4)) { for (k = 0; k < ii - jj; k ++) { *(b + k) = *(a1 + k); @@ -149,7 +149,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + ii - jj) = INV(*(a1 + ii - jj)); } - + if (ii - jj >= 4) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); @@ -173,7 +173,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 2)) { for (k = 0; k < ii - jj; k ++) { *(b + k) = *(a1 + k); @@ -181,7 +181,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + ii - jj) = INV(*(a1 + ii - jj)); } - + if (ii - jj >= 2) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); @@ -202,7 +202,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 1)) { for (k = 0; k < ii - jj; k ++) { *(b + k) = *(a1 + k); @@ -210,7 +210,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + ii - jj) = INV(*(a1 + ii - jj)); } - + if (ii - jj >= 1) { *(b + 0) = *(a1 + 0); } diff --git a/kernel/generic/trsm_utcopy_2.c b/kernel/generic/trsm_utcopy_2.c index 3def611eb..bdd5416ca 100644 --- a/kernel/generic/trsm_utcopy_2.c +++ b/kernel/generic/trsm_utcopy_2.c @@ -91,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 2) = data03; *(b + 3) = data04; } - + a1 += 2 * lda; a2 += 2 * lda; b += 4; @@ -142,7 +142,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data01 = *(a1 + 0); *(b + 0) = data01; } - + a1 += 1 * lda; b += 1; diff --git a/kernel/generic/trsm_utcopy_4.c b/kernel/generic/trsm_utcopy_4.c index bbba78d53..f83617224 100644 --- a/kernel/generic/trsm_utcopy_4.c +++ b/kernel/generic/trsm_utcopy_4.c @@ -144,7 +144,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 14) = data15; *(b + 15) = data16; } - + a1 += 4 * lda; a2 += 4 * lda; a3 += 4 * lda; @@ -192,7 +192,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 6) = data07; *(b + 7) = data08; } - + a1 += 2 * lda; a2 += 2 * lda; b += 8; @@ -261,7 +261,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 2) = data03; *(b + 3) = data04; } - + a1 += 2 * lda; a2 += 2 * lda; b += 4; @@ -309,7 +309,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data01 = *(a1 + 0); *(b + 0) = data01; } - + a1 += 1 * lda; b += 1; diff --git a/kernel/generic/trsm_utcopy_6.c b/kernel/generic/trsm_utcopy_6.c index bbba78d53..f83617224 100644 --- a/kernel/generic/trsm_utcopy_6.c +++ b/kernel/generic/trsm_utcopy_6.c @@ -144,7 +144,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 14) = data15; *(b + 15) = data16; } - + a1 += 4 * lda; a2 += 4 * lda; a3 += 4 * lda; @@ -192,7 +192,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 6) = data07; *(b + 7) = data08; } - + a1 += 2 * lda; a2 += 2 * lda; b += 8; @@ -261,7 +261,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 2) = data03; *(b + 3) = data04; } - + a1 += 2 * lda; a2 += 2 * lda; b += 4; @@ -309,7 +309,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data01 = *(a1 + 0); *(b + 0) = data01; } - + a1 += 1 * lda; b += 1; diff --git a/kernel/generic/trsm_utcopy_8.c b/kernel/generic/trsm_utcopy_8.c index 531ac59e4..97da66f87 100644 --- a/kernel/generic/trsm_utcopy_8.c +++ b/kernel/generic/trsm_utcopy_8.c @@ -325,7 +325,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 62) = data63; *(b + 63) = data64; } - + a1 += 8 * lda; a2 += 8 * lda; a3 += 8 * lda; @@ -450,7 +450,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 30) = data31; *(b + 31) = data32; } - + a1 += 4 * lda; a2 += 4 * lda; a3 += 4 * lda; @@ -511,7 +511,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 14) = data15; *(b + 15) = data16; } - + a1 += 2 * lda; a2 += 2 * lda; b += 16; @@ -637,7 +637,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 14) = data27; *(b + 15) = data28; } - + a1 += 4 * lda; a2 += 4 * lda; a3 += 4 * lda; @@ -678,7 +678,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 6) = data11; *(b + 7) = data12; } - + a1 += 2 * lda; a2 += 2 * lda; b += 8; @@ -744,7 +744,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 2) = data09; *(b + 3) = data10; } - + a1 += 2 * lda; a2 += 2 * lda; b += 4; @@ -791,7 +791,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT data01 = *(a1 + 0); *(b + 0) = data01; } - + a1 += lda; b += 1; i --; diff --git a/kernel/generic/zgemm3m_ncopy_1.c b/kernel/generic/zgemm3m_ncopy_1.c index 7ac734b4c..0aa947039 100644 --- a/kernel/generic/zgemm3m_ncopy_1.c +++ b/kernel/generic/zgemm3m_ncopy_1.c @@ -57,7 +57,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, #ifdef USE_ALPHA - FLOAT alpha_r, FLOAT alpha_i, + FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ @@ -70,16 +70,16 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, while (n > 0) { a_offset = a; a += lda; - + for (i = 0; i < m; i ++) { - + a1 = *(a_offset + 0); a2 = *(a_offset + 1); - + *(b + 0) = CMULT(a1, a2); - + a_offset += 2; - + b ++; } n --; diff --git a/kernel/generic/zgemm3m_ncopy_2.c b/kernel/generic/zgemm3m_ncopy_2.c index 702524a4e..dd5a732f2 100644 --- a/kernel/generic/zgemm3m_ncopy_2.c +++ b/kernel/generic/zgemm3m_ncopy_2.c @@ -57,7 +57,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, #ifdef USE_ALPHA - FLOAT alpha_r, FLOAT alpha_i, + FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ @@ -71,14 +71,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a_offset = a; b_offset = b; - + j = (n >> 1); if (j > 0){ do{ a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset += 2 * lda; - + for (i = 0; i < m; i ++) { a1 = *(a_offset1 + 0); @@ -91,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a_offset1 += 2; a_offset2 += 2; - + b_offset += 2; } @@ -99,19 +99,19 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, j--; }while(j > 0); } /* end of if(j > 0) */ - + if (n & 1) { a_offset1 = a_offset; - + for (i = 0; i < m; i ++) { - + a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); - + *(b_offset + 0) = CMULT(a1, a2); - + a_offset1 += 2; - + b_offset += 1; } } diff --git a/kernel/generic/zgemm3m_ncopy_4.c b/kernel/generic/zgemm3m_ncopy_4.c index 1117d77bf..b4d23e236 100644 --- a/kernel/generic/zgemm3m_ncopy_4.c +++ b/kernel/generic/zgemm3m_ncopy_4.c @@ -57,7 +57,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, #ifdef USE_ALPHA - FLOAT alpha_r, FLOAT alpha_i, + FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ @@ -71,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a_offset = a; b_offset = b; - + j = (n >> 2); if (j > 0){ do{ @@ -80,7 +80,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a_offset3 = a_offset2 + lda; a_offset4 = a_offset3 + lda; a_offset += 4 * lda; - + for (i = 0; i < m; i ++) { a1 = *(a_offset1 + 0); @@ -101,7 +101,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a_offset2 += 2; a_offset3 += 2; a_offset4 += 2; - + b_offset += 4; } @@ -109,12 +109,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, j--; }while(j > 0); } /* end of if(j > 0) */ - + if (n & 2) { a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset += 2 * lda; - + for (i = 0; i < m; i ++) { a1 = *(a_offset1 + 0); @@ -127,7 +127,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a_offset1 += 2; a_offset2 += 2; - + b_offset += 2; } @@ -135,16 +135,16 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, if (n & 1) { a_offset1 = a_offset; - + for (i = 0; i < m; i ++) { - + a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); - + *(b_offset + 0) = CMULT(a1, a2); - + a_offset1 += 2; - + b_offset += 1; } } diff --git a/kernel/generic/zgemm3m_ncopy_8.c b/kernel/generic/zgemm3m_ncopy_8.c index 0c3cb5d76..d3e5da8fa 100644 --- a/kernel/generic/zgemm3m_ncopy_8.c +++ b/kernel/generic/zgemm3m_ncopy_8.c @@ -57,7 +57,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, #ifdef USE_ALPHA - FLOAT alpha_r, FLOAT alpha_i, + FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ @@ -89,7 +89,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a_offset = a; b_offset = b; - + j = (n >> 3); if (j > 0){ do{ @@ -102,7 +102,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a_offset7 = a_offset6 + lda; a_offset8 = a_offset7 + lda; a_offset += 8 * lda; - + for (i = 0; i < m; i ++) { a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); @@ -129,7 +129,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, *(b_offset + 5) = CMULT(a11, a12); *(b_offset + 6) = CMULT(a13, a14); *(b_offset + 7) = CMULT(a15, a16); - + a_offset1 += 2; a_offset2 += 2; a_offset3 += 2; @@ -138,21 +138,21 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a_offset6 += 2; a_offset7 += 2; a_offset8 += 2; - + b_offset += 8; } j--; }while(j > 0); } - + if (n & 4){ a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset3 = a_offset2 + lda; a_offset4 = a_offset3 + lda; a_offset += 4 * lda; - + for (i = 0; i < m; i ++) { a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); @@ -162,17 +162,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a6 = *(a_offset3 + 1); a7 = *(a_offset4 + 0); a8 = *(a_offset4 + 1); - + *(b_offset + 0) = CMULT(a1, a2); *(b_offset + 1) = CMULT(a3, a4); *(b_offset + 2) = CMULT(a5, a6); *(b_offset + 3) = CMULT(a7, a8); - + a_offset1 += 2; a_offset2 += 2; a_offset3 += 2; a_offset4 += 2; - + b_offset += 4; } } @@ -181,30 +181,30 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset += 2 * lda; - + for (i = 0; i < m; i ++) { a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset2 + 0); a4 = *(a_offset2 + 1); - + *(b_offset + 0) = CMULT(a1, a2); *(b_offset + 1) = CMULT(a3, a4); - + a_offset1 += 2; a_offset2 += 2; - + b_offset += 2; } } if (n & 1){ a_offset1 = a_offset; - + for (i = 0; i < m; i ++) { a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); - + *(b_offset + 0) = CMULT(a1, a2); a_offset1 += 2; diff --git a/kernel/generic/zgemm3m_tcopy_1.c b/kernel/generic/zgemm3m_tcopy_1.c index 47cf7e58d..33e8ad6fb 100644 --- a/kernel/generic/zgemm3m_tcopy_1.c +++ b/kernel/generic/zgemm3m_tcopy_1.c @@ -57,7 +57,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, #ifdef USE_ALPHA - FLOAT alpha_r, FLOAT alpha_i, + FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ @@ -70,20 +70,20 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, while (n > 0) { a_offset = a; a += 2; - + for (i = 0; i < m; i ++) { - + a1 = *(a_offset + 0); a2 = *(a_offset + 1); - + *(b + 0) = CMULT(a1, a2); - + a_offset += lda; - + b ++; } n --; } - + return 0; } diff --git a/kernel/generic/zgemm3m_tcopy_2.c b/kernel/generic/zgemm3m_tcopy_2.c index f6fe10be3..b8a2626ef 100644 --- a/kernel/generic/zgemm3m_tcopy_2.c +++ b/kernel/generic/zgemm3m_tcopy_2.c @@ -57,7 +57,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, #ifdef USE_ALPHA - FLOAT alpha_r, FLOAT alpha_i, + FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ @@ -80,7 +80,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset += 2 * lda; - + b_offset1 = b_offset; b_offset += 4; @@ -104,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a_offset1 += 4; a_offset2 += 4; - + b_offset1 += m * 2; i --; }while(i > 0); @@ -119,7 +119,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, *(b_offset2 + 0) = CMULT(a1, a2); *(b_offset2 + 1) = CMULT(a3, a4); - + b_offset2 += 2; } @@ -130,7 +130,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, if (m & 1){ a_offset1 = a_offset; b_offset1 = b_offset; - + i = (n >> 1); if (i > 0){ do{ @@ -138,10 +138,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a2 = *(a_offset1 + 1); a3 = *(a_offset1 + 2); a4 = *(a_offset1 + 3); - + *(b_offset1 + 0) = CMULT(a1, a2); *(b_offset1 + 1) = CMULT(a3, a4); - + a_offset1 += 4; b_offset1 += 2 * m; @@ -153,10 +153,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, if (n & 1) { a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); - + *(b_offset2 + 0) = CMULT(a1, a2); } } - + return 0; } diff --git a/kernel/generic/zgemm3m_tcopy_4.c b/kernel/generic/zgemm3m_tcopy_4.c index e0722627e..2c071ff91 100644 --- a/kernel/generic/zgemm3m_tcopy_4.c +++ b/kernel/generic/zgemm3m_tcopy_4.c @@ -57,7 +57,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, #ifdef USE_ALPHA - FLOAT alpha_r, FLOAT alpha_i, + FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ @@ -83,7 +83,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a_offset3 = a_offset2 + lda; a_offset4 = a_offset3 + lda; a_offset += 4 * lda; - + b_offset1 = b_offset; b_offset += 16; @@ -151,7 +151,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a_offset2 += 8; a_offset3 += 8; a_offset4 += 8; - + b_offset1 += m * 4; i --; }while(i > 0); @@ -167,12 +167,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a6 = *(a_offset2 + 1); a7 = *(a_offset2 + 2); a8 = *(a_offset2 + 3); - + *(b_offset2 + 0) = CMULT(a1, a2); *(b_offset2 + 1) = CMULT(a3, a4); *(b_offset2 + 2) = CMULT(a5, a6); *(b_offset2 + 3) = CMULT(a7, a8); - + a1 = *(a_offset3 + 0); a2 = *(a_offset3 + 1); a3 = *(a_offset3 + 2); @@ -181,17 +181,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a6 = *(a_offset4 + 1); a7 = *(a_offset4 + 2); a8 = *(a_offset4 + 3); - + *(b_offset2 + 4) = CMULT(a1, a2); *(b_offset2 + 5) = CMULT(a3, a4); *(b_offset2 + 6) = CMULT(a5, a6); *(b_offset2 + 7) = CMULT(a7, a8); - + a_offset1 += 4; a_offset2 += 4; a_offset3 += 4; a_offset4 += 4; - + b_offset2 += 8; } @@ -210,7 +210,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, *(b_offset3 + 1) = CMULT(a3, a4); *(b_offset3 + 2) = CMULT(a5, a6); *(b_offset3 + 3) = CMULT(a7, a8); - + b_offset3 += 4; } @@ -222,10 +222,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset += 2 * lda; - + b_offset1 = b_offset; b_offset += 8; - + i = (n >> 2); if (i > 0){ do{ @@ -238,12 +238,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a6 = *(a_offset1 + 5); a7 = *(a_offset1 + 6); a8 = *(a_offset1 + 7); - + *(b_offset1 + 0) = CMULT(a1, a2); *(b_offset1 + 1) = CMULT(a3, a4); *(b_offset1 + 2) = CMULT(a5, a6); *(b_offset1 + 3) = CMULT(a7, a8); - + a1 = *(a_offset2 + 0); a2 = *(a_offset2 + 1); a3 = *(a_offset2 + 2); @@ -252,15 +252,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a6 = *(a_offset2 + 5); a7 = *(a_offset2 + 6); a8 = *(a_offset2 + 7); - + *(b_offset1 + 4) = CMULT(a1, a2); *(b_offset1 + 5) = CMULT(a3, a4); *(b_offset1 + 6) = CMULT(a5, a6); *(b_offset1 + 7) = CMULT(a7, a8); - + a_offset1 += 8; a_offset2 += 8; - + b_offset1 += m * 4; i --; }while(i > 0); @@ -275,23 +275,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a6 = *(a_offset2 + 1); a7 = *(a_offset2 + 2); a8 = *(a_offset2 + 3); - + *(b_offset2 + 0) = CMULT(a1, a2); *(b_offset2 + 1) = CMULT(a3, a4); *(b_offset2 + 2) = CMULT(a5, a6); *(b_offset2 + 3) = CMULT(a7, a8); - + a_offset1 += 4; a_offset2 += 4; b_offset2 += 4; } - + if (n & 1) { a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); a3 = *(a_offset2 + 0); a4 = *(a_offset2 + 1); - + *(b_offset3 + 0) = CMULT(a1, a2); *(b_offset3 + 1) = CMULT(a3, a4); @@ -302,7 +302,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, if (m & 1){ a_offset1 = a_offset; b_offset1 = b_offset; - + i = (n >> 2); if (i > 0){ do{ @@ -314,12 +314,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a6 = *(a_offset1 + 5); a7 = *(a_offset1 + 6); a8 = *(a_offset1 + 7); - + *(b_offset1 + 0) = CMULT(a1, a2); *(b_offset1 + 1) = CMULT(a3, a4); *(b_offset1 + 2) = CMULT(a5, a6); *(b_offset1 + 3) = CMULT(a7, a8); - + a_offset1 += 8; b_offset1 += 4 * m; @@ -333,17 +333,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a2 = *(a_offset1 + 1); a3 = *(a_offset1 + 2); a4 = *(a_offset1 + 3); - + *(b_offset2 + 0) = CMULT(a1, a2); *(b_offset2 + 1) = CMULT(a3, a4); - + a_offset1 += 4; } - + if (n & 1) { a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); - + *(b_offset3 + 0) = CMULT(a1, a2); } } diff --git a/kernel/generic/zgemm3m_tcopy_8.c b/kernel/generic/zgemm3m_tcopy_8.c index e68bccfba..fddbdd8cc 100644 --- a/kernel/generic/zgemm3m_tcopy_8.c +++ b/kernel/generic/zgemm3m_tcopy_8.c @@ -57,7 +57,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, #ifdef USE_ALPHA - FLOAT alpha_r, FLOAT alpha_i, + FLOAT alpha_r, FLOAT alpha_i, #endif FLOAT *b){ @@ -107,7 +107,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a_offset8 = a_offset7 + lda; a_offset += 8 * lda; - + b_offset1 = b_offset; b_offset += 64; @@ -130,7 +130,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a14 = *(a_offset1 + 13); a15 = *(a_offset1 + 14); a16 = *(a_offset1 + 15); - + *(b_offset1 + 0) = CMULT(a1, a2); *(b_offset1 + 1) = CMULT(a3, a4); *(b_offset1 + 2) = CMULT(a5, a6); @@ -156,7 +156,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a14 = *(a_offset2 + 13); a15 = *(a_offset2 + 14); a16 = *(a_offset2 + 15); - + *(b_offset1 + 8) = CMULT(a1, a2); *(b_offset1 + 9) = CMULT(a3, a4); *(b_offset1 + 10) = CMULT(a5, a6); @@ -182,7 +182,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a14 = *(a_offset3 + 13); a15 = *(a_offset3 + 14); a16 = *(a_offset3 + 15); - + *(b_offset1 + 16) = CMULT(a1, a2); *(b_offset1 + 17) = CMULT(a3, a4); *(b_offset1 + 18) = CMULT(a5, a6); @@ -208,7 +208,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a14 = *(a_offset4 + 13); a15 = *(a_offset4 + 14); a16 = *(a_offset4 + 15); - + *(b_offset1 + 24) = CMULT(a1, a2); *(b_offset1 + 25) = CMULT(a3, a4); *(b_offset1 + 26) = CMULT(a5, a6); @@ -234,7 +234,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a14 = *(a_offset5 + 13); a15 = *(a_offset5 + 14); a16 = *(a_offset5 + 15); - + *(b_offset1 + 32) = CMULT(a1, a2); *(b_offset1 + 33) = CMULT(a3, a4); *(b_offset1 + 34) = CMULT(a5, a6); @@ -260,7 +260,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a14 = *(a_offset6 + 13); a15 = *(a_offset6 + 14); a16 = *(a_offset6 + 15); - + *(b_offset1 + 40) = CMULT(a1, a2); *(b_offset1 + 41) = CMULT(a3, a4); *(b_offset1 + 42) = CMULT(a5, a6); @@ -286,7 +286,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a14 = *(a_offset7 + 13); a15 = *(a_offset7 + 14); a16 = *(a_offset7 + 15); - + *(b_offset1 + 48) = CMULT(a1, a2); *(b_offset1 + 49) = CMULT(a3, a4); *(b_offset1 + 50) = CMULT(a5, a6); @@ -312,7 +312,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a14 = *(a_offset8 + 13); a15 = *(a_offset8 + 14); a16 = *(a_offset8 + 15); - + *(b_offset1 + 56) = CMULT(a1, a2); *(b_offset1 + 57) = CMULT(a3, a4); *(b_offset1 + 58) = CMULT(a5, a6); @@ -330,7 +330,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a_offset6 += 16; a_offset7 += 16; a_offset8 += 16; - + b_offset1 += m * 8; i --; }while(i > 0); @@ -345,12 +345,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a6 = *(a_offset1 + 5); a7 = *(a_offset1 + 6); a8 = *(a_offset1 + 7); - + *(b_offset2 + 0) = CMULT(a1, a2); *(b_offset2 + 1) = CMULT(a3, a4); *(b_offset2 + 2) = CMULT(a5, a6); *(b_offset2 + 3) = CMULT(a7, a8); - + a1 = *(a_offset2 + 0); a2 = *(a_offset2 + 1); a3 = *(a_offset2 + 2); @@ -359,7 +359,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a6 = *(a_offset2 + 5); a7 = *(a_offset2 + 6); a8 = *(a_offset2 + 7); - + *(b_offset2 + 4) = CMULT(a1, a2); *(b_offset2 + 5) = CMULT(a3, a4); *(b_offset2 + 6) = CMULT(a5, a6); @@ -373,12 +373,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a6 = *(a_offset3 + 5); a7 = *(a_offset3 + 6); a8 = *(a_offset3 + 7); - + *(b_offset2 + 8) = CMULT(a1, a2); *(b_offset2 + 9) = CMULT(a3, a4); *(b_offset2 + 10) = CMULT(a5, a6); *(b_offset2 + 11) = CMULT(a7, a8); - + a1 = *(a_offset4 + 0); a2 = *(a_offset4 + 1); a3 = *(a_offset4 + 2); @@ -387,12 +387,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a6 = *(a_offset4 + 5); a7 = *(a_offset4 + 6); a8 = *(a_offset4 + 7); - + *(b_offset2 + 12) = CMULT(a1, a2); *(b_offset2 + 13) = CMULT(a3, a4); *(b_offset2 + 14) = CMULT(a5, a6); *(b_offset2 + 15) = CMULT(a7, a8); - + a1 = *(a_offset5 + 0); a2 = *(a_offset5 + 1); a3 = *(a_offset5 + 2); @@ -401,12 +401,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a6 = *(a_offset5 + 5); a7 = *(a_offset5 + 6); a8 = *(a_offset5 + 7); - + *(b_offset2 + 16) = CMULT(a1, a2); *(b_offset2 + 17) = CMULT(a3, a4); *(b_offset2 + 18) = CMULT(a5, a6); *(b_offset2 + 19) = CMULT(a7, a8); - + a1 = *(a_offset6 + 0); a2 = *(a_offset6 + 1); a3 = *(a_offset6 + 2); @@ -415,12 +415,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a6 = *(a_offset6 + 5); a7 = *(a_offset6 + 6); a8 = *(a_offset6 + 7); - + *(b_offset2 + 20) = CMULT(a1, a2); *(b_offset2 + 21) = CMULT(a3, a4); *(b_offset2 + 22) = CMULT(a5, a6); *(b_offset2 + 23) = CMULT(a7, a8); - + a1 = *(a_offset7 + 0); a2 = *(a_offset7 + 1); a3 = *(a_offset7 + 2); @@ -429,12 +429,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a6 = *(a_offset7 + 5); a7 = *(a_offset7 + 6); a8 = *(a_offset7 + 7); - + *(b_offset2 + 24) = CMULT(a1, a2); *(b_offset2 + 25) = CMULT(a3, a4); *(b_offset2 + 26) = CMULT(a5, a6); *(b_offset2 + 27) = CMULT(a7, a8); - + a1 = *(a_offset8 + 0); a2 = *(a_offset8 + 1); a3 = *(a_offset8 + 2); @@ -443,12 +443,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a6 = *(a_offset8 + 5); a7 = *(a_offset8 + 6); a8 = *(a_offset8 + 7); - + *(b_offset2 + 28) = CMULT(a1, a2); *(b_offset2 + 29) = CMULT(a3, a4); *(b_offset2 + 30) = CMULT(a5, a6); *(b_offset2 + 31) = CMULT(a7, a8); - + a_offset1 += 8; a_offset2 += 8; a_offset3 += 8; @@ -457,7 +457,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a_offset6 += 8; a_offset7 += 8; a_offset8 += 8; - + b_offset2 += 32; } @@ -466,15 +466,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a2 = *(a_offset1 + 1); a3 = *(a_offset1 + 2); a4 = *(a_offset1 + 3); - + *(b_offset3 + 0) = CMULT(a1, a2); *(b_offset3 + 1) = CMULT(a3, a4); - + a1 = *(a_offset2 + 0); a2 = *(a_offset2 + 1); a3 = *(a_offset2 + 2); a4 = *(a_offset2 + 3); - + *(b_offset3 + 2) = CMULT(a1, a2); *(b_offset3 + 3) = CMULT(a3, a4); @@ -482,50 +482,50 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a2 = *(a_offset3 + 1); a3 = *(a_offset3 + 2); a4 = *(a_offset3 + 3); - + *(b_offset3 + 4) = CMULT(a1, a2); *(b_offset3 + 5) = CMULT(a3, a4); - + a1 = *(a_offset4 + 0); a2 = *(a_offset4 + 1); a3 = *(a_offset4 + 2); a4 = *(a_offset4 + 3); - + *(b_offset3 + 6) = CMULT(a1, a2); *(b_offset3 + 7) = CMULT(a3, a4); - + a1 = *(a_offset5 + 0); a2 = *(a_offset5 + 1); a3 = *(a_offset5 + 2); a4 = *(a_offset5 + 3); - + *(b_offset3 + 8) = CMULT(a1, a2); *(b_offset3 + 9) = CMULT(a3, a4); - + a1 = *(a_offset6 + 0); a2 = *(a_offset6 + 1); a3 = *(a_offset6 + 2); a4 = *(a_offset6 + 3); - + *(b_offset3 + 10) = CMULT(a1, a2); *(b_offset3 + 11) = CMULT(a3, a4); - + a1 = *(a_offset7 + 0); a2 = *(a_offset7 + 1); a3 = *(a_offset7 + 2); a4 = *(a_offset7 + 3); - + *(b_offset3 + 12) = CMULT(a1, a2); *(b_offset3 + 13) = CMULT(a3, a4); - + a1 = *(a_offset8 + 0); a2 = *(a_offset8 + 1); a3 = *(a_offset8 + 2); a4 = *(a_offset8 + 3); - + *(b_offset3 + 14) = CMULT(a1, a2); *(b_offset3 + 15) = CMULT(a3, a4); - + a_offset1 += 4; a_offset2 += 4; a_offset3 += 4; @@ -534,49 +534,49 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a_offset6 += 4; a_offset7 += 4; a_offset8 += 4; - + b_offset3 += 16; } if (n & 1){ a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); - + *(b_offset4 + 0) = CMULT(a1, a2); - + a1 = *(a_offset2 + 0); a2 = *(a_offset2 + 1); - + *(b_offset4 + 1) = CMULT(a1, a2); a1 = *(a_offset3 + 0); a2 = *(a_offset3 + 1); - + *(b_offset4 + 2) = CMULT(a1, a2); - + a1 = *(a_offset4 + 0); a2 = *(a_offset4 + 1); - + *(b_offset4 + 3) = CMULT(a1, a2); - + a1 = *(a_offset5 + 0); a2 = *(a_offset5 + 1); - + *(b_offset4 + 4) = CMULT(a1, a2); - + a1 = *(a_offset6 + 0); a2 = *(a_offset6 + 1); - + *(b_offset4 + 5) = CMULT(a1, a2); - + a1 = *(a_offset7 + 0); a2 = *(a_offset7 + 1); - + *(b_offset4 + 6) = CMULT(a1, a2); - + a1 = *(a_offset8 + 0); a2 = *(a_offset8 + 1); - + *(b_offset4 + 7) = CMULT(a1, a2); b_offset4 += 8; @@ -592,7 +592,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a_offset3 = a_offset2 + lda; a_offset4 = a_offset3 + lda; a_offset += 4 * lda; - + b_offset1 = b_offset; b_offset += 32; @@ -615,7 +615,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a14 = *(a_offset1 + 13); a15 = *(a_offset1 + 14); a16 = *(a_offset1 + 15); - + *(b_offset1 + 0) = CMULT(a1, a2); *(b_offset1 + 1) = CMULT(a3, a4); *(b_offset1 + 2) = CMULT(a5, a6); @@ -641,7 +641,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a14 = *(a_offset2 + 13); a15 = *(a_offset2 + 14); a16 = *(a_offset2 + 15); - + *(b_offset1 + 8) = CMULT(a1, a2); *(b_offset1 + 9) = CMULT(a3, a4); *(b_offset1 + 10) = CMULT(a5, a6); @@ -667,7 +667,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a14 = *(a_offset3 + 13); a15 = *(a_offset3 + 14); a16 = *(a_offset3 + 15); - + *(b_offset1 + 16) = CMULT(a1, a2); *(b_offset1 + 17) = CMULT(a3, a4); *(b_offset1 + 18) = CMULT(a5, a6); @@ -693,7 +693,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a14 = *(a_offset4 + 13); a15 = *(a_offset4 + 14); a16 = *(a_offset4 + 15); - + *(b_offset1 + 24) = CMULT(a1, a2); *(b_offset1 + 25) = CMULT(a3, a4); *(b_offset1 + 26) = CMULT(a5, a6); @@ -707,7 +707,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a_offset2 += 16; a_offset3 += 16; a_offset4 += 16; - + b_offset1 += m * 8; i --; }while(i > 0); @@ -722,12 +722,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a6 = *(a_offset1 + 5); a7 = *(a_offset1 + 6); a8 = *(a_offset1 + 7); - + *(b_offset2 + 0) = CMULT(a1, a2); *(b_offset2 + 1) = CMULT(a3, a4); *(b_offset2 + 2) = CMULT(a5, a6); *(b_offset2 + 3) = CMULT(a7, a8); - + a1 = *(a_offset2 + 0); a2 = *(a_offset2 + 1); a3 = *(a_offset2 + 2); @@ -736,7 +736,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a6 = *(a_offset2 + 5); a7 = *(a_offset2 + 6); a8 = *(a_offset2 + 7); - + *(b_offset2 + 4) = CMULT(a1, a2); *(b_offset2 + 5) = CMULT(a3, a4); *(b_offset2 + 6) = CMULT(a5, a6); @@ -750,12 +750,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a6 = *(a_offset3 + 5); a7 = *(a_offset3 + 6); a8 = *(a_offset3 + 7); - + *(b_offset2 + 8) = CMULT(a1, a2); *(b_offset2 + 9) = CMULT(a3, a4); *(b_offset2 + 10) = CMULT(a5, a6); *(b_offset2 + 11) = CMULT(a7, a8); - + a1 = *(a_offset4 + 0); a2 = *(a_offset4 + 1); a3 = *(a_offset4 + 2); @@ -764,17 +764,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a6 = *(a_offset4 + 5); a7 = *(a_offset4 + 6); a8 = *(a_offset4 + 7); - + *(b_offset2 + 12) = CMULT(a1, a2); *(b_offset2 + 13) = CMULT(a3, a4); *(b_offset2 + 14) = CMULT(a5, a6); *(b_offset2 + 15) = CMULT(a7, a8); - + a_offset1 += 8; a_offset2 += 8; a_offset3 += 8; a_offset4 += 8; - + b_offset2 += 16; } @@ -783,15 +783,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a2 = *(a_offset1 + 1); a3 = *(a_offset1 + 2); a4 = *(a_offset1 + 3); - + *(b_offset3 + 0) = CMULT(a1, a2); *(b_offset3 + 1) = CMULT(a3, a4); - + a1 = *(a_offset2 + 0); a2 = *(a_offset2 + 1); a3 = *(a_offset2 + 2); a4 = *(a_offset2 + 3); - + *(b_offset3 + 2) = CMULT(a1, a2); *(b_offset3 + 3) = CMULT(a3, a4); @@ -799,45 +799,45 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a2 = *(a_offset3 + 1); a3 = *(a_offset3 + 2); a4 = *(a_offset3 + 3); - + *(b_offset3 + 4) = CMULT(a1, a2); *(b_offset3 + 5) = CMULT(a3, a4); - + a1 = *(a_offset4 + 0); a2 = *(a_offset4 + 1); a3 = *(a_offset4 + 2); a4 = *(a_offset4 + 3); - + *(b_offset3 + 6) = CMULT(a1, a2); *(b_offset3 + 7) = CMULT(a3, a4); - + a_offset1 += 4; a_offset2 += 4; a_offset3 += 4; a_offset4 += 4; - + b_offset3 += 8; } if (n & 1){ a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); - + *(b_offset4 + 0) = CMULT(a1, a2); - + a1 = *(a_offset2 + 0); a2 = *(a_offset2 + 1); - + *(b_offset4 + 1) = CMULT(a1, a2); a1 = *(a_offset3 + 0); a2 = *(a_offset3 + 1); - + *(b_offset4 + 2) = CMULT(a1, a2); - + a1 = *(a_offset4 + 0); a2 = *(a_offset4 + 1); - + *(b_offset4 + 3) = CMULT(a1, a2); b_offset4 += 4; @@ -848,7 +848,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a_offset1 = a_offset; a_offset2 = a_offset1 + lda; a_offset += 2 * lda; - + b_offset1 = b_offset; b_offset += 16; @@ -871,7 +871,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a14 = *(a_offset1 + 13); a15 = *(a_offset1 + 14); a16 = *(a_offset1 + 15); - + *(b_offset1 + 0) = CMULT(a1, a2); *(b_offset1 + 1) = CMULT(a3, a4); *(b_offset1 + 2) = CMULT(a5, a6); @@ -897,7 +897,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a14 = *(a_offset2 + 13); a15 = *(a_offset2 + 14); a16 = *(a_offset2 + 15); - + *(b_offset1 + 8) = CMULT(a1, a2); *(b_offset1 + 9) = CMULT(a3, a4); *(b_offset1 + 10) = CMULT(a5, a6); @@ -909,7 +909,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a_offset1 += 16; a_offset2 += 16; - + b_offset1 += m * 8; i --; }while(i > 0); @@ -924,12 +924,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a6 = *(a_offset1 + 5); a7 = *(a_offset1 + 6); a8 = *(a_offset1 + 7); - + *(b_offset2 + 0) = CMULT(a1, a2); *(b_offset2 + 1) = CMULT(a3, a4); *(b_offset2 + 2) = CMULT(a5, a6); *(b_offset2 + 3) = CMULT(a7, a8); - + a1 = *(a_offset2 + 0); a2 = *(a_offset2 + 1); a3 = *(a_offset2 + 2); @@ -938,7 +938,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a6 = *(a_offset2 + 5); a7 = *(a_offset2 + 6); a8 = *(a_offset2 + 7); - + *(b_offset2 + 4) = CMULT(a1, a2); *(b_offset2 + 5) = CMULT(a3, a4); *(b_offset2 + 6) = CMULT(a5, a6); @@ -946,7 +946,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a_offset1 += 8; a_offset2 += 8; - + b_offset2 += 8; } @@ -955,18 +955,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a2 = *(a_offset1 + 1); a3 = *(a_offset1 + 2); a4 = *(a_offset1 + 3); - + *(b_offset3 + 0) = CMULT(a1, a2); *(b_offset3 + 1) = CMULT(a3, a4); - + a1 = *(a_offset2 + 0); a2 = *(a_offset2 + 1); a3 = *(a_offset2 + 2); a4 = *(a_offset2 + 3); - + *(b_offset3 + 2) = CMULT(a1, a2); *(b_offset3 + 3) = CMULT(a3, a4); - + a_offset1 += 4; a_offset2 += 4; @@ -976,12 +976,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, if (n & 1){ a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); - + *(b_offset4 + 0) = CMULT(a1, a2); - + a1 = *(a_offset2 + 0); a2 = *(a_offset2 + 1); - + *(b_offset4 + 1) = CMULT(a1, a2); b_offset4 += 2; @@ -1011,7 +1011,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a14 = *(a_offset1 + 13); a15 = *(a_offset1 + 14); a16 = *(a_offset1 + 15); - + *(b_offset1 + 0) = CMULT(a1, a2); *(b_offset1 + 1) = CMULT(a3, a4); *(b_offset1 + 2) = CMULT(a5, a6); @@ -1022,7 +1022,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, *(b_offset1 + 7) = CMULT(a15, a16); a_offset1 += 16; - + b_offset1 += m * 8; i --; }while(i > 0); @@ -1037,7 +1037,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a6 = *(a_offset1 + 5); a7 = *(a_offset1 + 6); a8 = *(a_offset1 + 7); - + *(b_offset2 + 0) = CMULT(a1, a2); *(b_offset2 + 1) = CMULT(a3, a4); *(b_offset2 + 2) = CMULT(a5, a6); @@ -1052,7 +1052,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, a2 = *(a_offset1 + 1); a3 = *(a_offset1 + 2); a4 = *(a_offset1 + 3); - + *(b_offset3 + 0) = CMULT(a1, a2); *(b_offset3 + 1) = CMULT(a3, a4); @@ -1063,10 +1063,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, if (n & 1){ a1 = *(a_offset1 + 0); a2 = *(a_offset1 + 1); - + *(b_offset4 + 0) = CMULT(a1, a2); } } - + return 0; } diff --git a/kernel/generic/zgemm_beta.c b/kernel/generic/zgemm_beta.c index b7a77a27a..7954e22e3 100644 --- a/kernel/generic/zgemm_beta.c +++ b/kernel/generic/zgemm_beta.c @@ -41,7 +41,7 @@ #include #include "common.h" -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta_r, FLOAT beta_i, FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5, @@ -97,7 +97,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, do { c_offset1 = c_offset; c_offset += ldc; - + i = (m >> 1); if (i > 0){ do { @@ -105,12 +105,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, atemp2 = *(c_offset1 + 1); atemp3 = *(c_offset1 + 2); atemp4 = *(c_offset1 + 3); - + btemp1 = beta_r * atemp1; btemp2 = beta_i * atemp2; btemp3 = beta_r * atemp2; btemp4 = beta_i * atemp1; - + ctemp1 = btemp1 - btemp2; ctemp2 = btemp3 + btemp4; @@ -118,7 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, btemp2 = beta_i * atemp4; btemp3 = beta_r * atemp4; btemp4 = beta_i * atemp3; - + ctemp3 = btemp1 - btemp2; ctemp4 = btemp3 + btemp4; @@ -136,15 +136,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, do { atemp1 = *(c_offset1 + 0); atemp2 = *(c_offset1 + 1); - + btemp1 = beta_r * atemp1; btemp2 = beta_i * atemp2; btemp3 = beta_r * atemp2; btemp4 = beta_i * atemp1; - + ctemp1 = btemp1 - btemp2; ctemp2 = btemp3 + btemp4; - + *(c_offset1 + 0) = ctemp1; *(c_offset1 + 1) = ctemp2; c_offset1 += 2; diff --git a/kernel/generic/zgemm_ncopy_1.c b/kernel/generic/zgemm_ncopy_1.c index 6679a3360..bc2b89779 100644 --- a/kernel/generic/zgemm_ncopy_1.c +++ b/kernel/generic/zgemm_ncopy_1.c @@ -49,14 +49,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ a_offset = a; b_offset = b; - + lda *= 2; i = n; - + if (i > 0){ do { - + j = (m >> 2); if (j > 0){ do{ @@ -64,28 +64,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp2 = *(a_offset + 1); ctemp3 = *(a_offset + 2); ctemp4 = *(a_offset + 3); - + ctemp5 = *(a_offset + 4); ctemp6 = *(a_offset + 5); ctemp7 = *(a_offset + 6); ctemp8 = *(a_offset + 7); - + *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp2; *(b_offset + 2) = ctemp3; *(b_offset + 3) = ctemp4; - + *(b_offset + 4) = ctemp5; *(b_offset + 5) = ctemp6; *(b_offset + 6) = ctemp7; *(b_offset + 7) = ctemp8; - + a_offset += 8; b_offset += 8; j --; } while(j>0); } - + j = (m & 3); if (j > 0){ do{ diff --git a/kernel/generic/zgemm_ncopy_2.c b/kernel/generic/zgemm_ncopy_2.c index 2d5f2555d..402d6e3e9 100644 --- a/kernel/generic/zgemm_ncopy_2.c +++ b/kernel/generic/zgemm_ncopy_2.c @@ -51,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ a_offset = a; b_offset = b; - + lda *= 2; i = (n >> 1); @@ -69,42 +69,42 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset2 + 0); ctemp4 = *(a_offset2 + 1); - + ctemp5 = *(a_offset1 + 2); ctemp6 = *(a_offset1 + 3); ctemp7 = *(a_offset2 + 2); ctemp8 = *(a_offset2 + 3); - + ctemp9 = *(a_offset1 + 4); ctemp10 = *(a_offset1 + 5); ctemp11 = *(a_offset2 + 4); ctemp12 = *(a_offset2 + 5); - + ctemp13 = *(a_offset1 + 6); ctemp14 = *(a_offset1 + 7); ctemp15 = *(a_offset2 + 6); ctemp16 = *(a_offset2 + 7); - + *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp2; *(b_offset + 2) = ctemp3; *(b_offset + 3) = ctemp4; - + *(b_offset + 4) = ctemp5; *(b_offset + 5) = ctemp6; *(b_offset + 6) = ctemp7; *(b_offset + 7) = ctemp8; - + *(b_offset + 8) = ctemp9; *(b_offset + 9) = ctemp10; *(b_offset +10) = ctemp11; *(b_offset +11) = ctemp12; - + *(b_offset +12) = ctemp13; *(b_offset +13) = ctemp14; *(b_offset +14) = ctemp15; *(b_offset +15) = ctemp16; - + a_offset1 += 8; a_offset2 += 8; b_offset += 16; @@ -119,12 +119,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset2 + 0); ctemp4 = *(a_offset2 + 1); - + *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp2; *(b_offset + 2) = ctemp3; *(b_offset + 3) = ctemp4; - + a_offset1 += 2; a_offset2 += 2; b_offset += 4; @@ -134,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ i --; } while(i>0); } - + if (n & 1){ j = (m >> 2); if (j > 0){ @@ -143,22 +143,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp2 = *(a_offset + 1); ctemp5 = *(a_offset + 2); ctemp6 = *(a_offset + 3); - + ctemp9 = *(a_offset + 4); ctemp10 = *(a_offset + 5); ctemp13 = *(a_offset + 6); ctemp14 = *(a_offset + 7); - + *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp2; *(b_offset + 2) = ctemp5; *(b_offset + 3) = ctemp6; - + *(b_offset + 4) = ctemp9; *(b_offset + 5) = ctemp10; *(b_offset + 6) = ctemp13; *(b_offset + 7) = ctemp14; - + a_offset += 8; b_offset += 8; j --; diff --git a/kernel/generic/zgemm_ncopy_4.c b/kernel/generic/zgemm_ncopy_4.c index abd1d5784..0c2959b5f 100644 --- a/kernel/generic/zgemm_ncopy_4.c +++ b/kernel/generic/zgemm_ncopy_4.c @@ -59,7 +59,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset = a; boffset = b; lda *= 2; - + #if 0 fprintf(stderr, "m = %d n = %d\n", m,n ); #endif @@ -72,7 +72,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset3 = aoffset2 + lda; aoffset4 = aoffset3 + lda; aoffset += 4 * lda; - + i = (m >> 2); if (i > 0){ do{ @@ -93,7 +93,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp14 = *(aoffset2 + 5); ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); - + ctemp17 = *(aoffset3 + 0); ctemp18 = *(aoffset3 + 1); ctemp19 = *(aoffset3 + 2); @@ -102,7 +102,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp22 = *(aoffset3 + 5); ctemp23 = *(aoffset3 + 6); ctemp24 = *(aoffset3 + 7); - + ctemp25 = *(aoffset4 + 0); ctemp26 = *(aoffset4 + 1); ctemp27 = *(aoffset4 + 2); @@ -120,7 +120,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = ctemp18; *(boffset + 6) = ctemp25; *(boffset + 7) = ctemp26; - + *(boffset + 8) = ctemp03; *(boffset + 9) = ctemp04; *(boffset + 10) = ctemp11; @@ -129,7 +129,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 13) = ctemp20; *(boffset + 14) = ctemp27; *(boffset + 15) = ctemp28; - + *(boffset + 16) = ctemp05; *(boffset + 17) = ctemp06; *(boffset + 18) = ctemp13; @@ -162,22 +162,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); - + ctemp05 = *(aoffset2 + 0); ctemp06 = *(aoffset2 + 1); ctemp07 = *(aoffset2 + 2); ctemp08 = *(aoffset2 + 3); - + ctemp09 = *(aoffset3 + 0); ctemp10 = *(aoffset3 + 1); ctemp11 = *(aoffset3 + 2); ctemp12 = *(aoffset3 + 3); - + ctemp13 = *(aoffset4 + 0); ctemp14 = *(aoffset4 + 1); ctemp15 = *(aoffset4 + 2); ctemp16 = *(aoffset4 + 3); - + *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp05; @@ -186,7 +186,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = ctemp10; *(boffset + 6) = ctemp13; *(boffset + 7) = ctemp14; - + *(boffset + 8) = ctemp03; *(boffset + 9) = ctemp04; *(boffset + 10) = ctemp07; @@ -195,27 +195,27 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 13) = ctemp12; *(boffset + 14) = ctemp15; *(boffset + 15) = ctemp16; - + aoffset1 += 4; aoffset2 += 4; aoffset3 += 4; aoffset4 += 4; boffset += 16; } - + if (m & 1) { ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); - + ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); - + ctemp05 = *(aoffset3 + 0); ctemp06 = *(aoffset3 + 1); - + ctemp07 = *(aoffset4 + 0); ctemp08 = *(aoffset4 + 1); - + *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; @@ -224,7 +224,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = ctemp06; *(boffset + 6) = ctemp07; *(boffset + 7) = ctemp08; - + aoffset1 += 2; aoffset2 += 2; aoffset3 += 2; @@ -234,12 +234,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ j--; }while(j > 0); } /* end of if(j > 0) */ - + if (n & 2){ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset += 2 * lda; - + i = (m >> 2); if (i > 0){ do{ @@ -251,7 +251,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); - + ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); @@ -260,7 +260,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp14 = *(aoffset2 + 5); ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); - + *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp09; @@ -269,7 +269,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = ctemp04; *(boffset + 6) = ctemp11; *(boffset + 7) = ctemp12; - + *(boffset + 8) = ctemp05; *(boffset + 9) = ctemp06; *(boffset + 10) = ctemp13; @@ -278,25 +278,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 13) = ctemp08; *(boffset + 14) = ctemp15; *(boffset + 15) = ctemp16; - + aoffset1 += 8; aoffset2 += 8; boffset += 16; i --; }while(i > 0); } - + if (m & 2) { ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); - + ctemp05 = *(aoffset2 + 0); ctemp06 = *(aoffset2 + 1); ctemp07 = *(aoffset2 + 2); ctemp08 = *(aoffset2 + 3); - + *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp05; @@ -305,33 +305,33 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = ctemp04; *(boffset + 6) = ctemp07; *(boffset + 7) = ctemp08; - + aoffset1 += 4; aoffset2 += 4; boffset += 8; } - + if (m & 1) { ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); - + ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); - + *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; *(boffset + 3) = ctemp04; - + aoffset1 += 2; aoffset2 += 2; boffset += 4; } } - + if (n & 1){ aoffset1 = aoffset; - + i = (m >> 2); if (i > 0){ do{ @@ -343,7 +343,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); - + *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; @@ -352,36 +352,36 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = ctemp06; *(boffset + 6) = ctemp07; *(boffset + 7) = ctemp08; - + aoffset1 += 8; boffset += 8; i --; }while(i > 0); } - + if (m & 2) { ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); - + *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; *(boffset + 3) = ctemp04; - + aoffset1 += 4; boffset += 4; } - + if (m & 1) { ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); - + *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; } } - + return 0; } diff --git a/kernel/generic/zgemm_ncopy_4_sandy.c b/kernel/generic/zgemm_ncopy_4_sandy.c index 839bd5939..404a3cd4a 100644 --- a/kernel/generic/zgemm_ncopy_4_sandy.c +++ b/kernel/generic/zgemm_ncopy_4_sandy.c @@ -13,19 +13,19 @@ notice, this list of conditions and the following disclaimer. notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -3. Neither the name of the ISCAS nor the names of its contributors may -be used to endorse or promote products derived from this software +3. Neither the name of the ISCAS nor the names of its contributors may +be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ @@ -33,13 +33,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" -int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) { BLASLONG i,j; BLASLONG idx=0; BLASLONG ii; FLOAT *src0,*src1,*src2,*src3,*dest0; - for (j=0; j> 3); if (j > 0){ do{ @@ -68,7 +68,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset7 = aoffset6 + lda; aoffset8 = aoffset7 + lda; aoffset += 8 * lda; - + i = m; if (i > 0){ do{ @@ -88,7 +88,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp14 = *(aoffset7 + 1); ctemp15 = *(aoffset8 + 0); ctemp16 = *(aoffset8 + 1); - + *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; @@ -114,7 +114,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset6 += 2; aoffset7 += 2; aoffset8 += 2; - + boffset += 16; i --; }while(i > 0); @@ -122,14 +122,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ j--; }while(j > 0); } /* end of if(j > 0) */ - + if (n & 4){ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset3 = aoffset2 + lda; aoffset4 = aoffset3 + lda; aoffset += 4 * lda; - + i = m; if (i > 0){ do{ @@ -141,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp06 = *(aoffset3 + 1); ctemp07 = *(aoffset4 + 0); ctemp08 = *(aoffset4 + 1); - + *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; @@ -166,7 +166,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset += 2 * lda; - + i = m; if (i > 0){ do{ @@ -174,24 +174,24 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); - + *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; *(boffset + 3) = ctemp04; - + aoffset1 += 2; aoffset2 += 2; boffset += 4; i --; }while(i > 0); } - + } /* end of if(j > 0) */ if (n & 1){ aoffset1 = aoffset; - + i = m; if (i > 0){ do{ @@ -206,7 +206,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ i --; }while(i > 0); } - + } /* end of if(j > 0) */ return 0; diff --git a/kernel/generic/zgemm_ncopy_8_sandy.c b/kernel/generic/zgemm_ncopy_8_sandy.c index ed580a126..6e8e894b2 100644 --- a/kernel/generic/zgemm_ncopy_8_sandy.c +++ b/kernel/generic/zgemm_ncopy_8_sandy.c @@ -13,19 +13,19 @@ notice, this list of conditions and the following disclaimer. notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -3. Neither the name of the ISCAS nor the names of its contributors may -be used to endorse or promote products derived from this software +3. Neither the name of the ISCAS nor the names of its contributors may +be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ @@ -33,13 +33,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" -int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) { BLASLONG i,j; BLASLONG idx=0; BLASLONG ii; FLOAT *src0,*src1,*src2,*src3,*src4,*src5,*src6,*src7,*dest0; - for (j=0; j 0){ do { b_offset1 = b_offset; b_offset += 2; - + i = (n >> 2); if (i > 0){ do{ @@ -68,45 +68,45 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp2 = *(a_offset + 1); ctemp3 = *(a_offset + 2); ctemp4 = *(a_offset + 3); - + ctemp5 = *(a_offset + 4); ctemp6 = *(a_offset + 5); ctemp7 = *(a_offset + 6); ctemp8 = *(a_offset + 7); - + *(b_offset1 + 0) = ctemp1; *(b_offset1 + 1) = ctemp2; - + b_offset1 += m; - + *(b_offset1 + 0) = ctemp3; *(b_offset1 + 1) = ctemp4; - + b_offset1 += m; - + *(b_offset1 + 0) = ctemp5; *(b_offset1 + 1) = ctemp6; b_offset1 += m; - + *(b_offset1 + 0) = ctemp7; *(b_offset1 + 1) = ctemp8; - + b_offset1 += m; a_offset += 8; i --; } while(i>0); } - + i = (n & 3); if (i > 0){ do { ctemp1 = *(a_offset + 0); ctemp2 = *(a_offset + 1); - + *(b_offset1 + 0) = ctemp1; *(b_offset1 + 1) = ctemp2; - + b_offset1 += m; a_offset += 2; i --; diff --git a/kernel/generic/zgemm_tcopy_2.c b/kernel/generic/zgemm_tcopy_2.c index 75aff7f97..70e202b71 100644 --- a/kernel/generic/zgemm_tcopy_2.c +++ b/kernel/generic/zgemm_tcopy_2.c @@ -51,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ a_offset = a; b_offset = b; - + b_offset2 = b + m * (n & ~1) * 2; lda *= 2; @@ -73,46 +73,46 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); - + ctemp5 = *(a_offset1 + 4); ctemp6 = *(a_offset1 + 5); ctemp7 = *(a_offset1 + 6); ctemp8 = *(a_offset1 + 7); - + ctemp9 = *(a_offset2 + 0); ctemp10 = *(a_offset2 + 1); ctemp11 = *(a_offset2 + 2); ctemp12 = *(a_offset2 + 3); - + ctemp13 = *(a_offset2 + 4); ctemp14 = *(a_offset2 + 5); ctemp15 = *(a_offset2 + 6); ctemp16 = *(a_offset2 + 7); - + *(b_offset1 + 0) = ctemp1; *(b_offset1 + 1) = ctemp2; *(b_offset1 + 2) = ctemp3; *(b_offset1 + 3) = ctemp4; - + *(b_offset1 + 4) = ctemp9; *(b_offset1 + 5) = ctemp10; *(b_offset1 + 6) = ctemp11; *(b_offset1 + 7) = ctemp12; - + b_offset1 += m * 4; - + *(b_offset1 + 0) = ctemp5; *(b_offset1 + 1) = ctemp6; *(b_offset1 + 2) = ctemp7; *(b_offset1 + 3) = ctemp8; - + *(b_offset1 + 4) = ctemp13; *(b_offset1 + 5) = ctemp14; *(b_offset1 + 6) = ctemp15; *(b_offset1 + 7) = ctemp16; - + b_offset1 += m * 4; - + a_offset1 += 8; a_offset2 += 8; i --; @@ -124,33 +124,33 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); - + ctemp9 = *(a_offset2 + 0); ctemp10 = *(a_offset2 + 1); ctemp11 = *(a_offset2 + 2); ctemp12 = *(a_offset2 + 3); - + *(b_offset1 + 0) = ctemp1; *(b_offset1 + 1) = ctemp2; *(b_offset1 + 2) = ctemp3; *(b_offset1 + 3) = ctemp4; - + *(b_offset1 + 4) = ctemp9; *(b_offset1 + 5) = ctemp10; *(b_offset1 + 6) = ctemp11; *(b_offset1 + 7) = ctemp12; - + b_offset1 += m * 4; a_offset1 += 4; a_offset2 += 4; } - + if (n & 1){ ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp9 = *(a_offset2 + 0); ctemp10 = *(a_offset2 + 1); - + *(b_offset2 + 0) = ctemp1; *(b_offset2 + 1) = ctemp2; *(b_offset2 + 2) = ctemp9; @@ -169,45 +169,45 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp2 = *(a_offset + 1); ctemp3 = *(a_offset + 2); ctemp4 = *(a_offset + 3); - + ctemp5 = *(a_offset + 4); ctemp6 = *(a_offset + 5); ctemp7 = *(a_offset + 6); ctemp8 = *(a_offset + 7); - + *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp2; *(b_offset + 2) = ctemp3; *(b_offset + 3) = ctemp4; - + b_offset += m * 4; - + *(b_offset + 0) = ctemp5; *(b_offset + 1) = ctemp6; *(b_offset + 2) = ctemp7; *(b_offset + 3) = ctemp8; - + b_offset += m * 4; a_offset += 8; i --; } while(i > 0); } - + if (n & 2){ ctemp1 = *(a_offset + 0); ctemp2 = *(a_offset + 1); ctemp3 = *(a_offset + 2); ctemp4 = *(a_offset + 3); - + *(b_offset + 0) = ctemp1; *(b_offset + 1) = ctemp2; *(b_offset + 2) = ctemp3; *(b_offset + 3) = ctemp4; - + b_offset += m * 4; a_offset += 4; } - + if (n & 1){ ctemp1 = *(a_offset + 0); ctemp2 = *(a_offset + 1); diff --git a/kernel/generic/zgemm_tcopy_4.c b/kernel/generic/zgemm_tcopy_4.c index c61d9d52a..3c12a6f96 100644 --- a/kernel/generic/zgemm_tcopy_4.c +++ b/kernel/generic/zgemm_tcopy_4.c @@ -90,7 +90,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); - + ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); @@ -99,7 +99,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp14 = *(aoffset2 + 5); ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); - + ctemp17 = *(aoffset3 + 0); ctemp18 = *(aoffset3 + 1); ctemp19 = *(aoffset3 + 2); @@ -108,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp22 = *(aoffset3 + 5); ctemp23 = *(aoffset3 + 6); ctemp24 = *(aoffset3 + 7); - + ctemp25 = *(aoffset4 + 0); ctemp26 = *(aoffset4 + 1); ctemp27 = *(aoffset4 + 2); @@ -126,7 +126,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset1 + 5) = ctemp06; *(boffset1 + 6) = ctemp07; *(boffset1 + 7) = ctemp08; - + *(boffset1 + 8) = ctemp09; *(boffset1 + 9) = ctemp10; *(boffset1 + 10) = ctemp11; @@ -144,7 +144,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset1 + 21) = ctemp22; *(boffset1 + 22) = ctemp23; *(boffset1 + 23) = ctemp24; - + *(boffset1 + 24) = ctemp25; *(boffset1 + 25) = ctemp26; *(boffset1 + 26) = ctemp27; @@ -174,17 +174,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp06 = *(aoffset2 + 1); ctemp07 = *(aoffset2 + 2); ctemp08 = *(aoffset2 + 3); - + ctemp09 = *(aoffset3 + 0); ctemp10 = *(aoffset3 + 1); ctemp11 = *(aoffset3 + 2); ctemp12 = *(aoffset3 + 3); - + ctemp13 = *(aoffset4 + 0); ctemp14 = *(aoffset4 + 1); ctemp15 = *(aoffset4 + 2); ctemp16 = *(aoffset4 + 3); - + *(boffset2 + 0) = ctemp01; *(boffset2 + 1) = ctemp02; *(boffset2 + 2) = ctemp03; @@ -193,7 +193,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset2 + 5) = ctemp06; *(boffset2 + 6) = ctemp07; *(boffset2 + 7) = ctemp08; - + *(boffset2 + 8) = ctemp09; *(boffset2 + 9) = ctemp10; *(boffset2 + 10) = ctemp11; @@ -202,12 +202,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset2 + 13) = ctemp14; *(boffset2 + 14) = ctemp15; *(boffset2 + 15) = ctemp16; - + aoffset1 += 4; aoffset2 += 4; aoffset3 += 4; aoffset4 += 4; - + boffset2 += 16; } @@ -217,13 +217,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); - + ctemp05 = *(aoffset3 + 0); ctemp06 = *(aoffset3 + 1); - + ctemp07 = *(aoffset4 + 0); ctemp08 = *(aoffset4 + 1); - + *(boffset3 + 0) = ctemp01; *(boffset3 + 1) = ctemp02; *(boffset3 + 2) = ctemp03; @@ -232,12 +232,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset3 + 5) = ctemp06; *(boffset3 + 6) = ctemp07; *(boffset3 + 7) = ctemp08; - + aoffset1 += 2; aoffset2 += 2; aoffset3 += 2; aoffset4 += 2; - + boffset3 += 8; } j--; @@ -248,10 +248,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset += 2 * lda; - + boffset1 = boffset; boffset += 16; - + i = (n >> 2); if (i > 0){ do{ @@ -263,7 +263,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); - + ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); @@ -272,7 +272,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp14 = *(aoffset2 + 5); ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); - + *(boffset1 + 0) = ctemp01; *(boffset1 + 1) = ctemp02; *(boffset1 + 2) = ctemp03; @@ -281,7 +281,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset1 + 5) = ctemp06; *(boffset1 + 6) = ctemp07; *(boffset1 + 7) = ctemp08; - + *(boffset1 + 8) = ctemp09; *(boffset1 + 9) = ctemp10; *(boffset1 + 10) = ctemp11; @@ -290,12 +290,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset1 + 13) = ctemp14; *(boffset1 + 14) = ctemp15; *(boffset1 + 15) = ctemp16; - + aoffset1 += 8; aoffset2 += 8; aoffset3 += 8; aoffset4 += 8; - + boffset1 += m * 8; i --; }while(i > 0); @@ -306,12 +306,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); - + ctemp05 = *(aoffset2 + 0); ctemp06 = *(aoffset2 + 1); ctemp07 = *(aoffset2 + 2); ctemp08 = *(aoffset2 + 3); - + *(boffset2 + 0) = ctemp01; *(boffset2 + 1) = ctemp02; *(boffset2 + 2) = ctemp03; @@ -320,34 +320,34 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset2 + 5) = ctemp06; *(boffset2 + 6) = ctemp07; *(boffset2 + 7) = ctemp08; - + aoffset1 += 4; aoffset2 += 4; - + boffset2 += 8; } - + if (n & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); - + *(boffset3 + 0) = ctemp01; *(boffset3 + 1) = ctemp02; *(boffset3 + 2) = ctemp03; *(boffset3 + 3) = ctemp04; - + aoffset1 += 2; aoffset2 += 2; boffset3 += 4; } } - + if (m & 1){ aoffset1 = aoffset; boffset1 = boffset; - + i = (n >> 2); if (i > 0){ do{ @@ -359,7 +359,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); - + *(boffset1 + 0) = ctemp01; *(boffset1 + 1) = ctemp02; *(boffset1 + 2) = ctemp03; @@ -368,7 +368,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset1 + 5) = ctemp06; *(boffset1 + 6) = ctemp07; *(boffset1 + 7) = ctemp08; - + aoffset1 += 8; boffset1 += m * 8; i --; @@ -380,7 +380,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); - + *(boffset2 + 0) = ctemp01; *(boffset2 + 1) = ctemp02; *(boffset2 + 2) = ctemp03; @@ -389,11 +389,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset1 += 4; boffset2 += 4; } - + if (n & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); - + *(boffset3 + 0) = ctemp01; *(boffset3 + 1) = ctemp02; } diff --git a/kernel/generic/zgemm_tcopy_4_sandy.c b/kernel/generic/zgemm_tcopy_4_sandy.c index 1ae4a4e68..7e148659d 100644 --- a/kernel/generic/zgemm_tcopy_4_sandy.c +++ b/kernel/generic/zgemm_tcopy_4_sandy.c @@ -13,19 +13,19 @@ notice, this list of conditions and the following disclaimer. notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -3. Neither the name of the ISCAS nor the names of its contributors may -be used to endorse or promote products derived from this software +3. Neither the name of the ISCAS nor the names of its contributors may +be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" -int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) { BLASLONG i,j; BLASLONG idx=0; @@ -46,7 +46,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) ii = col&-2; ii = ii*(2*row); dest1 = dest+ii; - for (j=0; j> 1); if (i > 0){ do{ @@ -117,7 +117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = ctemp06; *(boffset + 6) = ctemp07; *(boffset + 7) = ctemp08; - + *(boffset + 8) = ctemp09; *(boffset + 9) = ctemp10; *(boffset + 10) = ctemp11; @@ -126,7 +126,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 13) = ctemp14; *(boffset + 14) = ctemp15; *(boffset + 15) = ctemp16; - + *(boffset + 16) = ctemp17; *(boffset + 17) = ctemp18; *(boffset + 18) = ctemp19; @@ -170,7 +170,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp14 = *(aoffset1 + 13); ctemp15 = *(aoffset1 + 14); ctemp16 = *(aoffset1 + 15); - + *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; @@ -179,7 +179,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = ctemp06; *(boffset + 6) = ctemp07; *(boffset + 7) = ctemp08; - + *(boffset + 8) = ctemp09; *(boffset + 9) = ctemp10; *(boffset + 10) = ctemp11; @@ -200,7 +200,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 8; - + i = (m >> 1); if (i > 0){ do{ @@ -212,7 +212,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); - + ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); @@ -230,7 +230,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = ctemp06; *(boffset + 6) = ctemp07; *(boffset + 7) = ctemp08; - + *(boffset + 8) = ctemp09; *(boffset + 9) = ctemp10; *(boffset + 10) = ctemp11; @@ -239,15 +239,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 13) = ctemp14; *(boffset + 14) = ctemp15; *(boffset + 15) = ctemp16; - + aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 16; - + i --; }while(i > 0); } - + if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); @@ -257,7 +257,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); - + *(boffset + 0) = ctemp01; *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; @@ -266,7 +266,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = ctemp06; *(boffset + 6) = ctemp07; *(boffset + 7) = ctemp08; - + boffset += 8; } } @@ -275,7 +275,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 4; - + i = (m >> 1); if (i > 0){ do{ @@ -297,15 +297,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = ctemp06; *(boffset + 6) = ctemp07; *(boffset + 7) = ctemp08; - + aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 8; - + i --; }while(i > 0); } - + if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); @@ -316,7 +316,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; *(boffset + 3) = ctemp04; - + boffset += 4; } } @@ -325,7 +325,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 2; - + i = (m >> 1); if (i > 0){ do{ @@ -338,15 +338,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 1) = ctemp02; *(boffset + 2) = ctemp03; *(boffset + 3) = ctemp04; - + aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 4; - + i --; }while(i > 0); } - + if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); diff --git a/kernel/generic/zgemm_tcopy_8_sandy.c b/kernel/generic/zgemm_tcopy_8_sandy.c index b53dd3ec0..e5197858e 100644 --- a/kernel/generic/zgemm_tcopy_8_sandy.c +++ b/kernel/generic/zgemm_tcopy_8_sandy.c @@ -13,19 +13,19 @@ notice, this list of conditions and the following disclaimer. notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -3. Neither the name of the ISCAS nor the names of its contributors may -be used to endorse or promote products derived from this software +3. Neither the name of the ISCAS nor the names of its contributors may +be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" -int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) { BLASLONG i,j; BLASLONG idx=0; @@ -49,7 +49,7 @@ int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) ii = col&-2; ii = ii*(2*row); dest1 = dest+ii; - for (j=0; j 0) { + while (n > 0) { FLOAT beta_r = y[0]; FLOAT beta_i = y[1]; -#ifndef XCONJ +#ifndef XCONJ AXPYU_K #else AXPYC_K #endif - (m, 0, 0, + (m, 0, 0, #ifndef CONJ alpha_r * beta_r - alpha_i * beta_i, alpha_r * beta_i + alpha_i * beta_r, @@ -73,7 +73,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, -alpha_r * beta_i + alpha_i * beta_r, #endif X, 1, a, 1, NULL, 0); - + a += lda; y += incy; n --; diff --git a/kernel/generic/zhemm3m_lcopy_1.c b/kernel/generic/zhemm3m_lcopy_1.c index 72f473de3..0ffbbcf1b 100644 --- a/kernel/generic/zhemm3m_lcopy_1.c +++ b/kernel/generic/zhemm3m_lcopy_1.c @@ -69,14 +69,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON lda *= 2; js = n; - + while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; - + i = m; - + while (i > 0) { if (offset > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); @@ -86,17 +86,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } else { data01 = CMULT(*(ao1 + 0), ZERO); } - + if (offset > 0) ao1 += lda; else ao1 += 2; - + b[ 0] = data01; - + b ++; - + offset --; i --; } - + posX ++; js --; } diff --git a/kernel/generic/zhemm3m_lcopy_2.c b/kernel/generic/zhemm3m_lcopy_2.c index f0da12cca..517ed645d 100644 --- a/kernel/generic/zhemm3m_lcopy_2.c +++ b/kernel/generic/zhemm3m_lcopy_2.c @@ -71,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON js = (n >> 1); while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; @@ -116,7 +116,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; i = m; diff --git a/kernel/generic/zhemm3m_lcopy_4.c b/kernel/generic/zhemm3m_lcopy_4.c index 7e958f180..a407838f6 100644 --- a/kernel/generic/zhemm3m_lcopy_4.c +++ b/kernel/generic/zhemm3m_lcopy_4.c @@ -71,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON js = (n >> 2); while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; @@ -142,7 +142,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 2) { offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; @@ -187,7 +187,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; i = m; diff --git a/kernel/generic/zhemm3m_lcopy_8.c b/kernel/generic/zhemm3m_lcopy_8.c index 86600b527..856d5e54b 100644 --- a/kernel/generic/zhemm3m_lcopy_8.c +++ b/kernel/generic/zhemm3m_lcopy_8.c @@ -72,7 +72,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; @@ -219,7 +219,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 4) { offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; @@ -289,7 +289,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 2) { offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; @@ -333,7 +333,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; i = m; diff --git a/kernel/generic/zhemm3m_ucopy_1.c b/kernel/generic/zhemm3m_ucopy_1.c index a6d4975e2..43f6d997f 100644 --- a/kernel/generic/zhemm3m_ucopy_1.c +++ b/kernel/generic/zhemm3m_ucopy_1.c @@ -69,15 +69,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON lda *= 2; js = n; - + while (js > 0){ - + offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; - + i = m; - + while (i > 0) { if (offset > 0) { data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); @@ -87,17 +87,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } else { data01 = CMULT(*(ao1 + 0), ZERO); } - + if (offset > 0) ao1 += 2; else ao1 += lda; b[ 0] = data01; - + b ++; - + offset --; i --; } - + posX ++; js --; } diff --git a/kernel/generic/zhemm3m_ucopy_2.c b/kernel/generic/zhemm3m_ucopy_2.c index fecbae615..2a20fe01f 100644 --- a/kernel/generic/zhemm3m_ucopy_2.c +++ b/kernel/generic/zhemm3m_ucopy_2.c @@ -71,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON js = (n >> 1); while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; @@ -97,7 +97,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON break; } } - + if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; @@ -109,14 +109,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON offset --; i --; } - + posX += 2; js --; } if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; i = m; @@ -130,7 +130,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } else { data01 = CMULT(*(ao1 + 0), ZERO); } - + if (offset > 0) ao1 += 2; else ao1 += lda; b[ 0] = data01; diff --git a/kernel/generic/zhemm3m_ucopy_4.c b/kernel/generic/zhemm3m_ucopy_4.c index 6a45c7ed6..879ae2d3f 100644 --- a/kernel/generic/zhemm3m_ucopy_4.c +++ b/kernel/generic/zhemm3m_ucopy_4.c @@ -72,7 +72,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; @@ -120,7 +120,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON break; } } - + if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; if (offset > -2) ao3 += 2; else ao3 += lda; @@ -143,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 2) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; @@ -169,7 +169,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON break; } } - + if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; @@ -181,13 +181,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON offset --; i --; } - + posX += 2; } if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; i = m; @@ -201,7 +201,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } else { data01 = CMULT(*(ao1 + 0), ZERO); } - + if (offset > 0) ao1 += 2; else ao1 += lda; b[ 0] = data01; diff --git a/kernel/generic/zhemm3m_ucopy_8.c b/kernel/generic/zhemm3m_ucopy_8.c index efed390a7..151422f3d 100644 --- a/kernel/generic/zhemm3m_ucopy_8.c +++ b/kernel/generic/zhemm3m_ucopy_8.c @@ -72,7 +72,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; @@ -220,7 +220,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 4) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; @@ -290,7 +290,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 2) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; @@ -334,7 +334,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; i = m; diff --git a/kernel/generic/zhemm_ltcopy_1.c b/kernel/generic/zhemm_ltcopy_1.c index 6f5615b79..b5edda6d7 100644 --- a/kernel/generic/zhemm_ltcopy_1.c +++ b/kernel/generic/zhemm_ltcopy_1.c @@ -53,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; i = m; @@ -61,7 +61,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > 0) { diff --git a/kernel/generic/zhemm_ltcopy_2.c b/kernel/generic/zhemm_ltcopy_2.c index 8547b4d68..41713b03f 100644 --- a/kernel/generic/zhemm_ltcopy_2.c +++ b/kernel/generic/zhemm_ltcopy_2.c @@ -53,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; @@ -64,7 +64,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); - + if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; @@ -109,7 +109,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; i = m; @@ -117,7 +117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > 0) { diff --git a/kernel/generic/zhemm_ltcopy_4.c b/kernel/generic/zhemm_ltcopy_4.c index d7afc1174..88fa6ef9a 100644 --- a/kernel/generic/zhemm_ltcopy_4.c +++ b/kernel/generic/zhemm_ltcopy_4.c @@ -53,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; @@ -70,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao3 + 1); data07 = *(ao4 + 0); data08 = *(ao4 + 1); - + if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; if (offset > -2) ao3 += lda; else ao3 += 2; @@ -153,7 +153,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 2) { offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; @@ -164,7 +164,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); - + if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; @@ -209,7 +209,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; i = m; @@ -217,7 +217,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > 0) { diff --git a/kernel/generic/zhemm_ltcopy_8.c b/kernel/generic/zhemm_ltcopy_8.c index d5ebd1c81..d3f35a7d3 100644 --- a/kernel/generic/zhemm_ltcopy_8.c +++ b/kernel/generic/zhemm_ltcopy_8.c @@ -53,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; @@ -82,7 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data14 = *(ao7 + 1); data15 = *(ao8 + 0); data16 = *(ao8 + 1); - + if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; if (offset > -2) ao3 += lda; else ao3 += 2; @@ -289,7 +289,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 4) { offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; @@ -306,7 +306,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao3 + 1); data07 = *(ao4 + 0); data08 = *(ao4 + 1); - + if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; if (offset > -2) ao3 += lda; else ao3 += 2; @@ -388,7 +388,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 2) { offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; @@ -399,7 +399,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); - + if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; @@ -444,7 +444,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; i = m; @@ -452,7 +452,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > 0) { diff --git a/kernel/generic/zhemm_utcopy_1.c b/kernel/generic/zhemm_utcopy_1.c index 961b8497e..76e67b054 100644 --- a/kernel/generic/zhemm_utcopy_1.c +++ b/kernel/generic/zhemm_utcopy_1.c @@ -51,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON js = n; while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; i = m; @@ -59,7 +59,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > 0) { diff --git a/kernel/generic/zhemm_utcopy_2.c b/kernel/generic/zhemm_utcopy_2.c index 91e7108b4..bd6f13952 100644 --- a/kernel/generic/zhemm_utcopy_2.c +++ b/kernel/generic/zhemm_utcopy_2.c @@ -51,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON js = (n >> 1); while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; @@ -62,7 +62,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); - + if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; @@ -107,7 +107,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; i = m; @@ -115,7 +115,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > 0) { diff --git a/kernel/generic/zhemm_utcopy_4.c b/kernel/generic/zhemm_utcopy_4.c index 15671b44a..6201b4331 100644 --- a/kernel/generic/zhemm_utcopy_4.c +++ b/kernel/generic/zhemm_utcopy_4.c @@ -52,7 +52,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; @@ -69,7 +69,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao3 + 1); data07 = *(ao4 + 0); data08 = *(ao4 + 1); - + if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; if (offset > -2) ao3 += 2; else ao3 += lda; @@ -138,7 +138,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON break; } } - + b += 8; offset --; @@ -152,7 +152,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 2) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; @@ -163,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); - + if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; @@ -207,7 +207,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; i = m; @@ -215,7 +215,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > 0) { diff --git a/kernel/generic/zhemm_utcopy_8.c b/kernel/generic/zhemm_utcopy_8.c index 1cfd3bd59..601ef263d 100644 --- a/kernel/generic/zhemm_utcopy_8.c +++ b/kernel/generic/zhemm_utcopy_8.c @@ -40,7 +40,7 @@ #include "common.h" int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ - + BLASLONG i, js, offset; FLOAT data01, data02, data03, data04, data05, data06, data07, data08; @@ -53,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; @@ -82,7 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data14 = *(ao7 + 1); data15 = *(ao8 + 0); data16 = *(ao8 + 1); - + if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; if (offset > -2) ao3 += 2; else ao3 += lda; @@ -275,7 +275,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON break; } } - + b += 16; offset --; @@ -288,7 +288,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 4) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; @@ -305,7 +305,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao3 + 1); data07 = *(ao4 + 0); data08 = *(ao4 + 1); - + if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; if (offset > -2) ao3 += 2; else ao3 += lda; @@ -374,7 +374,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON break; } } - + b += 8; offset --; @@ -387,7 +387,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 2) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; @@ -398,7 +398,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); - + if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; @@ -442,7 +442,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; i = m; @@ -450,7 +450,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > 0) { diff --git a/kernel/generic/zhemv_k.c b/kernel/generic/zhemv_k.c index 3551938da..bab1d6b18 100644 --- a/kernel/generic/zhemv_k.c +++ b/kernel/generic/zhemv_k.c @@ -41,7 +41,7 @@ #include "common.h" #include "symcopy.h" -int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ BLASLONG is, min_i; @@ -76,7 +76,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, #ifndef LOWER if (is > 0){ #ifndef HEMVREV - GEMV_C(is, min_i, 0, alpha_r, alpha_i, + GEMV_C(is, min_i, 0, alpha_r, alpha_i, a + is * lda * 2, lda, X, 1, Y + is * 2, 1, gemvbuffer); @@ -86,7 +86,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, X + is * 2, 1, Y, 1, gemvbuffer); #else - GEMV_T(is, min_i, 0, alpha_r, alpha_i, + GEMV_T(is, min_i, 0, alpha_r, alpha_i, a + is * lda * 2, lda, X, 1, Y + is * 2, 1, gemvbuffer); @@ -113,11 +113,11 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, #endif #endif - GEMV_N(min_i, min_i, 0, alpha_r, alpha_i, + GEMV_N(min_i, min_i, 0, alpha_r, alpha_i, symbuffer, min_i, - X + is * 2, 1, + X + is * 2, 1, Y + is * 2, 1, gemvbuffer); - + #ifdef LOWER if (m - is - min_i > 0){ diff --git a/kernel/generic/zlaswp_ncopy_1.c b/kernel/generic/zlaswp_ncopy_1.c index acbda68fd..0e1509997 100644 --- a/kernel/generic/zlaswp_ncopy_1.c +++ b/kernel/generic/zlaswp_ncopy_1.c @@ -55,24 +55,24 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint k1 --; ipiv += k1; - + if (n <= 0) return 0; - + j = n; do { piv = ipiv; - + a1 = a + (k1 + 1) * 2; - + ip1 = *(piv + 0) * 2; ip2 = *(piv + 1) * 2; piv += 2; - + b1 = a + ip1; b2 = a + ip2; - + i = ((k2 - k1) >> 1); - + if (i > 0) { do { A1 = *(a1 + 0); @@ -83,11 +83,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint B2 = *(b1 + 1); B3 = *(b2 + 0); B4 = *(b2 + 1); - + ip1 = *(piv + 0) * 2; ip2 = *(piv + 1) * 2; piv += 2; - + if (b1 == a1) { if (b2 == a2) { *(buffer + 0) = A1; @@ -103,7 +103,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *(b2 + 0) = A3; *(b2 + 1) = A4; } - } else + } else if (b1 == a2) { if (b2 == a2) { *(buffer + 0) = A3; @@ -126,7 +126,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *(buffer + 3) = A4; *(b1 + 0) = A1; *(b1 + 1) = A2; - } else + } else if (b2 == b1) { *(buffer + 0) = B1; *(buffer + 1) = B2; @@ -145,26 +145,26 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *(b2 + 1) = A4; } } - + buffer += 4; - + b1 = a + ip1; b2 = a + ip2; - + a1 += 4; - + i --; } while (i > 0); } - + i = ((k2 - k1) & 1); - + if (i > 0) { A1 = *(a1 + 0); A2 = *(a1 + 1); - B1 = *(b1 + 0); + B1 = *(b1 + 0); B2 = *(b1 + 1); - + if (a1 == b1) { *(buffer + 0) = A1; *(buffer + 1) = A2; @@ -182,5 +182,5 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint } while (j > 0); return 0; -} +} diff --git a/kernel/generic/zlaswp_ncopy_2.c b/kernel/generic/zlaswp_ncopy_2.c index 7fa56be21..d02a788b3 100644 --- a/kernel/generic/zlaswp_ncopy_2.c +++ b/kernel/generic/zlaswp_ncopy_2.c @@ -60,27 +60,27 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint ipiv += k1; if (n <= 0) return 0; - + j = (n >> 1); if (j > 0) { do { piv = ipiv; - + a1 = a + (k1 + 1) * 2; a3 = a1 + lda; - + ip1 = *(piv + 0) * 2; ip2 = *(piv + 1) * 2; piv += 2; - + b1 = a + ip1; b2 = a + ip2; - + b3 = b1 + lda; b4 = b2 + lda; - + i = ((k2 - k1) >> 1); - + if (i > 0) { do { A1 = *(a1 + 0); @@ -104,7 +104,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint ip1 = *(piv + 0) * 2; ip2 = *(piv + 1) * 2; piv += 2; - + if (b1 == a1) { if (b2 == a2) { *(buffer + 0) = A1; @@ -124,13 +124,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *(buffer + 5) = B4; *(buffer + 6) = B7; *(buffer + 7) = B8; - + *(b2 + 0) = A3; *(b2 + 1) = A4; *(b4 + 0) = A7; *(b4 + 1) = A8; } - } else + } else if (b1 == a2) { if (b2 == a2) { *(buffer + 0) = A3; @@ -171,7 +171,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *(b1 + 1) = A2; *(b3 + 0) = A5; *(b3 + 1) = A6; - } else + } else if (b2 == b1) { *(buffer + 0) = B1; *(buffer + 1) = B2; @@ -205,24 +205,24 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *(b4 + 1) = A8; } } - + buffer += 8; - + b1 = a + ip1; b2 = a + ip2; - + b3 = b1 + lda; b4 = b2 + lda; - + a1 += 4; a3 += 4; - + i --; } while (i > 0); } - + i = ((k2 - k1) & 1); - + if (i > 0) { A1 = *(a1 + 0); A2 = *(a1 + 1); @@ -232,7 +232,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint A4 = *(a3 + 1); B3 = *(b3 + 0); B4 = *(b3 + 1); - + if (a1 == b1) { *(buffer + 0) = A1; *(buffer + 1) = A2; @@ -251,26 +251,26 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint } buffer += 4; } - + a += 2 * lda; j --; } while (j > 0); } - + if (n & 1) { piv = ipiv; - + a1 = a + (k1 + 1) * 2; - + ip1 = *(piv + 0) * 2; ip2 = *(piv + 1) * 2; piv += 2; - + b1 = a + ip1; b2 = a + ip2; - + i = ((k2 - k1) >> 1); - + if (i > 0) { do { A1 = *(a1 + 0); @@ -281,11 +281,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint B2 = *(b1 + 1); B3 = *(b2 + 0); B4 = *(b2 + 1); - + ip1 = *(piv + 0) * 2; ip2 = *(piv + 1) * 2; piv += 2; - + if (b1 == a1) { if (b2 == a2) { *(buffer + 0) = A1; @@ -297,11 +297,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *(buffer + 1) = A2; *(buffer + 2) = B3; *(buffer + 3) = B4; - + *(b2 + 0) = A3; *(b2 + 1) = A4; } - } else + } else if (b1 == a2) { if (b2 == a2) { *(buffer + 0) = A3; @@ -324,7 +324,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *(buffer + 3) = A4; *(b1 + 0) = A1; *(b1 + 1) = A2; - } else + } else if (b2 == b1) { *(buffer + 0) = B1; *(buffer + 1) = B2; @@ -345,24 +345,24 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint } buffer += 4; - + b1 = a + ip1; b2 = a + ip2; a1 += 4; - + i --; } while (i > 0); } - + i = ((k2 - k1) & 1); - + if (i > 0) { A1 = *(a1 + 0); A2 = *(a1 + 1); - B1 = *(b1 + 0); + B1 = *(b1 + 0); B2 = *(b1 + 1); - + if (a1 == b1) { *(buffer + 0) = A1; *(buffer + 1) = A2; @@ -377,5 +377,5 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint } return 0; -} +} diff --git a/kernel/generic/zlaswp_ncopy_4.c b/kernel/generic/zlaswp_ncopy_4.c index c9c44fcab..b79166692 100644 --- a/kernel/generic/zlaswp_ncopy_4.c +++ b/kernel/generic/zlaswp_ncopy_4.c @@ -71,7 +71,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint piv = ipiv; a1 = a + (k1 + 1) * 2; - + a3 = a1 + 1 * lda; a5 = a1 + 2 * lda; a7 = a1 + 3 * lda; @@ -79,10 +79,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint ip1 = *(piv + 0) * 2; ip2 = *(piv + 1) * 2; piv += 2; - + b1 = a + ip1; b2 = a + ip2; - + b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; b5 = b1 + 2 * lda; @@ -91,7 +91,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint b8 = b2 + 3 * lda; i = ((k2 - k1) >> 1); - + if (i > 0) { do { A1 = *(a1 + 0); @@ -131,7 +131,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint ip1 = *(piv + 0) * 2; ip2 = *(piv + 1) * 2; piv += 2; - + if (b1 == a1) { if (b2 == a2) { *(buffer + 0) = A1; @@ -179,7 +179,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *(b8 + 0) = A8; *(b8 + 1) = A16; } - } else + } else if (b1 == a2) { if (b2 == a2) { *(buffer + 0) = A2; @@ -253,7 +253,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *(b5 + 1) = A13; *(b7 + 0) = A7; *(b7 + 1) = A15; - } else + } else if (b2 == b1) { *(buffer + 0) = B1; *(buffer + 1) = B9; @@ -316,19 +316,19 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *(b8 + 1) = A16; } } - + buffer += 16; b1 = a + ip1; b2 = a + ip2; - + b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; b5 = b1 + 2 * lda; b6 = b2 + 2 * lda; b7 = b1 + 3 * lda; b8 = b2 + 3 * lda; - + a1 += 4; a3 += 4; a5 += 4; @@ -337,9 +337,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint i --; } while (i > 0); } - + i = ((k2 - k1) & 1); - + if (i > 0) { A1 = *(a1 + 0); A9 = *(a1 + 1); @@ -390,29 +390,29 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint } a += 4 * lda; - + j --; } while (j > 0); } if (n & 2) { piv = ipiv; - + a1 = a + (k1 + 1) * 2; a3 = a1 + lda; - + ip1 = *(piv + 0) * 2; ip2 = *(piv + 1) * 2; piv += 2; - + b1 = a + ip1; b2 = a + ip2; - + b3 = b1 + lda; b4 = b2 + lda; - + i = ((k2 - k1) >> 1); - + if (i > 0) { do { A1 = *(a1 + 0); @@ -423,7 +423,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint A6 = *(a3 + 1); A7 = *(a4 + 0); A8 = *(a4 + 1); - + B1 = *(b1 + 0); B2 = *(b1 + 1); B3 = *(b2 + 0); @@ -432,11 +432,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint B6 = *(b3 + 1); B7 = *(b4 + 0); B8 = *(b4 + 1); - + ip1 = *(piv + 0) * 2; ip2 = *(piv + 1) * 2; piv += 2; - + if (b1 == a1) { if (b2 == a2) { *(buffer + 0) = A1; @@ -456,13 +456,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *(buffer + 5) = B4; *(buffer + 6) = B7; *(buffer + 7) = B8; - + *(b2 + 0) = A3; *(b2 + 1) = A4; *(b4 + 0) = A7; *(b4 + 1) = A8; } - } else + } else if (b1 == a2) { if (b2 == a2) { *(buffer + 0) = A3; @@ -503,7 +503,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *(b1 + 1) = A2; *(b3 + 0) = A5; *(b3 + 1) = A6; - } else + } else if (b2 == b1) { *(buffer + 0) = B1; *(buffer + 1) = B2; @@ -537,24 +537,24 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *(b4 + 1) = A8; } } - + buffer += 8; - + b1 = a + ip1; b2 = a + ip2; - + b3 = b1 + lda; b4 = b2 + lda; - + a1 += 4; a3 += 4; - + i --; } while (i > 0); } - + i = ((k2 - k1) & 1); - + if (i > 0) { A1 = *(a1 + 0); A2 = *(a1 + 1); @@ -564,13 +564,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint A4 = *(a3 + 1); B3 = *(b3 + 0); B4 = *(b3 + 1); - + if (a1 == b1) { *(buffer + 0) = A1; *(buffer + 1) = A2; *(buffer + 2) = A3; *(buffer + 3) = A4; - + } else { *(buffer + 0) = B1; *(buffer + 1) = B2; @@ -583,24 +583,24 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint } buffer += 4; } - + a += 2 * lda; } - + if (n & 1) { piv = ipiv; - + a1 = a + (k1 + 1) * 2; - + ip1 = *(piv + 0) * 2; ip2 = *(piv + 1) * 2; piv += 2; - + b1 = a + ip1; b2 = a + ip2; - + i = ((k2 - k1) >> 1); - + if (i > 0) { do { A1 = *(a1 + 0); @@ -611,11 +611,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint B2 = *(b1 + 1); B3 = *(b2 + 0); B4 = *(b2 + 1); - + ip1 = *(piv + 0) * 2; ip2 = *(piv + 1) * 2; piv += 2; - + if (b1 == a1) { if (b2 == a2) { *(buffer + 0) = A1; @@ -627,11 +627,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *(buffer + 1) = A2; *(buffer + 2) = B3; *(buffer + 3) = B4; - + *(b2 + 0) = A3; *(b2 + 1) = A4; } - } else + } else if (b1 == a2) { if (b2 == a2) { *(buffer + 0) = A3; @@ -654,7 +654,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *(buffer + 3) = A4; *(b1 + 0) = A1; *(b1 + 1) = A2; - } else + } else if (b2 == b1) { *(buffer + 0) = B1; *(buffer + 1) = B2; @@ -675,24 +675,24 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint } buffer += 4; - + b1 = a + ip1; b2 = a + ip2; a1 += 4; - + i --; } while (i > 0); } - + i = ((k2 - k1) & 1); - + if (i > 0) { A1 = *(a1 + 0); A2 = *(a1 + 1); - B1 = *(b1 + 0); + B1 = *(b1 + 0); B2 = *(b1 + 1); - + if (a1 == b1) { *(buffer + 0) = A1; *(buffer + 1) = A2; @@ -707,5 +707,5 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint } return 0; -} +} diff --git a/kernel/generic/zneg_tcopy_1.c b/kernel/generic/zneg_tcopy_1.c index 3701c9cff..6b75e1486 100644 --- a/kernel/generic/zneg_tcopy_1.c +++ b/kernel/generic/zneg_tcopy_1.c @@ -49,18 +49,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ a_offset = a; b_offset = b; - + lda *= 2; j = m; m *= 2; - + if (j > 0){ do { b_offset1 = b_offset; b_offset += 2; - + i = (n >> 2); if (i > 0){ do{ @@ -68,45 +68,45 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp2 = *(a_offset + 1); ctemp3 = *(a_offset + 2); ctemp4 = *(a_offset + 3); - + ctemp5 = *(a_offset + 4); ctemp6 = *(a_offset + 5); ctemp7 = *(a_offset + 6); ctemp8 = *(a_offset + 7); - + *(b_offset1 + 0) = -ctemp1; *(b_offset1 + 1) = -ctemp2; - + b_offset1 += m; - + *(b_offset1 + 0) = -ctemp3; *(b_offset1 + 1) = -ctemp4; - + b_offset1 += m; - + *(b_offset1 + 0) = -ctemp5; *(b_offset1 + 1) = -ctemp6; b_offset1 += m; - + *(b_offset1 + 0) = -ctemp7; *(b_offset1 + 1) = -ctemp8; - + b_offset1 += m; a_offset += 8; i --; } while(i>0); } - + i = (n & 3); if (i > 0){ do { ctemp1 = *(a_offset + 0); ctemp2 = *(a_offset + 1); - + *(b_offset1 + 0) = -ctemp1; *(b_offset1 + 1) = -ctemp2; - + b_offset1 += m; a_offset += 2; i --; diff --git a/kernel/generic/zneg_tcopy_2.c b/kernel/generic/zneg_tcopy_2.c index 40dd1151d..074f2f1fb 100644 --- a/kernel/generic/zneg_tcopy_2.c +++ b/kernel/generic/zneg_tcopy_2.c @@ -51,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ a_offset = a; b_offset = b; - + b_offset2 = b + m * (n & ~1) * 2; lda *= 2; @@ -73,46 +73,46 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); - + ctemp5 = *(a_offset1 + 4); ctemp6 = *(a_offset1 + 5); ctemp7 = *(a_offset1 + 6); ctemp8 = *(a_offset1 + 7); - + ctemp9 = *(a_offset2 + 0); ctemp10 = *(a_offset2 + 1); ctemp11 = *(a_offset2 + 2); ctemp12 = *(a_offset2 + 3); - + ctemp13 = *(a_offset2 + 4); ctemp14 = *(a_offset2 + 5); ctemp15 = *(a_offset2 + 6); ctemp16 = *(a_offset2 + 7); - + *(b_offset1 + 0) = -ctemp1; *(b_offset1 + 1) = -ctemp2; *(b_offset1 + 2) = -ctemp3; *(b_offset1 + 3) = -ctemp4; - + *(b_offset1 + 4) = -ctemp9; *(b_offset1 + 5) = -ctemp10; *(b_offset1 + 6) = -ctemp11; *(b_offset1 + 7) = -ctemp12; - + b_offset1 += m * 4; - + *(b_offset1 + 0) = -ctemp5; *(b_offset1 + 1) = -ctemp6; *(b_offset1 + 2) = -ctemp7; *(b_offset1 + 3) = -ctemp8; - + *(b_offset1 + 4) = -ctemp13; *(b_offset1 + 5) = -ctemp14; *(b_offset1 + 6) = -ctemp15; *(b_offset1 + 7) = -ctemp16; - + b_offset1 += m * 4; - + a_offset1 += 8; a_offset2 += 8; i --; @@ -124,33 +124,33 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp2 = *(a_offset1 + 1); ctemp3 = *(a_offset1 + 2); ctemp4 = *(a_offset1 + 3); - + ctemp9 = *(a_offset2 + 0); ctemp10 = *(a_offset2 + 1); ctemp11 = *(a_offset2 + 2); ctemp12 = *(a_offset2 + 3); - + *(b_offset1 + 0) = -ctemp1; *(b_offset1 + 1) = -ctemp2; *(b_offset1 + 2) = -ctemp3; *(b_offset1 + 3) = -ctemp4; - + *(b_offset1 + 4) = -ctemp9; *(b_offset1 + 5) = -ctemp10; *(b_offset1 + 6) = -ctemp11; *(b_offset1 + 7) = -ctemp12; - + b_offset1 += m * 4; a_offset1 += 4; a_offset2 += 4; } - + if (n & 1){ ctemp1 = *(a_offset1 + 0); ctemp2 = *(a_offset1 + 1); ctemp9 = *(a_offset2 + 0); ctemp10 = *(a_offset2 + 1); - + *(b_offset2 + 0) = -ctemp1; *(b_offset2 + 1) = -ctemp2; *(b_offset2 + 2) = -ctemp9; @@ -169,45 +169,45 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp2 = *(a_offset + 1); ctemp3 = *(a_offset + 2); ctemp4 = *(a_offset + 3); - + ctemp5 = *(a_offset + 4); ctemp6 = *(a_offset + 5); ctemp7 = *(a_offset + 6); ctemp8 = *(a_offset + 7); - + *(b_offset + 0) = -ctemp1; *(b_offset + 1) = -ctemp2; *(b_offset + 2) = -ctemp3; *(b_offset + 3) = -ctemp4; - + b_offset += m * 4; - + *(b_offset + 0) = -ctemp5; *(b_offset + 1) = -ctemp6; *(b_offset + 2) = -ctemp7; *(b_offset + 3) = -ctemp8; - + b_offset += m * 4; a_offset += 8; i --; } while(i > 0); } - + if (n & 2){ ctemp1 = *(a_offset + 0); ctemp2 = *(a_offset + 1); ctemp3 = *(a_offset + 2); ctemp4 = *(a_offset + 3); - + *(b_offset + 0) = -ctemp1; *(b_offset + 1) = -ctemp2; *(b_offset + 2) = -ctemp3; *(b_offset + 3) = -ctemp4; - + b_offset += m * 4; a_offset += 4; } - + if (n & 1){ ctemp1 = *(a_offset + 0); ctemp2 = *(a_offset + 1); diff --git a/kernel/generic/zneg_tcopy_4.c b/kernel/generic/zneg_tcopy_4.c index 7cd988754..cfdd23bfd 100644 --- a/kernel/generic/zneg_tcopy_4.c +++ b/kernel/generic/zneg_tcopy_4.c @@ -90,7 +90,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); - + ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); @@ -99,7 +99,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp14 = *(aoffset2 + 5); ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); - + ctemp17 = *(aoffset3 + 0); ctemp18 = *(aoffset3 + 1); ctemp19 = *(aoffset3 + 2); @@ -108,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp22 = *(aoffset3 + 5); ctemp23 = *(aoffset3 + 6); ctemp24 = *(aoffset3 + 7); - + ctemp25 = *(aoffset4 + 0); ctemp26 = *(aoffset4 + 1); ctemp27 = *(aoffset4 + 2); @@ -126,7 +126,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset1 + 5) = -ctemp06; *(boffset1 + 6) = -ctemp07; *(boffset1 + 7) = -ctemp08; - + *(boffset1 + 8) = -ctemp09; *(boffset1 + 9) = -ctemp10; *(boffset1 + 10) = -ctemp11; @@ -144,7 +144,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset1 + 21) = -ctemp22; *(boffset1 + 22) = -ctemp23; *(boffset1 + 23) = -ctemp24; - + *(boffset1 + 24) = -ctemp25; *(boffset1 + 25) = -ctemp26; *(boffset1 + 26) = -ctemp27; @@ -174,17 +174,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp06 = *(aoffset2 + 1); ctemp07 = *(aoffset2 + 2); ctemp08 = *(aoffset2 + 3); - + ctemp09 = *(aoffset3 + 0); ctemp10 = *(aoffset3 + 1); ctemp11 = *(aoffset3 + 2); ctemp12 = *(aoffset3 + 3); - + ctemp13 = *(aoffset4 + 0); ctemp14 = *(aoffset4 + 1); ctemp15 = *(aoffset4 + 2); ctemp16 = *(aoffset4 + 3); - + *(boffset2 + 0) = -ctemp01; *(boffset2 + 1) = -ctemp02; *(boffset2 + 2) = -ctemp03; @@ -193,7 +193,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset2 + 5) = -ctemp06; *(boffset2 + 6) = -ctemp07; *(boffset2 + 7) = -ctemp08; - + *(boffset2 + 8) = -ctemp09; *(boffset2 + 9) = -ctemp10; *(boffset2 + 10) = -ctemp11; @@ -202,12 +202,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset2 + 13) = -ctemp14; *(boffset2 + 14) = -ctemp15; *(boffset2 + 15) = -ctemp16; - + aoffset1 += 4; aoffset2 += 4; aoffset3 += 4; aoffset4 += 4; - + boffset2 += 16; } @@ -217,13 +217,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); - + ctemp05 = *(aoffset3 + 0); ctemp06 = *(aoffset3 + 1); - + ctemp07 = *(aoffset4 + 0); ctemp08 = *(aoffset4 + 1); - + *(boffset3 + 0) = -ctemp01; *(boffset3 + 1) = -ctemp02; *(boffset3 + 2) = -ctemp03; @@ -232,12 +232,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset3 + 5) = -ctemp06; *(boffset3 + 6) = -ctemp07; *(boffset3 + 7) = -ctemp08; - + aoffset1 += 2; aoffset2 += 2; aoffset3 += 2; aoffset4 += 2; - + boffset3 += 8; } j--; @@ -248,10 +248,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset1 = aoffset; aoffset2 = aoffset1 + lda; aoffset += 2 * lda; - + boffset1 = boffset; boffset += 16; - + i = (n >> 2); if (i > 0){ do{ @@ -263,7 +263,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); - + ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); @@ -272,7 +272,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp14 = *(aoffset2 + 5); ctemp15 = *(aoffset2 + 6); ctemp16 = *(aoffset2 + 7); - + *(boffset1 + 0) = -ctemp01; *(boffset1 + 1) = -ctemp02; *(boffset1 + 2) = -ctemp03; @@ -281,7 +281,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset1 + 5) = -ctemp06; *(boffset1 + 6) = -ctemp07; *(boffset1 + 7) = -ctemp08; - + *(boffset1 + 8) = -ctemp09; *(boffset1 + 9) = -ctemp10; *(boffset1 + 10) = -ctemp11; @@ -290,12 +290,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset1 + 13) = -ctemp14; *(boffset1 + 14) = -ctemp15; *(boffset1 + 15) = -ctemp16; - + aoffset1 += 8; aoffset2 += 8; aoffset3 += 8; aoffset4 += 8; - + boffset1 += m * 8; i --; }while(i > 0); @@ -306,12 +306,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); - + ctemp05 = *(aoffset2 + 0); ctemp06 = *(aoffset2 + 1); ctemp07 = *(aoffset2 + 2); ctemp08 = *(aoffset2 + 3); - + *(boffset2 + 0) = -ctemp01; *(boffset2 + 1) = -ctemp02; *(boffset2 + 2) = -ctemp03; @@ -320,34 +320,34 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset2 + 5) = -ctemp06; *(boffset2 + 6) = -ctemp07; *(boffset2 + 7) = -ctemp08; - + aoffset1 += 4; aoffset2 += 4; - + boffset2 += 8; } - + if (n & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset2 + 0); ctemp04 = *(aoffset2 + 1); - + *(boffset3 + 0) = -ctemp01; *(boffset3 + 1) = -ctemp02; *(boffset3 + 2) = -ctemp03; *(boffset3 + 3) = -ctemp04; - + aoffset1 += 2; aoffset2 += 2; boffset3 += 4; } } - + if (m & 1){ aoffset1 = aoffset; boffset1 = boffset; - + i = (n >> 2); if (i > 0){ do{ @@ -359,7 +359,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); - + *(boffset1 + 0) = -ctemp01; *(boffset1 + 1) = -ctemp02; *(boffset1 + 2) = -ctemp03; @@ -368,7 +368,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset1 + 5) = -ctemp06; *(boffset1 + 6) = -ctemp07; *(boffset1 + 7) = -ctemp08; - + aoffset1 += 8; boffset1 += m * 8; i --; @@ -380,7 +380,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp02 = *(aoffset1 + 1); ctemp03 = *(aoffset1 + 2); ctemp04 = *(aoffset1 + 3); - + *(boffset2 + 0) = -ctemp01; *(boffset2 + 1) = -ctemp02; *(boffset2 + 2) = -ctemp03; @@ -389,11 +389,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset1 += 4; boffset2 += 4; } - + if (n & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); - + *(boffset3 + 0) = -ctemp01; *(boffset3 + 1) = -ctemp02; } diff --git a/kernel/generic/zneg_tcopy_8.c b/kernel/generic/zneg_tcopy_8.c index fe8f25cba..cb1a62d60 100644 --- a/kernel/generic/zneg_tcopy_8.c +++ b/kernel/generic/zneg_tcopy_8.c @@ -71,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 16; - + i = (m >> 1); if (i > 0){ do{ @@ -117,7 +117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = -ctemp06; *(boffset + 6) = -ctemp07; *(boffset + 7) = -ctemp08; - + *(boffset + 8) = -ctemp09; *(boffset + 9) = -ctemp10; *(boffset + 10) = -ctemp11; @@ -126,7 +126,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 13) = -ctemp14; *(boffset + 14) = -ctemp15; *(boffset + 15) = -ctemp16; - + *(boffset + 16) = -ctemp17; *(boffset + 17) = -ctemp18; *(boffset + 18) = -ctemp19; @@ -170,7 +170,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp14 = *(aoffset1 + 13); ctemp15 = *(aoffset1 + 14); ctemp16 = *(aoffset1 + 15); - + *(boffset + 0) = -ctemp01; *(boffset + 1) = -ctemp02; *(boffset + 2) = -ctemp03; @@ -179,7 +179,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = -ctemp06; *(boffset + 6) = -ctemp07; *(boffset + 7) = -ctemp08; - + *(boffset + 8) = -ctemp09; *(boffset + 9) = -ctemp10; *(boffset + 10) = -ctemp11; @@ -200,7 +200,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 8; - + i = (m >> 1); if (i > 0){ do{ @@ -212,7 +212,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); - + ctemp09 = *(aoffset2 + 0); ctemp10 = *(aoffset2 + 1); ctemp11 = *(aoffset2 + 2); @@ -230,7 +230,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = -ctemp06; *(boffset + 6) = -ctemp07; *(boffset + 7) = -ctemp08; - + *(boffset + 8) = -ctemp09; *(boffset + 9) = -ctemp10; *(boffset + 10) = -ctemp11; @@ -239,15 +239,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 13) = -ctemp14; *(boffset + 14) = -ctemp15; *(boffset + 15) = -ctemp16; - + aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 16; - + i --; }while(i > 0); } - + if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); @@ -257,7 +257,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ ctemp06 = *(aoffset1 + 5); ctemp07 = *(aoffset1 + 6); ctemp08 = *(aoffset1 + 7); - + *(boffset + 0) = -ctemp01; *(boffset + 1) = -ctemp02; *(boffset + 2) = -ctemp03; @@ -266,7 +266,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = -ctemp06; *(boffset + 6) = -ctemp07; *(boffset + 7) = -ctemp08; - + boffset += 8; } } @@ -275,7 +275,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 4; - + i = (m >> 1); if (i > 0){ do{ @@ -297,15 +297,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 5) = -ctemp06; *(boffset + 6) = -ctemp07; *(boffset + 7) = -ctemp08; - + aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 8; - + i --; }while(i > 0); } - + if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); @@ -316,7 +316,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 1) = -ctemp02; *(boffset + 2) = -ctemp03; *(boffset + 3) = -ctemp04; - + boffset += 4; } } @@ -325,7 +325,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ aoffset1 = aoffset; aoffset2 = aoffset + lda; aoffset += 2; - + i = (m >> 1); if (i > 0){ do{ @@ -338,15 +338,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ *(boffset + 1) = -ctemp02; *(boffset + 2) = -ctemp03; *(boffset + 3) = -ctemp04; - + aoffset1 += 2 * lda; aoffset2 += 2 * lda; boffset += 4; - + i --; }while(i > 0); } - + if (m & 1){ ctemp01 = *(aoffset1 + 0); ctemp02 = *(aoffset1 + 1); diff --git a/kernel/generic/zsymm3m_lcopy_1.c b/kernel/generic/zsymm3m_lcopy_1.c index 0e0d5a3e3..4e5b29d2d 100644 --- a/kernel/generic/zsymm3m_lcopy_1.c +++ b/kernel/generic/zsymm3m_lcopy_1.c @@ -69,31 +69,31 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao1; js = n; - + while (js > 0){ - + offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; - + i = m; - + while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); - + if (offset > 0) ao1 += lda; else ao1 += 2; - + b[ 0] = data01; - + b ++; offset --; i --; } - + posX ++; js --; } - + return 0; } diff --git a/kernel/generic/zsymm3m_lcopy_2.c b/kernel/generic/zsymm3m_lcopy_2.c index 96686c1e4..edab3a477 100644 --- a/kernel/generic/zsymm3m_lcopy_2.c +++ b/kernel/generic/zsymm3m_lcopy_2.c @@ -72,7 +72,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; @@ -81,7 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); - + if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; @@ -101,14 +101,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; i = m; while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); - + if (offset > 0) ao1 += lda; else ao1 += 2; b[ 0] = data01; diff --git a/kernel/generic/zsymm3m_lcopy_4.c b/kernel/generic/zsymm3m_lcopy_4.c index 38a58cfcc..9c6f51f64 100644 --- a/kernel/generic/zsymm3m_lcopy_4.c +++ b/kernel/generic/zsymm3m_lcopy_4.c @@ -71,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON js = (n >> 2); while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; @@ -107,7 +107,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 2) { offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; @@ -116,7 +116,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); - + if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; @@ -134,14 +134,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; i = m; while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); - + if (offset > 0) ao1 += lda; else ao1 += 2; b[ 0] = data01; diff --git a/kernel/generic/zsymm3m_lcopy_8.c b/kernel/generic/zsymm3m_lcopy_8.c index 4e5cddcda..f38509210 100644 --- a/kernel/generic/zsymm3m_lcopy_8.c +++ b/kernel/generic/zsymm3m_lcopy_8.c @@ -72,7 +72,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; @@ -93,7 +93,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); - + if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; if (offset > -2) ao3 += lda; else ao3 += 2; @@ -124,7 +124,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 4) { offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; @@ -159,7 +159,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 2) { offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; @@ -168,7 +168,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); - + if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; @@ -186,14 +186,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; i = m; while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); - + if (offset > 0) ao1 += lda; else ao1 += 2; b[ 0] = data01; diff --git a/kernel/generic/zsymm3m_ucopy_1.c b/kernel/generic/zsymm3m_ucopy_1.c index 14ca6e76e..8bf4c8321 100644 --- a/kernel/generic/zsymm3m_ucopy_1.c +++ b/kernel/generic/zsymm3m_ucopy_1.c @@ -67,29 +67,29 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao1; lda *= 2; - + js = n; - + while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; - + i = m; - + while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); - + if (offset > 0) ao1 += 2; else ao1 += lda; - + b[ 0] = data01; - + b ++; - + offset --; i --; } - + posX ++; js --; } diff --git a/kernel/generic/zsymm3m_ucopy_2.c b/kernel/generic/zsymm3m_ucopy_2.c index 4ba1e6996..deed9eedc 100644 --- a/kernel/generic/zsymm3m_ucopy_2.c +++ b/kernel/generic/zsymm3m_ucopy_2.c @@ -72,7 +72,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; @@ -81,7 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); - + if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; @@ -93,21 +93,21 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON offset --; i --; } - + posX += 2; js --; } if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; i = m; while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); - + if (offset > 0) ao1 += 2; else ao1 += lda; b[ 0] = data01; diff --git a/kernel/generic/zsymm3m_ucopy_4.c b/kernel/generic/zsymm3m_ucopy_4.c index 8de026a5c..5737c0c83 100644 --- a/kernel/generic/zsymm3m_ucopy_4.c +++ b/kernel/generic/zsymm3m_ucopy_4.c @@ -72,7 +72,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; @@ -85,7 +85,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); - + if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; if (offset > -2) ao3 += 2; else ao3 += lda; @@ -108,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 2) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; @@ -117,7 +117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); - + if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; @@ -129,20 +129,20 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON offset --; i --; } - + posX += 2; } if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; i = m; while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); - + if (offset > 0) ao1 += 2; else ao1 += lda; b[ 0] = data01; diff --git a/kernel/generic/zsymm3m_ucopy_8.c b/kernel/generic/zsymm3m_ucopy_8.c index 79ef3649c..3aa1b078c 100644 --- a/kernel/generic/zsymm3m_ucopy_8.c +++ b/kernel/generic/zsymm3m_ucopy_8.c @@ -72,7 +72,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; @@ -93,7 +93,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); - + if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; if (offset > -2) ao3 += 2; else ao3 += lda; @@ -125,7 +125,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 4) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; @@ -138,7 +138,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); - + if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; if (offset > -2) ao3 += 2; else ao3 += lda; @@ -160,7 +160,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 2) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; @@ -169,7 +169,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); - + if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; @@ -181,20 +181,20 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON offset --; i --; } - + posX += 2; } if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; i = m; while (i > 0) { data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); - + if (offset > 0) ao1 += 2; else ao1 += lda; b[ 0] = data01; diff --git a/kernel/generic/zsymm_lcopy_1.c b/kernel/generic/zsymm_lcopy_1.c index 1b4f58d53..7f20a1f0a 100644 --- a/kernel/generic/zsymm_lcopy_1.c +++ b/kernel/generic/zsymm_lcopy_1.c @@ -53,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; i = m; @@ -61,7 +61,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + if (offset > 0) ao1 += lda; else ao1 += 2; b[ 0] = data01; diff --git a/kernel/generic/zsymm_lcopy_2.c b/kernel/generic/zsymm_lcopy_2.c index ce1b16e9c..735e8e739 100644 --- a/kernel/generic/zsymm_lcopy_2.c +++ b/kernel/generic/zsymm_lcopy_2.c @@ -53,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; @@ -64,7 +64,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); - + if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; @@ -86,7 +86,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; i = m; @@ -94,7 +94,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + if (offset > 0) ao1 += lda; else ao1 += 2; b[ 0] = data01; diff --git a/kernel/generic/zsymm_lcopy_4.c b/kernel/generic/zsymm_lcopy_4.c index dd2034d44..d2acea3de 100644 --- a/kernel/generic/zsymm_lcopy_4.c +++ b/kernel/generic/zsymm_lcopy_4.c @@ -53,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; @@ -70,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao3 + 1); data07 = *(ao4 + 0); data08 = *(ao4 + 1); - + if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; if (offset > -2) ao3 += lda; else ao3 += 2; @@ -98,7 +98,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 2) { offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; @@ -109,7 +109,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); - + if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; @@ -131,7 +131,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; i = m; @@ -139,7 +139,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + if (offset > 0) ao1 += lda; else ao1 += 2; b[ 0] = data01; diff --git a/kernel/generic/zsymm_lcopy_8.c b/kernel/generic/zsymm_lcopy_8.c index 33976124f..e3fbcb56c 100644 --- a/kernel/generic/zsymm_lcopy_8.c +++ b/kernel/generic/zsymm_lcopy_8.c @@ -53,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; @@ -121,7 +121,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 4) { offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; @@ -138,7 +138,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao3 + 1); data07 = *(ao4 + 0); data08 = *(ao4 + 1); - + if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; if (offset > -2) ao3 += lda; else ao3 += 2; @@ -165,7 +165,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 2) { offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; @@ -176,7 +176,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); - + if (offset > 0) ao1 += lda; else ao1 += 2; if (offset > -1) ao2 += lda; else ao2 += 2; @@ -198,7 +198,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; i = m; @@ -206,7 +206,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + if (offset > 0) ao1 += lda; else ao1 += 2; b[ 0] = data01; diff --git a/kernel/generic/zsymm_ucopy_1.c b/kernel/generic/zsymm_ucopy_1.c index 9943a2dad..d93b572da 100644 --- a/kernel/generic/zsymm_ucopy_1.c +++ b/kernel/generic/zsymm_ucopy_1.c @@ -52,7 +52,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; i = m; diff --git a/kernel/generic/zsymm_ucopy_2.c b/kernel/generic/zsymm_ucopy_2.c index da64cde15..4d948f7dd 100644 --- a/kernel/generic/zsymm_ucopy_2.c +++ b/kernel/generic/zsymm_ucopy_2.c @@ -52,7 +52,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; @@ -63,7 +63,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); - + if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; @@ -85,7 +85,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; i = m; @@ -93,7 +93,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + if (offset > 0) ao1 += 2; else ao1 += lda; b[ 0] = data01; diff --git a/kernel/generic/zsymm_ucopy_4.c b/kernel/generic/zsymm_ucopy_4.c index eed0bcacb..8cc326a09 100644 --- a/kernel/generic/zsymm_ucopy_4.c +++ b/kernel/generic/zsymm_ucopy_4.c @@ -52,7 +52,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; @@ -69,7 +69,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao3 + 1); data07 = *(ao4 + 0); data08 = *(ao4 + 1); - + if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; if (offset > -2) ao3 += 2; else ao3 += lda; @@ -97,7 +97,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 2) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; @@ -108,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); - + if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; @@ -129,7 +129,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; i = m; @@ -137,7 +137,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + if (offset > 0) ao1 += 2; else ao1 += lda; b[ 0] = data01; diff --git a/kernel/generic/zsymm_ucopy_8.c b/kernel/generic/zsymm_ucopy_8.c index c81a7a890..ea86676c7 100644 --- a/kernel/generic/zsymm_ucopy_8.c +++ b/kernel/generic/zsymm_ucopy_8.c @@ -53,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (js > 0){ offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; @@ -82,7 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data14 = *(ao7 + 1); data15 = *(ao8 + 0); data16 = *(ao8 + 1); - + if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; if (offset > -2) ao3 += 2; else ao3 += lda; @@ -122,7 +122,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 4) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; @@ -139,7 +139,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao3 + 1); data07 = *(ao4 + 0); data08 = *(ao4 + 1); - + if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; if (offset > -2) ao3 += 2; else ao3 += lda; @@ -166,7 +166,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 2) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; @@ -177,7 +177,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); - + if (offset > 0) ao1 += 2; else ao1 += lda; if (offset > -1) ao2 += 2; else ao2 += lda; @@ -198,7 +198,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 1) { offset = posX - posY; - + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; i = m; @@ -206,7 +206,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON while (i > 0) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + if (offset > 0) ao1 += 2; else ao1 += lda; b[ 0] = data01; diff --git a/kernel/generic/zsymv_k.c b/kernel/generic/zsymv_k.c index 211def30f..1e762eb56 100644 --- a/kernel/generic/zsymv_k.c +++ b/kernel/generic/zsymv_k.c @@ -72,14 +72,14 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, for(is = 0; is < offset; is += SYMV_P){ min_i = MIN(offset - is, SYMV_P); #endif - + #ifndef LOWER if (is >0){ GEMV_T(is, min_i, 0, alpha_r, alpha_i, a + is * lda * COMPSIZE, lda, X, 1, Y + is * COMPSIZE, 1, gemvbuffer); - + GEMV_N(is, min_i, 0, alpha_r, alpha_i, a + is * lda * COMPSIZE, lda, X + is * COMPSIZE, 1, @@ -92,12 +92,12 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, #else ZSYMCOPY_U(min_i, a + (is + is * lda) * COMPSIZE, lda, symbuffer); #endif - + GEMV_N(min_i, min_i, 0, alpha_r, alpha_i, symbuffer, min_i, X + is * COMPSIZE, 1, Y + is * COMPSIZE, 1, gemvbuffer); - + #ifdef LOWER if (m - is > min_i){ @@ -105,7 +105,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, a + ((is + min_i) + is * lda) * COMPSIZE, lda, X + (is + min_i) * COMPSIZE, 1, Y + is * COMPSIZE, 1, gemvbuffer); - + GEMV_N(m - is - min_i, min_i, 0, alpha_r, alpha_i, a + ((is + min_i) + is * lda) * COMPSIZE, lda, X + is * COMPSIZE, 1, diff --git a/kernel/generic/ztrmm_lncopy_1.c b/kernel/generic/ztrmm_lncopy_1.c index 15a05090b..f0f8827f2 100644 --- a/kernel/generic/ztrmm_lncopy_1.c +++ b/kernel/generic/ztrmm_lncopy_1.c @@ -72,7 +72,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao1 += 2; b += 2; - } else + } else if (X < posY) { ao1 += lda; b += 2; diff --git a/kernel/generic/ztrmm_lncopy_2.c b/kernel/generic/ztrmm_lncopy_2.c index f41ee5b93..c620c78c1 100644 --- a/kernel/generic/ztrmm_lncopy_2.c +++ b/kernel/generic/ztrmm_lncopy_2.c @@ -76,7 +76,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data05; @@ -89,7 +89,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao1 += 4; ao2 += 4; b += 8; - } else + } else if (X < posY) { ao1 += 2 * lda; ao2 += 2 * lda; @@ -136,13 +136,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } if (m & 1) { - + if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -151,7 +151,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao1 += 2; ao2 += 2; b += 4; - } else + } else if (X < posY) { ao1 += lda; b += 4; @@ -200,7 +200,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 1] = data02; b += 2; ao1 += 2; - } else + } else if (X < posY) { b += 2; ao1 += lda; diff --git a/kernel/generic/ztrmm_lncopy_4.c b/kernel/generic/ztrmm_lncopy_4.c index 76170c766..5442105c5 100644 --- a/kernel/generic/ztrmm_lncopy_4.c +++ b/kernel/generic/ztrmm_lncopy_4.c @@ -81,7 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); - + data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); @@ -90,7 +90,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); - + data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); @@ -99,7 +99,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); - + data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); @@ -108,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data30 = *(ao4 + 5); data31 = *(ao4 + 6); data32 = *(ao4 + 7); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; @@ -117,7 +117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data18; b[ 6] = data25; b[ 7] = data26; - + b[ 8] = data03; b[ 9] = data04; b[10] = data11; @@ -126,7 +126,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[13] = data20; b[14] = data27; b[15] = data28; - + b[16] = data05; b[17] = data06; b[18] = data13; @@ -135,7 +135,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[21] = data22; b[22] = data29; b[23] = data30; - + b[24] = data07; b[25] = data08; b[26] = data15; @@ -144,14 +144,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[29] = data24; b[30] = data31; b[31] = data32; - + ao1 += 8; ao2 += 8; ao3 += 8; ao4 += 8; b += 32; - } else + } else if (X < posY) { ao1 += 4 * lda; ao2 += 4 * lda; @@ -167,15 +167,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); - + data13 = *(ao2 + 4); data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); - + data23 = *(ao3 + 6); data24 = *(ao3 + 7); - + b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = ZERO; @@ -184,7 +184,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; - + b[ 8] = data03; b[ 9] = data04; b[10] = ONE; @@ -193,7 +193,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; - + b[16] = data05; b[17] = data06; b[18] = data13; @@ -202,7 +202,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[21] = ZERO; b[22] = ZERO; b[23] = ZERO; - + b[24] = data07; b[25] = data08; b[26] = data15; @@ -220,22 +220,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); - + data11 = *(ao2 + 2); data12 = *(ao2 + 3); data13 = *(ao2 + 4); data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); - + data21 = *(ao3 + 4); data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); - + data31 = *(ao4 + 6); data32 = *(ao4 + 7); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = ZERO; @@ -244,7 +244,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; - + b[ 8] = data03; b[ 9] = data04; b[10] = data11; @@ -253,7 +253,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; - + b[16] = data05; b[17] = data06; b[18] = data13; @@ -262,7 +262,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[21] = data22; b[22] = ZERO; b[23] = ZERO; - + b[24] = data07; b[25] = data08; b[26] = data15; @@ -286,7 +286,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 3); if (i) { - + if (X > posY) { if (m & 2) { @@ -294,22 +294,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); - + data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); - + data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; @@ -318,7 +318,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data18; b[ 6] = data25; b[ 7] = data26; - + b[ 8] = data03; b[ 9] = data04; b[10] = data11; @@ -327,14 +327,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[13] = data20; b[14] = data27; b[15] = data28; - + ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; b += 16; } - + if (m & 1) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); @@ -345,7 +345,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data18 = *(ao3 + 1); data25 = *(ao4 + 0); data26 = *(ao4 + 1); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; @@ -354,27 +354,27 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data18; b[ 6] = data25; b[ 7] = data26; - + ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; b += 8; } - - } else + + } else if (X < posY) { if (m & 2) { ao1 += 2 * lda; ao2 += 2 * lda; b += 16; } - + if (m & 1) { ao1 += lda; b += 8; } - + } else { #ifdef UNIT @@ -389,7 +389,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data13 = *(ao2 + 4); data14 = *(ao2 + 5); } - + b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = ZERO; @@ -411,7 +411,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 7] = ZERO; b += 8; } - + if (i >= 3) { b[ 0] = data05; b[ 1] = data06; @@ -442,7 +442,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data21 = *(ao3 + 4); data22 = *(ao3 + 5); } - + b[ 0] = data01; b[ 1] = data02; b[ 2] = ZERO; @@ -464,7 +464,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 7] = ZERO; b += 8; } - + if (i >= 3) { b[ 0] = data05; b[ 1] = data06; @@ -505,12 +505,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; @@ -524,17 +524,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 4; b += 8; - } else + } else if (X < posY) { ao1 += 2 * lda; ao2 += 2 * lda; - + b += 8; } else { #ifdef UNIT data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = ZERO; @@ -548,10 +548,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + data11 = *(ao2 + 2); data12 = *(ao2 + 3); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = ZERO; @@ -563,7 +563,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #endif ao1 += 4; ao2 += 4; - + b += 8; } @@ -574,13 +574,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 1); if (i) { - + if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data09 = *(ao2 + 0); data10 = *(ao2 + 1); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; @@ -589,7 +589,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao1 += 2; ao2 += 2; b += 4; - } else + } else if (X < posY) { ao1 += lda; b += 4; @@ -651,7 +651,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao1 += lda; b += 2; } - + X ++; i --; } while (i > 0); diff --git a/kernel/generic/ztrmm_lncopy_8.c b/kernel/generic/ztrmm_lncopy_8.c index 308ddd75f..71d3bf1ce 100644 --- a/kernel/generic/ztrmm_lncopy_8.c +++ b/kernel/generic/ztrmm_lncopy_8.c @@ -79,7 +79,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON do { if (X > posY) { for (ii = 0; ii < 8; ii++){ - + b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); b[ 2] = *(ao2 + 0); @@ -88,7 +88,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(ao3 + 1); b[ 6] = *(ao4 + 0); b[ 7] = *(ao4 + 1); - + b[ 8] = *(ao5 + 0); b[ 9] = *(ao5 + 1); b[ 10] = *(ao6 + 0); @@ -97,7 +97,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 13] = *(ao7 + 1); b[ 14] = *(ao8 + 0); b[ 15] = *(ao8 + 1); - + ao1 += 2; ao2 += 2; ao3 += 2; @@ -108,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao8 += 2; b += 16; } - } else + } else if (X < posY) { ao1 += 8 * lda; ao2 += 8 * lda; @@ -118,7 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao6 += 8 * lda; ao7 += 8 * lda; ao8 += 8 * lda; - + b += 128; } else { @@ -143,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 13] = ZERO; b[ 14] = ZERO; b[ 15] = ZERO; - + b[ 16] = *(ao1 + 2); b[ 17] = *(ao1 + 3); #ifdef UNIT @@ -297,7 +297,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[126] = *(ao8 + 14); b[127] = *(ao8 + 15); #endif - + ao1 += 16; ao2 += 16; ao3 += 16; @@ -316,7 +316,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 7); if (i) { - + if (X > posY) { for (ii = 0; ii < i; ii++){ @@ -328,7 +328,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(ao3 + 1); b[ 6] = *(ao4 + 0); b[ 7] = *(ao4 + 1); - + b[ 8] = *(ao5 + 0); b[ 9] = *(ao5 + 1); b[ 10] = *(ao6 + 0); @@ -337,7 +337,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 13] = *(ao7 + 1); b[ 14] = *(ao8 + 0); b[ 15] = *(ao8 + 1); - + ao1 += 2; ao2 += 2; ao3 += 2; @@ -348,7 +348,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao8 += 2; b += 16; } - } else + } else if (X < posY) { ao1 += i * lda; ao2 += i * lda; @@ -569,14 +569,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(ao3 + 1); b[ 6] = *(ao4 + 0); b[ 7] = *(ao4 + 1); - + ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; b += 8; } - } else + } else if (X < posY) { ao1 += 4 * lda; ao2 += 4 * lda; @@ -597,7 +597,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; - + b[ 8] = *(ao1 + 2); b[ 9] = *(ao1 + 3); #ifdef UNIT @@ -654,7 +654,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 3); if (i) { - + if (X > posY) { for (ii = 0; ii < i; ii++){ @@ -666,14 +666,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(ao3 + 1); b[ 6] = *(ao4 + 0); b[ 7] = *(ao4 + 1); - + ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; b += 8; } - } else + } else if (X < posY) { ao1 += i * lda; ao2 += i * lda; @@ -695,7 +695,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 6] = ZERO; b[ 7] = ZERO; b += 8; - + if (i >= 2) { b[ 0] = *(ao1 + 2); b[ 1] = *(ao1 + 3); @@ -758,11 +758,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(ao1 + 3); b[ 6] = *(ao2 + 2); b[ 7] = *(ao2 + 3); - + ao1 += 4; ao2 += 4; b += 8; - } else + } else if (X < posY) { ao1 += 2 * lda; ao2 += 2 * lda; @@ -777,7 +777,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #endif b[ 2] = ZERO; b[ 3] = ZERO; - + b[ 4] = *(ao1 + 2); b[ 5] = *(ao1 + 3); #ifdef UNIT @@ -798,7 +798,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } if (m & 1) { - + if (X > posY) { b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); @@ -807,7 +807,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao1 += 2; ao2 += 2; b += 4; - } else + } else if (X < posY) { ao1 += 2 * lda; ao2 += 2 * lda; @@ -823,7 +823,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 2] = ZERO; b[ 3] = ZERO; b += 4; - } + } } posY += 2; } @@ -845,7 +845,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 1] = *(ao1 + 1); ao1 += 2; b += 2; - } else + } else if (X < posY) { ao1 += lda; b += 2; diff --git a/kernel/generic/ztrmm_ltcopy_1.c b/kernel/generic/ztrmm_ltcopy_1.c index 1229b4587..2fcd8dbd2 100644 --- a/kernel/generic/ztrmm_ltcopy_1.c +++ b/kernel/generic/ztrmm_ltcopy_1.c @@ -66,11 +66,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (X > posY) { ao1 += 2; b += 2; - } else + } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + b[ 0] = data01; b[ 1] = data02; ao1 += lda; @@ -82,7 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + b[ 0] = data01; b[ 1] = data02; #endif diff --git a/kernel/generic/ztrmm_ltcopy_2.c b/kernel/generic/ztrmm_ltcopy_2.c index 7bcadf3f1..457890ceb 100644 --- a/kernel/generic/ztrmm_ltcopy_2.c +++ b/kernel/generic/ztrmm_ltcopy_2.c @@ -72,18 +72,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 4; b += 8; - } else + } else if (X < posY) { data1 = *(ao1 + 0); data2 = *(ao1 + 1); data3 = *(ao1 + 2); data4 = *(ao1 + 3); - + data5 = *(ao2 + 0); data6 = *(ao2 + 1); data7 = *(ao2 + 2); data8 = *(ao2 + 3); - + b[ 0] = data1; b[ 1] = data2; b[ 2] = data3; @@ -92,7 +92,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data6; b[ 6] = data7; b[ 7] = data8; - + ao1 += 2 * lda; ao2 += 2 * lda; b += 8; @@ -101,7 +101,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #ifdef UNIT data3 = *(ao1 + 2); data4 = *(ao1 + 3); - + b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = data3; @@ -115,10 +115,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data2 = *(ao1 + 1); data3 = *(ao1 + 2); data4 = *(ao1 + 3); - + data7 = *(ao2 + 2); data8 = *(ao2 + 3); - + b[ 0] = data1; b[ 1] = data2; b[ 2] = data3; @@ -127,7 +127,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = ZERO; b[ 6] = data7; b[ 7] = data8; -#endif +#endif ao1 += 4; ao2 += 4; b += 8; @@ -139,19 +139,19 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } if (m & 1) { - + if (X > posY) { ao1 += 2; ao2 += 2; b += 4; - - } else + + } else if (X < posY) { data1 = *(ao1 + 0); data2 = *(ao1 + 1); data3 = *(ao1 + 2); data4 = *(ao1 + 3); - + b[ 0] = data1; b[ 1] = data2; b[ 2] = data3; @@ -208,7 +208,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (X < posY) { data1 = *(ao1 + 0); data2 = *(ao1 + 1); - + b[ 0] = data1; b[ 1] = data2; b += 2; @@ -220,7 +220,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #else data1 = *(ao1 + 0); data2 = *(ao1 + 1); - + b[ 0] = data1; b[ 1] = data2; #endif diff --git a/kernel/generic/ztrmm_ltcopy_4.c b/kernel/generic/ztrmm_ltcopy_4.c index e43ed1269..42a809ba4 100644 --- a/kernel/generic/ztrmm_ltcopy_4.c +++ b/kernel/generic/ztrmm_ltcopy_4.c @@ -80,7 +80,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao4 += 8; b += 32; - } else + } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); @@ -90,7 +90,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); - + data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); @@ -99,7 +99,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); - + data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); @@ -108,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); - + data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); @@ -117,7 +117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data30 = *(ao4 + 5); data31 = *(ao4 + 6); data32 = *(ao4 + 7); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -126,7 +126,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; - + b[ 8] = data09; b[ 9] = data10; b[10] = data11; @@ -135,7 +135,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[13] = data14; b[14] = data15; b[15] = data16; - + b[16] = data17; b[17] = data18; b[18] = data19; @@ -144,7 +144,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[21] = data22; b[22] = data23; b[23] = data24; - + b[24] = data25; b[25] = data26; b[26] = data27; @@ -153,7 +153,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[29] = data30; b[30] = data31; b[31] = data32; - + ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; @@ -168,16 +168,16 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); - + data13 = *(ao2 + 4); data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); - + data23 = *(ao3 + 6); data24 = *(ao3 + 7); - - + + b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = data03; @@ -186,7 +186,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; - + b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ONE; @@ -195,7 +195,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[13] = data14; b[14] = data15; b[15] = data16; - + b[16] = ZERO; b[17] = ZERO; b[18] = ZERO; @@ -204,7 +204,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[21] = ZERO; b[22] = data23; b[23] = data24; - + b[24] = ZERO; b[25] = ZERO; b[26] = ZERO; @@ -222,22 +222,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); - + data11 = *(ao2 + 2); data12 = *(ao2 + 3); data13 = *(ao2 + 4); data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); - + data21 = *(ao3 + 4); data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); - + data31 = *(ao4 + 6); data32 = *(ao4 + 7); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -246,7 +246,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; - + b[ 8] = ZERO; b[ 9] = ZERO; b[10] = data11; @@ -255,7 +255,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[13] = data14; b[14] = data15; b[15] = data16; - + b[16] = ZERO; b[17] = ZERO; b[18] = ZERO; @@ -264,7 +264,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[21] = data22; b[22] = data23; b[23] = data24; - + b[24] = ZERO; b[25] = ZERO; b[26] = ZERO; @@ -288,7 +288,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 3); if (i) { - + if (X > posY) { if (m & 2) { @@ -298,7 +298,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao4 += 4; b += 16; } - + if (m & 1) { ao1 += 2; ao2 += 2; @@ -306,8 +306,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao4 += 2; b += 8; } - - } else + + } else if (X < posY) { if (m & 2) { data01 = *(ao1 + 0); @@ -318,7 +318,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); - + data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); @@ -327,7 +327,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -336,7 +336,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; - + b[ 8] = data09; b[ 9] = data10; b[10] = data11; @@ -345,13 +345,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[13] = data14; b[14] = data15; b[15] = data16; - + ao1 += 2 * lda; ao2 += 2 * lda; - + b += 16; } - + if (m & 1) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); @@ -361,7 +361,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -370,11 +370,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; - + ao1 += lda; b += 8; } - + } else { #ifdef UNIT data03 = *(ao1 + 2); @@ -383,7 +383,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); - + if (i >= 2) { data13 = *(ao2 + 4); data14 = *(ao2 + 5); @@ -395,7 +395,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data23 = *(ao3 + 6); data24 = *(ao3 + 7); } - + b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = data03; @@ -438,7 +438,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); - + if (i >= 2) { data11 = *(ao2 + 2); data12 = *(ao2 + 3); @@ -454,7 +454,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data23 = *(ao3 + 6); data24 = *(ao3 + 7); } - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -517,18 +517,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 4; b += 8; - } else + } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -537,7 +537,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data10; b[ 6] = data11; b[ 7] = data12; - + ao1 += 2 * lda; ao2 += 2 * lda; b += 8; @@ -546,7 +546,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #ifdef UNIT data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = data03; @@ -560,10 +560,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + data11 = *(ao2 + 2); data12 = *(ao2 + 3); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -586,19 +586,19 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 1); if (i) { - + if (X > posY) { ao1 += 2; ao2 += 2; - + b += 4; - } else + } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -611,7 +611,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #ifdef UNIT data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = data03; @@ -621,7 +621,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -645,18 +645,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = m; if (i > 0) { do { - + if (X > posY) { b += 2; ao1 += 2; - } else + } else if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + b[ 0] = data01; b[ 1] = data02; - ao1 += lda; + ao1 += lda; b += 2; } else { @@ -666,7 +666,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + b[ 0] = data01; b[ 1] = data02; #endif diff --git a/kernel/generic/ztrmm_ltcopy_8.c b/kernel/generic/ztrmm_ltcopy_8.c index e25d9221e..09cb8037e 100644 --- a/kernel/generic/ztrmm_ltcopy_8.c +++ b/kernel/generic/ztrmm_ltcopy_8.c @@ -86,11 +86,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a07 += 16; a08 += 16; b += 128; - } else + } else if (X < posY) { for (ii = 0; ii < 8; ii++){ - + b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); @@ -99,7 +99,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); - + b[ 8] = *(a01 + 8); b[ 9] = *(a01 + 9); b[ 10] = *(a01 + 10); @@ -108,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 13] = *(a01 + 13); b[ 14] = *(a01 + 14); b[ 15] = *(a01 + 15); - + a01 += lda; b += 16; } @@ -120,7 +120,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a06 += 8 * lda; a07 += 8 * lda; a08 += 8 * lda; - + } else { #ifdef UNIT b[ 0] = ONE; @@ -143,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 13] = *(a01 + 13); b[ 14] = *(a01 + 14); b[ 15] = *(a01 + 15); - + b[ 16] = ZERO; b[ 17] = ZERO; #ifdef UNIT @@ -313,7 +313,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i --; } while (i > 0); } - + i = (m & 7); if (i > 0) { if (X > posY) { @@ -326,7 +326,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a07 += 2 * i; a08 += 2 * i; b += 16 * i; - } else + } else if (X < posY) { for (ii = 0; ii < i; ii++){ b[ 0] = *(a01 + 0); @@ -337,7 +337,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); - + b[ 8] = *(a01 + 8); b[ 9] = *(a01 + 9); b[ 10] = *(a01 + 10); @@ -346,7 +346,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 13] = *(a01 + 13); b[ 14] = *(a01 + 14); b[ 15] = *(a01 + 15); - + a01 += lda; a02 += lda; a03 += lda; @@ -371,7 +371,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); - + b[ 8] = *(a01 + 8); b[ 9] = *(a01 + 9); b[ 10] = *(a01 + 10); @@ -396,7 +396,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(a02 + 5); b[ 6] = *(a02 + 6); b[ 7] = *(a02 + 7); - + b[ 8] = *(a02 + 8); b[ 9] = *(a02 + 9); b[10] = *(a02 + 10); @@ -422,7 +422,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #endif b[ 6] = *(a03 + 6); b[ 7] = *(a03 + 7); - + b[ 8] = *(a03 + 8); b[ 9] = *(a03 + 9); b[10] = *(a03 + 10); @@ -448,7 +448,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 6] = *(a04 + 6); b[ 7] = *(a04 + 7); #endif - + b[ 8] = *(a04 + 8); b[ 9] = *(a04 + 9); b[10] = *(a04 + 10); @@ -469,7 +469,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; - + #ifdef UNIT b[ 8] = ONE; b[ 9] = ZERO; @@ -495,7 +495,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; - + b[ 8] = ZERO; b[ 9] = ZERO; #ifdef UNIT @@ -521,7 +521,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; - + b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ZERO; @@ -537,7 +537,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[15] = *(a07 + 15); b += 16; } - } + } } posY += 8; @@ -548,7 +548,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 4){ X = posX; - + if (posX <= posY) { a01 = a + posY * 2 + (posX + 0) * lda; a02 = a + posY * 2 + (posX + 1) * lda; @@ -560,7 +560,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a03 = a + posX * 2 + (posY + 2) * lda; a04 = a + posX * 2 + (posY + 3) * lda; } - + i = (m >> 2); if (i > 0) { do { @@ -570,7 +570,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a03 += 8; a04 += 8; b += 32; - } else + } else if (X < posY) { for (ii = 0; ii < 4; ii++){ b[ 0] = *(a01 + 0); @@ -581,7 +581,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); - + a01 += lda; b += 8; } @@ -603,7 +603,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); - + b[ 8] = ZERO; b[ 9] = ZERO; #ifdef UNIT @@ -631,7 +631,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #endif b[ 22] = *(a03 + 6); b[ 23] = *(a03 + 7); - + b[ 24] = ZERO; b[ 25] = ZERO; b[ 26] = ZERO; @@ -645,19 +645,19 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 30] = *(a04 + 6); b[ 31] = *(a04 + 7); #endif - + a01 += 8; a02 += 8; a03 += 8; a04 += 8; b += 32; } - + X += 4; i --; } while (i > 0); } - + i = (m & 3); if (i > 0) { if (X > posY) { @@ -666,7 +666,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a03 += 2 * i; a04 += 2 * i; b += 8 * i; - } else + } else if (X < posY) { for (ii = 0; ii < i; ii++){ b[ 0] = *(a01 + 0); @@ -677,7 +677,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); - + a01 += lda; a02 += lda; a03 += lda; @@ -740,7 +740,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 2){ X = posX; - + if (posX <= posY) { a01 = a + posY * 2 + (posX + 0) * lda; a02 = a + posY * 2 + (posX + 1) * lda; @@ -748,7 +748,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a01 = a + posX * 2 + (posY + 0) * lda; a02 = a + posX * 2 + (posY + 1) * lda; } - + i = (m >> 1); if (i > 0) { do { @@ -756,7 +756,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a01 += 4; a02 += 4; b += 8; - } else + } else if (X < posY) { b[0] = *(a01 + 0); b[1] = *(a01 + 1); @@ -779,7 +779,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #endif b[2] = *(a01 + 2); b[3] = *(a01 + 3); - + b[4] = ZERO; b[5] = ZERO; #ifdef UNIT @@ -788,30 +788,30 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #else b[6] = *(a02 + 2); b[7] = *(a02 + 3); -#endif +#endif a01 += 4; a02 += 4; b += 8; } - + X += 2; i --; } while (i > 0); } - + i = (m & 1); if (i > 0) { if (X > posY) { a01 += 2; a02 += 2; b += 4; - } else + } else if (X < posY) { b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); b[ 3] = *(a01 + 3); - + a01 += lda; a02 += lda; b += 4; @@ -833,17 +833,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 1){ X = posX; - + if (posX <= posY) { a01 = a + posY * 2 + (posX + 0) * lda; } else { a01 = a + posX * 2 + (posY + 0) * lda; } - + i = m; if (i > 0) { do { - + if (X > posY) { a01 += 2; b += 2; @@ -864,7 +864,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a01 += 2; b += 2; } - + X += 1; i --; } while (i > 0); diff --git a/kernel/generic/ztrmm_uncopy_1.c b/kernel/generic/ztrmm_uncopy_1.c index 595f00955..2782cdd05 100644 --- a/kernel/generic/ztrmm_uncopy_1.c +++ b/kernel/generic/ztrmm_uncopy_1.c @@ -63,7 +63,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = m; if (i > 0) { do { - + if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); @@ -74,7 +74,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao1 += 2; b += 2; - } else + } else if (X > posY) { ao1 += lda; b += 2; diff --git a/kernel/generic/ztrmm_uncopy_2.c b/kernel/generic/ztrmm_uncopy_2.c index 6beddf5b9..c2521d3c3 100644 --- a/kernel/generic/ztrmm_uncopy_2.c +++ b/kernel/generic/ztrmm_uncopy_2.c @@ -76,7 +76,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao2 + 1); data07 = *(ao2 + 2); data08 = *(ao2 + 3); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data05; @@ -90,7 +90,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 4; b += 8; - } else + } else if (X > posY) { ao1 += 2 * lda; ao2 += 2 * lda; @@ -141,22 +141,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } if (m & 1) { - + if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; - + ao1 += 2; ao2 += 2; b += 4; - } else + } else if (X > posY) { ao1 += lda; b += 4; @@ -174,7 +174,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao2 + 0); data04 = *(ao2 + 1); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -191,17 +191,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 1){ X = posX; - + if (posX <= posY) { ao1 = a + posX * 2 + (posY + 0) * lda; } else { ao1 = a + posY * 2 + (posX + 0) * lda; } - + i = m; if (m > 0) { do { - + if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); @@ -228,7 +228,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b += 2; ao1 += lda; } - + X += 1; i --; } while (i > 0); diff --git a/kernel/generic/ztrmm_uncopy_4.c b/kernel/generic/ztrmm_uncopy_4.c index f885b0dc2..249faac1d 100644 --- a/kernel/generic/ztrmm_uncopy_4.c +++ b/kernel/generic/ztrmm_uncopy_4.c @@ -81,7 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); - + data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); @@ -90,7 +90,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); - + data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); @@ -99,7 +99,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); - + data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); @@ -108,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data30 = *(ao4 + 5); data31 = *(ao4 + 6); data32 = *(ao4 + 7); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; @@ -117,7 +117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data18; b[ 6] = data25; b[ 7] = data26; - + b[ 8] = data03; b[ 9] = data04; b[10] = data11; @@ -126,7 +126,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[13] = data20; b[14] = data27; b[15] = data28; - + b[16] = data05; b[17] = data06; b[18] = data13; @@ -135,7 +135,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[21] = data22; b[22] = data29; b[23] = data30; - + b[24] = data07; b[25] = data08; b[26] = data15; @@ -150,7 +150,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao3 += 8; ao4 += 8; b += 32; - } else + } else if (X > posY) { ao1 += 4 * lda; ao2 += 4 * lda; @@ -161,19 +161,19 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #ifdef UNIT data09 = *(ao2 + 0); data10 = *(ao2 + 1); - + data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); - + data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); data29 = *(ao4 + 4); data30 = *(ao4 + 5); - + b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = data09; @@ -182,7 +182,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data18; b[ 6] = data25; b[ 7] = data26; - + b[ 8] = ZERO; b[ 9] = ZERO; b[10] = ONE; @@ -191,7 +191,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[13] = data20; b[14] = data27; b[15] = data28; - + b[16] = ZERO; b[17] = ZERO; b[18] = ZERO; @@ -200,7 +200,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[21] = ZERO; b[22] = data29; b[23] = data30; - + b[24] = ZERO; b[25] = ZERO; b[26] = ZERO; @@ -212,19 +212,19 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); - + data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); data21 = *(ao3 + 4); data22 = *(ao3 + 5); - + data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); @@ -233,7 +233,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data30 = *(ao4 + 5); data31 = *(ao4 + 6); data32 = *(ao4 + 7); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; @@ -242,7 +242,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data18; b[ 6] = data25; b[ 7] = data26; - + b[ 8] = ZERO; b[ 9] = ZERO; b[10] = data11; @@ -251,7 +251,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[13] = data20; b[14] = data27; b[15] = data28; - + b[16] = ZERO; b[17] = ZERO; b[18] = ZERO; @@ -260,7 +260,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[21] = data22; b[22] = data29; b[23] = data30; - + b[24] = ZERO; b[25] = ZERO; b[26] = ZERO; @@ -274,7 +274,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 8; ao3 += 8; ao4 += 8; - + b += 32; } @@ -285,7 +285,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 3); if (i) { - + if (X < posY) { if (m & 2) { @@ -293,22 +293,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); - + data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); - + data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; @@ -317,7 +317,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data18; b[ 6] = data25; b[ 7] = data26; - + b[ 8] = data03; b[ 9] = data04; b[10] = data11; @@ -326,25 +326,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[13] = data20; b[14] = data27; b[15] = data28; - + ao1 += 4; ao2 += 4; ao3 += 4; ao4 += 4; b += 16; } - + if (m & 1) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data09 = *(ao2 + 0); data10 = *(ao2 + 1); - + data17 = *(ao3 + 0); data18 = *(ao3 + 1); data25 = *(ao4 + 0); data26 = *(ao4 + 1); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; @@ -353,27 +353,27 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data18; b[ 6] = data25; b[ 7] = data26; - + ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; b += 8; } - - } else + + } else if (X > posY) { if (m & 2) { ao1 += 2 * lda; ao2 += 2 * lda; b += 16; } - + if (m & 1) { ao1 += lda; b += 8; } - + } else { #ifdef UNIT @@ -405,7 +405,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 6] = data25; b[ 7] = data26; b += 8; - + if (i >= 2) { b[ 0] = ZERO; b[ 1] = ZERO; @@ -465,7 +465,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 6] = data25; b[ 7] = data26; b += 8; - + if (i >= 2) { b[ 0] = ZERO; b[ 1] = ZERO; @@ -517,12 +517,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; @@ -536,7 +536,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 4; b += 8; - } else + } else if (X > posY) { ao1 += 2 * lda; ao2 += 2 * lda; @@ -558,12 +558,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; @@ -585,13 +585,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 1); if (i) { - + if (X < posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data09 = *(ao2 + 0); data10 = *(ao2 + 1); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; @@ -599,7 +599,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao1 += 2; ao2 += 2; b += 4; - } else + } else if (X > posY) { ao1 += 2 * lda; ao2 += 2 * lda; @@ -608,7 +608,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #ifdef UNIT data09 = *(ao2 + 0); data10 = *(ao2 + 1); - + b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = data09; @@ -618,7 +618,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data02 = *(ao1 + 1); data09 = *(ao2 + 0); data10 = *(ao2 + 1); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data09; diff --git a/kernel/generic/ztrmm_uncopy_8.c b/kernel/generic/ztrmm_uncopy_8.c index c02c1dedf..faadd2196 100644 --- a/kernel/generic/ztrmm_uncopy_8.c +++ b/kernel/generic/ztrmm_uncopy_8.c @@ -72,14 +72,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao7 = a + posY * 2 + (posX + 6) * lda; ao8 = a + posY * 2 + (posX + 7) * lda; } - + i = (m >> 3); if (i > 0) { do { if (X < posY) { for (ii = 0; ii < 8; ii++){ - + b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); b[ 2] = *(ao2 + 0); @@ -88,7 +88,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(ao3 + 1); b[ 6] = *(ao4 + 0); b[ 7] = *(ao4 + 1); - + b[ 8] = *(ao5 + 0); b[ 9] = *(ao5 + 1); b[ 10] = *(ao6 + 0); @@ -97,7 +97,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 13] = *(ao7 + 1); b[ 14] = *(ao8 + 0); b[ 15] = *(ao8 + 1); - + ao1 += 2; ao2 += 2; ao3 += 2; @@ -108,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao8 += 2; b += 16; } - } else + } else if (X > posY) { ao1 += 8 * lda; ao2 += 8 * lda; @@ -118,7 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao6 += 8 * lda; ao7 += 8 * lda; ao8 += 8 * lda; - + b += 128; } else { @@ -135,7 +135,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(ao3 + 1); b[ 6] = *(ao4 + 0); b[ 7] = *(ao4 + 1); - + b[ 8] = *(ao5 + 0); b[ 9] = *(ao5 + 1); b[ 10] = *(ao6 + 0); @@ -144,7 +144,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 13] = *(ao7 + 1); b[ 14] = *(ao8 + 0); b[ 15] = *(ao8 + 1); - + b[ 16] = ZERO; b[ 17] = ZERO; #ifdef UNIT @@ -298,7 +298,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[126] = *(ao8 + 14); b[127] = *(ao8 + 15); #endif - + ao1 += 8 * lda; ao2 += 8 * lda; ao3 += 8 * lda; @@ -317,7 +317,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 7); if (i) { - + if (X < posY) { for (ii = 0; ii < i; ii++){ b[ 0] = *(ao1 + 0); @@ -328,7 +328,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(ao3 + 1); b[ 6] = *(ao4 + 0); b[ 7] = *(ao4 + 1); - + b[ 8] = *(ao5 + 0); b[ 9] = *(ao5 + 1); b[ 10] = *(ao6 + 0); @@ -337,7 +337,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 13] = *(ao7 + 1); b[ 14] = *(ao8 + 0); b[ 15] = *(ao8 + 1); - + ao1 += 2; ao2 += 2; ao3 += 2; @@ -348,7 +348,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao8 += 2; b += 16; } - } else + } else if (X > posY) { ao1 += i * lda; ao2 += i * lda; @@ -382,7 +382,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[14] = *(ao8 + 0); b[15] = *(ao8 + 1); b += 16; - + if(i >= 2) { b[ 0] = ZERO; b[ 1] = ZERO; @@ -407,7 +407,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[15] = *(ao8 + 3); b += 16; } - + if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; @@ -432,8 +432,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[15] = *(ao8 + 5); b += 16; } - - if (i >= 4) { + + if (i >= 4) { b[ 0] = ZERO; b[ 1] = ZERO; b[ 2] = ZERO; @@ -482,7 +482,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[15] = *(ao8 + 9); b += 16; } - + if (i >= 6) { b[ 0] = ZERO; b[ 1] = ZERO; @@ -561,7 +561,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON do { if (X < posY) { for (ii = 0; ii < 4; ii++){ - + b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); b[ 2] = *(ao2 + 0); @@ -570,14 +570,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(ao3 + 1); b[ 6] = *(ao4 + 0); b[ 7] = *(ao4 + 1); - + ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; b += 8; } - } else + } else if (X > posY) { ao1 += 4 * lda; ao2 += 4 * lda; @@ -598,7 +598,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(ao3 + 1); b[ 6] = *(ao4 + 0); b[ 7] = *(ao4 + 1); - + b[ 8] = ZERO; b[ 9] = ZERO; #ifdef UNIT @@ -645,7 +645,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; - + b += 32; } @@ -656,7 +656,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 3); if (i) { - + if (X < posY) { for (ii = 0; ii < i; ii++){ @@ -668,14 +668,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(ao3 + 1); b[ 6] = *(ao4 + 0); b[ 7] = *(ao4 + 1); - + ao1 += 2; ao2 += 2; ao3 += 2; ao4 += 2; b += 8; } - } else + } else if (X > posY) { ao1 += i * lda; ao2 += i * lda; @@ -697,7 +697,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 6] = *(ao4 + 0); b[ 7] = *(ao4 + 1); b += 8; - + if(i >= 2) { b[ 0] = ZERO; b[ 1] = ZERO; @@ -714,7 +714,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 7] = *(ao4 + 3); b += 8; } - + if (i >= 3) { b[ 0] = ZERO; b[ 1] = ZERO; @@ -764,7 +764,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao1 += 4; ao2 += 4; b += 8; - } else + } else if (X > posY) { ao1 += 2 * lda; ao2 += 2 * lda; @@ -779,7 +779,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #endif b[ 2] = *(ao2 + 0); b[ 3] = *(ao2 + 1); - + b[ 4] = ZERO; b[ 5] = ZERO; #ifdef UNIT @@ -801,7 +801,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } if (m & 1) { - + if (X < posY) { b[ 0] = *(ao1 + 0); b[ 1] = *(ao1 + 1); @@ -810,7 +810,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao1 += 2; ao2 += 2; b += 4; - } else + } else if (X > posY) { ao1 += 2 * lda; ao2 += 2 * lda; @@ -850,7 +850,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 1] = *(ao1 + 1); ao1 += 2; b += 2; - } else + } else if (X > posY) { ao1 += lda; b += 2; diff --git a/kernel/generic/ztrmm_utcopy_1.c b/kernel/generic/ztrmm_utcopy_1.c index d4406c980..2746c5f5c 100644 --- a/kernel/generic/ztrmm_utcopy_1.c +++ b/kernel/generic/ztrmm_utcopy_1.c @@ -66,7 +66,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (X < posY) { ao1 += 2; b += 2; - } else + } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); @@ -76,7 +76,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b += 2; } else { -#ifdef UNIT +#ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; #else diff --git a/kernel/generic/ztrmm_utcopy_2.c b/kernel/generic/ztrmm_utcopy_2.c index c71a55c28..840821e16 100644 --- a/kernel/generic/ztrmm_utcopy_2.c +++ b/kernel/generic/ztrmm_utcopy_2.c @@ -72,18 +72,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 4; b += 8; - } else + } else if (X > posY) { data1 = *(ao1 + 0); data2 = *(ao1 + 1); data3 = *(ao1 + 2); data4 = *(ao1 + 3); - + data5 = *(ao2 + 0); data6 = *(ao2 + 1); data7 = *(ao2 + 2); data8 = *(ao2 + 3); - + b[ 0] = data1; b[ 1] = data2; b[ 2] = data3; @@ -92,7 +92,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data6; b[ 6] = data7; b[ 7] = data8; - + ao1 += 2 * lda; ao2 += 2 * lda; b += 8; @@ -101,7 +101,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #ifdef UNIT data5 = *(ao2 + 0); data6 = *(ao2 + 1); - + b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = ZERO; @@ -113,12 +113,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #else data1 = *(ao1 + 0); data2 = *(ao1 + 1); - + data5 = *(ao2 + 0); data6 = *(ao2 + 1); data7 = *(ao2 + 2); data8 = *(ao2 + 3); - + b[ 0] = data1; b[ 1] = data2; b[ 2] = ZERO; @@ -128,10 +128,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 6] = data7; b[ 7] = data8; #endif - + ao1 += 2 * lda; ao2 += 2 * lda; - + b += 8; } @@ -145,21 +145,21 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao1 += 2; ao2 += 2; b += 4; - } else + } else if (X > posY) { data1 = *(ao1 + 0); data2 = *(ao1 + 1); data3 = *(ao1 + 2); data4 = *(ao1 + 3); - + b[ 0] = data1; b[ 1] = data2; b[ 2] = data3; b[ 3] = data4; - + ao1 += lda; b += 4; - + } else { #ifdef UNIT data5 = *(ao2 + 0); @@ -208,10 +208,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (X > posY) { data1 = *(ao1 + 0); data2 = *(ao1 + 1); - + b[ 0] = data1; b[ 1] = data2; - + ao1 += lda; b += 2; } else { @@ -221,14 +221,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #else data1 = *(ao1 + 0); data2 = *(ao1 + 1); - + b[ 0] = data1; b[ 1] = data2; -#endif +#endif ao1 += lda; b += 2; } - + X += 1; i --; } while (i > 0); diff --git a/kernel/generic/ztrmm_utcopy_4.c b/kernel/generic/ztrmm_utcopy_4.c index cda62bc3b..9a5c8c362 100644 --- a/kernel/generic/ztrmm_utcopy_4.c +++ b/kernel/generic/ztrmm_utcopy_4.c @@ -80,7 +80,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao4 += 8; b += 32; - } else + } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); @@ -90,7 +90,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); - + data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); @@ -99,7 +99,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); - + data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); @@ -108,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data22 = *(ao3 + 5); data23 = *(ao3 + 6); data24 = *(ao3 + 7); - + data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); @@ -117,7 +117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data30 = *(ao4 + 5); data31 = *(ao4 + 6); data32 = *(ao4 + 7); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -126,7 +126,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; - + b[ 8] = data09; b[ 9] = data10; b[10] = data11; @@ -135,7 +135,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[13] = data14; b[14] = data15; b[15] = data16; - + b[16] = data17; b[17] = data18; b[18] = data19; @@ -144,7 +144,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[21] = data22; b[22] = data23; b[23] = data24; - + b[24] = data25; b[25] = data26; b[26] = data27; @@ -153,7 +153,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[29] = data30; b[30] = data31; b[31] = data32; - + ao1 += 4 * lda; ao2 += 4 * lda; ao3 += 4 * lda; @@ -162,22 +162,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } else { -#ifdef UNIT +#ifdef UNIT data09 = *(ao2 + 0); data10 = *(ao2 + 1); - + data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); - + data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); data28 = *(ao4 + 3); data29 = *(ao4 + 4); data30 = *(ao4 + 5); - + b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = ZERO; @@ -186,7 +186,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; - + b[ 8] = data09; b[ 9] = data10; b[10] = ONE; @@ -195,7 +195,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; - + b[16] = data17; b[17] = data18; b[18] = data19; @@ -204,7 +204,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[21] = ZERO; b[22] = ZERO; b[23] = ZERO; - + b[24] = data25; b[25] = data26; b[26] = data27; @@ -221,14 +221,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); - + data17 = *(ao3 + 0); data18 = *(ao3 + 1); data19 = *(ao3 + 2); data20 = *(ao3 + 3); data21 = *(ao3 + 4); data22 = *(ao3 + 5); - + data25 = *(ao4 + 0); data26 = *(ao4 + 1); data27 = *(ao4 + 2); @@ -237,7 +237,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data30 = *(ao4 + 5); data31 = *(ao4 + 6); data32 = *(ao4 + 7); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = ZERO; @@ -246,7 +246,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; - + b[ 8] = data09; b[ 9] = data10; b[10] = data11; @@ -255,7 +255,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[13] = ZERO; b[14] = ZERO; b[15] = ZERO; - + b[16] = data17; b[17] = data18; b[18] = data19; @@ -264,7 +264,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[21] = data22; b[22] = ZERO; b[23] = ZERO; - + b[24] = data25; b[25] = data26; b[26] = data27; @@ -279,7 +279,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 4 * lda; ao3 += 4 * lda; ao4 += 4 * lda; - + b += 32; } @@ -290,7 +290,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 3); if (i) { - + if (X < posY) { if (m & 2) { @@ -300,7 +300,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao4 += 4; b += 16; } - + if (m & 1) { ao1 += 2; ao2 += 2; @@ -308,8 +308,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao4 += 2; b += 8; } - - } else + + } else if (X > posY) { if (m & 2) { data01 = *(ao1 + 0); @@ -320,7 +320,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); - + data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); @@ -329,7 +329,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data14 = *(ao2 + 5); data15 = *(ao2 + 6); data16 = *(ao2 + 7); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -338,7 +338,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; - + b[ 8] = data09; b[ 9] = data10; b[10] = data11; @@ -347,12 +347,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[13] = data14; b[14] = data15; b[15] = data16; - + ao1 += 2 * lda; ao2 += 2 * lda; b += 16; } - + if (m & 1) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); @@ -362,7 +362,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data06 = *(ao1 + 5); data07 = *(ao1 + 6); data08 = *(ao1 + 7); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -371,19 +371,19 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data06; b[ 6] = data07; b[ 7] = data08; - + ao1 += lda; b += 8; } - + } else { -#ifdef UNIT +#ifdef UNIT if (i >= 2) { data09 = *(ao2 + 0); data10 = *(ao2 + 1); } - + if (i >= 3) { data17 = *(ao3 + 0); data18 = *(ao3 + 1); @@ -434,7 +434,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data11 = *(ao2 + 2); data12 = *(ao2 + 3); } - + if (i >= 3) { data17 = *(ao3 + 0); data18 = *(ao3 + 1); @@ -505,18 +505,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON ao2 += 4; b += 8; - } else + } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + data09 = *(ao2 + 0); data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; @@ -525,7 +525,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = data10; b[ 6] = data11; b[ 7] = data12; - + ao1 += 2 * lda; ao2 += 2 * lda; b += 8; @@ -534,7 +534,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #ifdef UNIT data09 = *(ao2 + 0); data10 = *(ao2 + 1); - + b[ 0] = ONE; b[ 1] = ZERO; b[ 2] = ZERO; @@ -551,7 +551,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON data10 = *(ao2 + 1); data11 = *(ao2 + 2); data12 = *(ao2 + 3); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = ZERO; @@ -563,7 +563,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #endif ao1 += 2 * lda; ao2 += 2 * lda; - + b += 8; } @@ -574,21 +574,21 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 1); if (i) { - + if (X < posY) { b += 4; - } else + } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); data03 = *(ao1 + 2); data04 = *(ao1 + 3); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = data03; b[ 3] = data04; - + b += 4; } else { #ifdef UNIT @@ -599,7 +599,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #else data01 = *(ao1 + 0); data02 = *(ao1 + 1); - + b[ 0] = data01; b[ 1] = data02; b[ 2] = ZERO; @@ -628,7 +628,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (X < posY) { b += 2; ao1 += 2; - } else + } else if (X > posY) { data01 = *(ao1 + 0); data02 = *(ao1 + 1); diff --git a/kernel/generic/ztrmm_utcopy_8.c b/kernel/generic/ztrmm_utcopy_8.c index 08dd80ca2..6c0448443 100644 --- a/kernel/generic/ztrmm_utcopy_8.c +++ b/kernel/generic/ztrmm_utcopy_8.c @@ -73,7 +73,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a07 = a + posY * 2 + (posX + 6) * lda; a08 = a + posY * 2 + (posX + 7) * lda; } - + i = (m >> 3); if (i > 0) { do { @@ -87,11 +87,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a07 += 16; a08 += 16; b += 128; - } else + } else if (X > posY) { for (ii = 0; ii < 8; ii++){ - + b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); @@ -100,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); - + b[ 8] = *(a01 + 8); b[ 9] = *(a01 + 9); b[ 10] = *(a01 + 10); @@ -109,7 +109,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 13] = *(a01 + 13); b[ 14] = *(a01 + 14); b[ 15] = *(a01 + 15); - + a01 += lda; b += 16; } @@ -136,7 +136,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; - + b[ 8] = ZERO; b[ 9] = ZERO; b[ 10] = ZERO; @@ -145,7 +145,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 13] = ZERO; b[ 14] = ZERO; b[ 15] = ZERO; - + b[ 16] = *(a02 + 0); b[ 17] = *(a02 + 1); #ifdef UNIT @@ -317,7 +317,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 7); if (i) { - + if (X < posY) { a01 += 2 * i; @@ -329,11 +329,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a07 += 2 * i; a08 += 2 * i; b += 16 * i; - } else + } else if (X > posY) { for (ii = 0; ii < i; ii++){ - + b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); @@ -342,7 +342,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); - + b[ 8] = *(a01 + 8); b[ 9] = *(a01 + 9); b[ 10] = *(a01 + 10); @@ -351,7 +351,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 13] = *(a01 + 13); b[ 14] = *(a01 + 14); b[ 15] = *(a01 + 15); - + a01 += lda; a02 += lda; a03 += lda; @@ -386,7 +386,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[14] = ZERO; b[15] = ZERO; b += 16; - + if(i >= 2) { b[ 0] = *(a02 + 0); b[ 1] = *(a02 + 1); @@ -411,7 +411,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[15] = ZERO; b += 16; } - + if (i >= 3) { b[ 0] = *(a03 + 0); b[ 1] = *(a03 + 1); @@ -436,8 +436,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[15] = ZERO; b += 16; } - - if (i >= 4) { + + if (i >= 4) { b[ 0] = *(a04 + 0); b[ 1] = *(a04 + 1); b[ 2] = *(a04 + 2); @@ -486,7 +486,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[15] = ZERO; b += 16; } - + if (i >= 6) { b[ 0] = *(a06 + 0); b[ 1] = *(a06 + 1); @@ -547,7 +547,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 4){ X = posX; - + if (posX <= posY) { a01 = a + posX * 2 + (posY + 0) * lda; a02 = a + posX * 2 + (posY + 1) * lda; @@ -559,7 +559,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a03 = a + posY * 2 + (posX + 2) * lda; a04 = a + posY * 2 + (posX + 3) * lda; } - + i = (m >> 2); if (i > 0) { do { @@ -569,11 +569,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a03 += 8; a04 += 8; b += 32; - } else + } else if (X > posY) { - + for (ii = 0; ii < 4; ii++){ - + b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); @@ -582,11 +582,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); - + a01 += lda; b += 8; } - + a02 += 4 * lda; a03 += 4 * lda; a04 += 4 * lda; @@ -605,7 +605,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = ZERO; b[ 6] = ZERO; b[ 7] = ZERO; - + b[ 8] = *(a02 + 0); b[ 9] = *(a02 + 1); #ifdef UNIT @@ -619,7 +619,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 13] = ZERO; b[ 14] = ZERO; b[ 15] = ZERO; - + b[ 16] = *(a03 + 0); b[ 17] = *(a03 + 1); b[ 18] = *(a03 + 2); @@ -633,7 +633,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON #endif b[ 22] = ZERO; b[ 23] = ZERO; - + b[ 24] = *(a04 + 0); b[ 25] = *(a04 + 1); b[ 26] = *(a04 + 2); @@ -647,14 +647,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 30] = *(a04 + 6); b[ 31] = *(a04 + 7); #endif - + a01 += 4 * lda; a02 += 4 * lda; a03 += 4 * lda; a04 += 4 * lda; b += 32; } - + X += 4; i --; } while (i > 0); @@ -662,18 +662,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 3); if (i) { - + if (X < posY) { a01 += 2 * i; a02 += 2 * i; a03 += 2 * i; a04 += 2 * i; b += 8 * i; - } else + } else if (X > posY) { - + for (ii = 0; ii < i; ii++){ - + b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); b[ 2] = *(a01 + 2); @@ -682,7 +682,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(a01 + 5); b[ 6] = *(a01 + 6); b[ 7] = *(a01 + 7); - + a01 += lda; a02 += lda; a03 += lda; @@ -690,7 +690,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b += 8; } } else { - + #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; @@ -705,7 +705,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 6] = ZERO; b[ 7] = ZERO; b += 8; - + if(i >= 2) { b[ 0] = *(a02 + 0); b[ 1] = *(a02 + 1); @@ -722,7 +722,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 7] = ZERO; b += 8; } - + if (i >= 3) { b[ 0] = *(a03 + 0); b[ 1] = *(a03 + 1); @@ -741,14 +741,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } } } - + posY += 4; } if (n & 2){ X = posX; - + if (posX <= posY) { a01 = a + posX * 2 + (posY + 0) * lda; a02 = a + posX * 2 + (posY + 1) * lda; @@ -756,7 +756,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a01 = a + posY * 2 + (posX + 0) * lda; a02 = a + posY * 2 + (posX + 1) * lda; } - + i = (m >> 1); if (i > 0) { do { @@ -764,7 +764,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON a01 += 4; a02 += 4; b += 8; - } else + } else if (X > posY) { b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); @@ -774,12 +774,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 5] = *(a02 + 1); b[ 6] = *(a02 + 2); b[ 7] = *(a02 + 3); - + a01 += 2 * lda; a02 += 2 * lda; b += 8; } else { - + #ifdef UNIT b[ 0] = ONE; b[ 1] = ZERO; @@ -799,12 +799,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[ 6] = *(a02 + 2); b[ 7] = *(a02 + 3); #endif - + a01 += 2 * lda; a02 += 2 * lda; b += 8; } - + X += 2; i --; } while (i > 0); @@ -812,10 +812,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i = (m & 1); if (i) { - + if (X < posY) { b += 4; - } else + } else if (X > posY) { b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); @@ -840,20 +840,20 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON if (n & 1){ X = posX; - + if (posX <= posY) { a01 = a + posX * 2 + (posY + 0) * lda; } else { a01 = a + posY * 2 + (posX + 0) * lda; } - + i = m; if (m > 0) { do { if (X < posY) { a01 += 2; b += 2; - } else + } else if (X > posY) { b[ 0] = *(a01 + 0); b[ 1] = *(a01 + 1); @@ -875,6 +875,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } while (i > 0); } } - + return 0; } diff --git a/kernel/generic/ztrmmkernel_2x2.c b/kernel/generic/ztrmmkernel_2x2.c index b7c6539c2..ecb2a97cd 100644 --- a/kernel/generic/ztrmmkernel_2x2.c +++ b/kernel/generic/ztrmmkernel_2x2.c @@ -16,7 +16,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b #if defined(TRMMKERNEL) && !defined(LEFT) off = -offset; #endif - for (j=0; j= jj ) && (ii - jj < 8)) { for (k = 0; k < ii - jj; k ++) { *(b + k * 2 + 0) = *(a1 + k * lda + 0); @@ -76,7 +76,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT compinv(b + (ii - jj) * 2, data1, data2); } - + if (ii - jj >= 8) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); @@ -122,7 +122,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 4)) { for (k = 0; k < ii - jj; k ++) { *(b + k * 2 + 0) = *(a1 + k * lda + 0); @@ -133,7 +133,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT compinv(b + (ii - jj) * 2, data1, data2); } - + if (ii - jj >= 4) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); @@ -164,7 +164,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 2)) { for (k = 0; k < ii - jj; k ++) { *(b + k * 2 + 0) = *(a1 + k * lda + 0); @@ -175,7 +175,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT compinv(b + (ii - jj) * 2, data1, data2); } - + if (ii - jj >= 2) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); @@ -198,7 +198,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 1)) { for (k = 0; k < ii - jj; k ++) { *(b + k * 2 + 0) = *(a1 + k * lda + 0); @@ -209,7 +209,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT compinv(b + (ii - jj) * 2, data1, data2); } - + if (ii - jj >= 1) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); diff --git a/kernel/generic/ztrsm_ltcopy_1.c b/kernel/generic/ztrsm_ltcopy_1.c index ef495327b..af4ac127d 100644 --- a/kernel/generic/ztrsm_ltcopy_1.c +++ b/kernel/generic/ztrsm_ltcopy_1.c @@ -74,7 +74,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 0) = data01; *(b + 1) = data02; } - + a1 += lda; b += 2; diff --git a/kernel/generic/ztrsm_ltcopy_2.c b/kernel/generic/ztrsm_ltcopy_2.c index bcc2bbc91..21bd0fa1f 100644 --- a/kernel/generic/ztrsm_ltcopy_2.c +++ b/kernel/generic/ztrsm_ltcopy_2.c @@ -99,7 +99,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 6) = data07; *(b + 7) = data08; } - + a1 += 2 * lda; a2 += 2 * lda; b += 8; @@ -164,7 +164,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 0) = data01; *(b + 1) = data02; } - + a1 += 1 * lda; b += 2; diff --git a/kernel/generic/ztrsm_ltcopy_4.c b/kernel/generic/ztrsm_ltcopy_4.c index 8c4e66b7f..c1152710e 100644 --- a/kernel/generic/ztrsm_ltcopy_4.c +++ b/kernel/generic/ztrsm_ltcopy_4.c @@ -198,7 +198,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 30) = data31; *(b + 31) = data32; } - + a1 += 4 * lda; a2 += 4 * lda; a3 += 4 * lda; @@ -284,7 +284,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 14) = data15; *(b + 15) = data16; } - + a1 += 2 * lda; a2 += 2 * lda; b += 16; @@ -334,7 +334,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 6) = data07; *(b + 7) = data08; } - + a1 += lda; b += 8; ii += 1; @@ -394,7 +394,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 6) = data11; *(b + 7) = data12; } - + a1 += 2 * lda; a2 += 2 * lda; b += 8; @@ -429,7 +429,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 2) = data03; *(b + 3) = data04; } - + a1 += lda; b += 4; ii += 1; @@ -463,7 +463,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 0) = data01; *(b + 1) = data02; } - + a1 += lda; b += 2; diff --git a/kernel/generic/ztrsm_ltcopy_8.c b/kernel/generic/ztrsm_ltcopy_8.c index 899c9ab30..83f28113d 100644 --- a/kernel/generic/ztrsm_ltcopy_8.c +++ b/kernel/generic/ztrsm_ltcopy_8.c @@ -45,7 +45,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT FLOAT *a1; FLOAT data1, data2; - + lda *= 2; jj = offset; @@ -57,7 +57,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 8)) { data1 = *(a1 + (ii - jj) * 2 + 0); @@ -71,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT } } - + if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); @@ -108,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 4)) { data1 = *(a1 + (ii - jj) * 2 + 0); @@ -122,7 +122,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT } } - + if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); @@ -150,7 +150,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 2)) { data1 = *(a1 + (ii - jj) * 2 + 0); @@ -164,7 +164,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT } } - + if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); @@ -187,14 +187,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 1)) { data1 = *(a1 + (ii - jj) * 2 + 0); data2 = *(a1 + (ii - jj) * 2 + 1); compinv(b + (ii - jj) * 2, data1, data2); } - + if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); diff --git a/kernel/generic/ztrsm_uncopy_1.c b/kernel/generic/ztrsm_uncopy_1.c index 0891300d1..dc9157bdc 100644 --- a/kernel/generic/ztrsm_uncopy_1.c +++ b/kernel/generic/ztrsm_uncopy_1.c @@ -73,7 +73,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 0) = data01; *(b + 1) = data02; } - + a1 += 2; b += 2; diff --git a/kernel/generic/ztrsm_uncopy_2.c b/kernel/generic/ztrsm_uncopy_2.c index 45c209363..fecab88e7 100644 --- a/kernel/generic/ztrsm_uncopy_2.c +++ b/kernel/generic/ztrsm_uncopy_2.c @@ -100,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 6) = data07; *(b + 7) = data08; } - + a1 += 4; a2 += 4; b += 8; @@ -164,7 +164,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 0) = data01; *(b + 1) = data02; } - + a1+= 2; b += 2; i --; diff --git a/kernel/generic/ztrsm_uncopy_4.c b/kernel/generic/ztrsm_uncopy_4.c index 9cbc6c729..9d0e2438d 100644 --- a/kernel/generic/ztrsm_uncopy_4.c +++ b/kernel/generic/ztrsm_uncopy_4.c @@ -197,7 +197,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 30) = data31; *(b + 31) = data32; } - + a1 += 8; a2 += 8; a3 += 8; @@ -287,7 +287,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 14) = data27; *(b + 15) = data28; } - + a1 += 4; a2 += 4; a3 += 4; @@ -343,7 +343,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 6) = data25; *(b + 7) = data26; } - + a1 += 2; a2 += 2; a3 += 2; @@ -407,7 +407,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 6) = data11; *(b + 7) = data12; } - + a1 += 4; a2 += 4; b += 8; @@ -443,7 +443,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 2) = data09; *(b + 3) = data10; } - + a1 += 2; a2 += 2; b += 4; @@ -480,7 +480,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 0) = data01; *(b + 1) = data02; } - + a1 += 2; b += 2; diff --git a/kernel/generic/ztrsm_uncopy_8.c b/kernel/generic/ztrsm_uncopy_8.c index 2ce1c72ca..453a6c0a9 100644 --- a/kernel/generic/ztrsm_uncopy_8.c +++ b/kernel/generic/ztrsm_uncopy_8.c @@ -45,7 +45,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; FLOAT data1, data2; - + lda *= 2; jj = offset; @@ -65,7 +65,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 8)) { data1 = *(a1 + (ii - jj) * lda + 0); @@ -78,7 +78,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + k * 2 + 1) = *(a1 + k * lda + 1); } } - + if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); @@ -124,7 +124,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 4)) { data1 = *(a1 + (ii - jj) * lda + 0); data2 = *(a1 + (ii - jj) * lda + 1); @@ -136,7 +136,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + k * 2 + 1) = *(a1 + k * lda + 1); } } - + if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); @@ -167,7 +167,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 2)) { data1 = *(a1 + (ii - jj) * lda + 0); data2 = *(a1 + (ii - jj) * lda + 1); @@ -178,7 +178,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + k * 2 + 1) = *(a1 + k * lda + 1); } } - + if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); @@ -201,7 +201,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 1)) { data1 = *(a1 + (ii - jj) * lda + 0); data2 = *(a1 + (ii - jj) * lda + 1); @@ -212,7 +212,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + k * 2 + 1) = *(a1 + k * lda + 1); } } - + if (ii - jj < 0) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); diff --git a/kernel/generic/ztrsm_utcopy_1.c b/kernel/generic/ztrsm_utcopy_1.c index 42ecc471b..08f85e891 100644 --- a/kernel/generic/ztrsm_utcopy_1.c +++ b/kernel/generic/ztrsm_utcopy_1.c @@ -73,7 +73,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 0) = data01; *(b + 1) = data02; } - + a1 += lda; b += 2; diff --git a/kernel/generic/ztrsm_utcopy_2.c b/kernel/generic/ztrsm_utcopy_2.c index fd7affb3f..387bb2532 100644 --- a/kernel/generic/ztrsm_utcopy_2.c +++ b/kernel/generic/ztrsm_utcopy_2.c @@ -99,7 +99,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 6) = data07; *(b + 7) = data08; } - + a1 += 2 * lda; a2 += 2 * lda; b += 8; @@ -158,7 +158,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 0) = data01; *(b + 1) = data02; } - + a1 += 1 * lda; b += 2; diff --git a/kernel/generic/ztrsm_utcopy_4.c b/kernel/generic/ztrsm_utcopy_4.c index fd3483c10..f19badd33 100644 --- a/kernel/generic/ztrsm_utcopy_4.c +++ b/kernel/generic/ztrsm_utcopy_4.c @@ -196,7 +196,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 30) = data31; *(b + 31) = data32; } - + a1 += 4 * lda; a2 += 4 * lda; a3 += 4 * lda; @@ -264,7 +264,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 14) = data15; *(b + 15) = data16; } - + a1 += 2 * lda; a2 += 2 * lda; b += 16; @@ -302,7 +302,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 6) = data07; *(b + 7) = data08; } - + a1 += lda; b += 8; @@ -363,7 +363,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 6) = data11; *(b + 7) = data12; } - + a1 += 2 * lda; a2 += 2 * lda; b += 8; @@ -393,7 +393,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 2) = data03; *(b + 3) = data04; } - + a1 += lda; b += 4; @@ -428,7 +428,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *(b + 0) = data01; *(b + 1) = data02; } - + a1 += lda; b += 2; diff --git a/kernel/generic/ztrsm_utcopy_8.c b/kernel/generic/ztrsm_utcopy_8.c index 52c7ed5a3..be270f1cc 100644 --- a/kernel/generic/ztrsm_utcopy_8.c +++ b/kernel/generic/ztrsm_utcopy_8.c @@ -57,7 +57,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 8)) { for (k = 0; k < ii - jj; k ++) { *(b + k * 2 + 0) = *(a1 + k * 2 + 0); @@ -69,7 +69,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT compinv(b + (ii - jj) * 2, data1, data2); } - + if (ii - jj >= 8) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); @@ -106,7 +106,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 4)) { for (k = 0; k < ii - jj; k ++) { *(b + k * 2 + 0) = *(a1 + k * 2 + 0); @@ -118,7 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT compinv(b + (ii - jj) * 2, data1, data2); } - + if (ii - jj >= 4) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); @@ -146,7 +146,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 2)) { for (k = 0; k < ii - jj; k ++) { *(b + k * 2 + 0) = *(a1 + k * 2 + 0); @@ -158,7 +158,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT compinv(b + (ii - jj) * 2, data1, data2); } - + if (ii - jj >= 2) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); @@ -181,7 +181,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii = 0; for (i = 0; i < m; i++) { - + if ((ii >= jj ) && (ii - jj < 1)) { for (k = 0; k < ii - jj; k ++) { *(b + k * 2 + 0) = *(a1 + k * 2 + 0); @@ -193,7 +193,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT compinv(b + (ii - jj) * 2, data1, data2); } - + if (ii - jj >= 1) { *(b + 0) = *(a1 + 0); *(b + 1) = *(a1 + 1); diff --git a/kernel/ia64/amax.S b/kernel/ia64/amax.S index fae96f12b..72eae44dc 100644 --- a/kernel/ia64/amax.S +++ b/kernel/ia64/amax.S @@ -314,18 +314,18 @@ ;; { .mmf (p13) LDFD f42 = [DX], INCX - nop.m 0 + nop.m 0 (p12) FMAX DMAX1 = f32, DMAX1 } { .mmf (p15) LDFD f46 = [X3], INCX - nop.m 0 + nop.m 0 (p12) FMAX DMAX5 = f36, DMAX5 } ;; { .mmf (p13) LDFD f43 = [DX], INCX - nop.m 0 + nop.m 0 (p12) FMAX DMAX2 = f33, DMAX2 } (p12) FMAX DMAX6 = f37, DMAX6 @@ -343,7 +343,7 @@ (p13) FMAX DMAX4 = f43, DMAX4 ;; .align 32 - + .L99: { .mfi nop.m 0 diff --git a/kernel/ia64/asum.S b/kernel/ia64/asum.S index 6114f57ed..55c68922b 100644 --- a/kernel/ia64/asum.S +++ b/kernel/ia64/asum.S @@ -54,7 +54,7 @@ #define COMPADD 1 #define STRIDE SIZE #endif - + #define PRE1 r2 #define I r17 @@ -128,7 +128,7 @@ mov f11 = f0 shl INCX = INCX, BASE_SHIFT + COMPADD } - ;; + ;; { .mmi #ifdef XDOUBLE shladd INCX16 = INCX, (3 - COMPADD), r0 @@ -260,7 +260,7 @@ } ;; { .mmf - (p16) LDFD f71 = [X], INCX + (p16) LDFD f71 = [X], INCX (p18) fabs f73 = f73 } { .mfb @@ -365,7 +365,7 @@ #ifndef COMPLEX (p15) FADD f10 = f10, f46 #endif - ;; + ;; .align 32 .L998: diff --git a/kernel/ia64/caxpy.S b/kernel/ia64/caxpy.S index 0a28ebe36..1a994e798 100644 --- a/kernel/ia64/caxpy.S +++ b/kernel/ia64/caxpy.S @@ -504,10 +504,10 @@ } ;; (p14) STFD [YY1] = f90, 1 * SIZE - ;; + ;; (p14) STFD [YY1] = f91 (p14) add YY1 = YY1, INCYM1 - ;; + ;; (p15) STFD [YY1] = f92, 1 * SIZE ;; { .mmb diff --git a/kernel/ia64/copy.S b/kernel/ia64/copy.S index b5d7f482b..9e7ef3265 100644 --- a/kernel/ia64/copy.S +++ b/kernel/ia64/copy.S @@ -94,7 +94,7 @@ mov PR = pr } { .mmi - mov YY = Y1 + mov YY = Y1 (p7) adds N = -1, N (p7) add Y1 = Y1, INCY } @@ -600,7 +600,7 @@ ;; /* INCX != 1 */ -.L100: +.L100: { .mmi shladd INCX16 = INCX, 4, r0 shladd INCY16 = INCY, 4, r0 diff --git a/kernel/ia64/daxpy.S b/kernel/ia64/daxpy.S index b971df6f0..72b9afa1a 100644 --- a/kernel/ia64/daxpy.S +++ b/kernel/ia64/daxpy.S @@ -62,7 +62,7 @@ #define YY r27 #define PR r30 #define ARLC r31 - + #define ALPHA f8 PROLOGUE diff --git a/kernel/ia64/ddot.S b/kernel/ia64/ddot.S index 082c303d8..6654f72c9 100644 --- a/kernel/ia64/ddot.S +++ b/kernel/ia64/ddot.S @@ -344,7 +344,7 @@ (p12) FMA f13 = f41, f43, f13 (p12) FMA f14 = f44, f46, f14 (p12) FMA f15 = f45, f47, f15 - ;; + ;; (p13) FMA f8 = f48, f50, f8 (p13) FMA f9 = f49, f51, f9 (p13) FMA f10 = f52, f54, f10 @@ -516,7 +516,7 @@ (p12) FMA f13 = f41, f43, f13 (p12) FMA f14 = f44, f46, f14 (p12) FMA f15 = f45, f47, f15 - ;; + ;; (p13) FMA f8 = f48, f50, f8 (p13) FMA f9 = f49, f51, f9 (p13) FMA f10 = f52, f54, f10 @@ -748,7 +748,7 @@ (p12) FMA f13 = f41, f43, f13 (p12) FMA f14 = f44, f46, f14 (p12) FMA f15 = f45, f47, f15 - ;; + ;; (p13) FMA f8 = f48, f50, f8 (p13) FMA f9 = f49, f51, f9 (p13) FMA f10 = f52, f54, f10 @@ -920,7 +920,7 @@ (p12) FMA f13 = f41, f43, f13 (p12) FMA f14 = f44, f46, f14 (p12) FMA f15 = f45, f47, f15 - ;; + ;; (p13) FMA f8 = f48, f50, f8 (p13) FMA f9 = f49, f51, f9 (p13) FMA f10 = f52, f54, f10 @@ -1156,7 +1156,7 @@ (p12) FMA f13 = f41, f43, f13 (p12) FMA f14 = f44, f46, f14 (p12) FMA f15 = f45, f47, f15 - ;; + ;; (p13) FMA f8 = f48, f50, f8 (p13) FMA f9 = f49, f51, f9 (p13) FMA f10 = f52, f54, f10 diff --git a/kernel/ia64/gemm_beta.S b/kernel/ia64/gemm_beta.S index ceeca4acb..b4cf816bf 100644 --- a/kernel/ia64/gemm_beta.S +++ b/kernel/ia64/gemm_beta.S @@ -81,7 +81,7 @@ { .mfb cmp.ge p6, p0 = 0, N fcmp.eq p0, p15 = BETA, f0 - (p6) br.ret.sptk.many b0 + (p6) br.ret.sptk.many b0 } ;; .body @@ -197,7 +197,7 @@ { .mmi (p12) STFD [CO1] = f0, 1 * SIZE (p12) STFD [CO2] = f0, 1 * SIZE - (p12) adds CO3 = 8 * SIZE, CO3 + (p12) adds CO3 = 8 * SIZE, CO3 } ;; { .mmi @@ -397,7 +397,7 @@ { .mmi (p12) LDFD f34 = [CO1], 1 * SIZE (p12) LDFD f38 = [CO2], 1 * SIZE - (p12) adds CO3 = 8 * SIZE, CO3 + (p12) adds CO3 = 8 * SIZE, CO3 } ;; { .mmi @@ -446,7 +446,7 @@ (p13) FMPY f40 = BETA, f40 } { .mmf - (p12) adds DO3 = 8 * SIZE, DO3 + (p12) adds DO3 = 8 * SIZE, DO3 (p14) FMPY f44 = BETA, f44 } ;; @@ -456,7 +456,7 @@ (p13) FMPY f41 = BETA, f41 } { .mmf - (p13) adds DO3 = 4 * SIZE, DO3 + (p13) adds DO3 = 4 * SIZE, DO3 (p14) FMPY f45 = BETA, f45 } ;; diff --git a/kernel/ia64/gemm_kernel.S b/kernel/ia64/gemm_kernel.S index d1d4731dd..c3277a40b 100644 --- a/kernel/ia64/gemm_kernel.S +++ b/kernel/ia64/gemm_kernel.S @@ -119,11 +119,11 @@ stf.spill [r8] = f16, 32 stf.spill [r9] = f17, 32 mov PR = pr - ;; + ;; stf.spill [r8] = f18, 32 stf.spill [r9] = f19, 32 shladd LDC = LDC, BASE_SHIFT, r0 - ;; + ;; stf.spill [r8] = f20, 32 stf.spill [r9] = f21, 32 shr J = N, 3 @@ -131,18 +131,18 @@ stf.spill [r8] = f22, 32 stf.spill [r9] = f23, 32 mov AOFFSET = A - ;; + ;; stf.spill [r8] = f24, 32 stf.spill [r9] = f25, 32 cmp.ge p6, p0 = 0, J - ;; + ;; stf.spill [r8] = f26, 32 stf.spill [r9] = f27, 32 shr BB = K, 3 - ;; + ;; stf.spill [r8] = f28, 32 stf.spill [r9] = f29, 32 - ;; + ;; stf.spill [r8] = f30 stf.spill [r9] = f31 #ifndef TRMMKERNEL @@ -182,7 +182,7 @@ nop __LINE__ #endif mov f80 = f0 - } + } { .mmf add C2 = LDC, C // coffset2 = c + 1 * ldc shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc @@ -546,7 +546,7 @@ FMA f106 = f34, f53, f106 // A3 * B6 nop __LINE__ } - ;; + ;; /* 12 */ { .mfi FMA f114 = f34, f54, f114 // A3 * B7 @@ -4233,7 +4233,7 @@ #else nop __LINE__ #endif - } + } { .mfi shladd C4 = LDC, 1, C2 mov f73 = f0 @@ -5862,7 +5862,7 @@ ;; { .mfi STFD [C4 ] = f89, 3 * SIZE - mov f89 = f0 + mov f89 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else @@ -8855,7 +8855,7 @@ ;; #endif ;; - { .mii + { .mii LDFD f32 = [AOFFSET], 1 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 @@ -8928,13 +8928,13 @@ ;; ldf.fill f16 = [SP], 32 ldf.fill f17 = [r9], 32 - ;; + ;; ldf.fill f18 = [SP], 32 ldf.fill f19 = [r9], 32 - ;; + ;; ldf.fill f20 = [SP], 32 ldf.fill f21 = [r9], 32 - ;; + ;; ldf.fill f22 = [SP], 32 ldf.fill f23 = [r9], 32 mov ar.lc = ARLC diff --git a/kernel/ia64/gemm_ncopy.S b/kernel/ia64/gemm_ncopy.S index ebb80bfaa..aa0d1cb82 100644 --- a/kernel/ia64/gemm_ncopy.S +++ b/kernel/ia64/gemm_ncopy.S @@ -49,7 +49,7 @@ #define LD LDFD #define ST STFD_NTA #endif - + #define J r15 #define PREB r17 #define PREA r18 @@ -82,7 +82,7 @@ .prologue PROFCODE - .body + .body { .mii shladd LDA = LDA, BASE_SHIFT, r0 mov PR = pr diff --git a/kernel/ia64/gemv_n.S b/kernel/ia64/gemv_n.S index 4826bf5b4..972dd0113 100644 --- a/kernel/ia64/gemv_n.S +++ b/kernel/ia64/gemv_n.S @@ -84,13 +84,13 @@ #define AO61 loc13 #define AO71 loc14 #define AO81 loc15 - + #define PREB r8 #define ARLC r29 #define PR r30 #define ARPFS r31 - + #ifdef DOUBLE #define RPREFETCH (16 * 3 + 8) #else @@ -120,17 +120,17 @@ ;; stf.spill [r8] = f16, 32 stf.spill [r9] = f17, 32 - ;; + ;; stf.spill [r8] = f18, 32 stf.spill [r9] = f19, 32 - ;; + ;; stf.spill [r8] = f20, 32 stf.spill [r9] = f21, 32 ;; stf.spill [r8] = f22 stf.spill [r9] = f23 .body - ;; + ;; ld8 Y = [r14] ld8 INCY = [r15] @@ -3301,15 +3301,15 @@ ldf.fill f16 = [SP], 32 ldf.fill f17 = [r9], 32 mov ar.lc = ARLC - ;; + ;; ldf.fill f18 = [SP], 32 ldf.fill f19 = [r9], 32 mov pr = PR, -1 - ;; + ;; ldf.fill f20 = [SP], 32 ldf.fill f21 = [r9], 32 mov ar.pfs = ARPFS - ;; + ;; ldf.fill f22 = [SP], 32 ldf.fill f23 = [r9] br.ret.sptk.many b0 diff --git a/kernel/ia64/gemv_t.S b/kernel/ia64/gemv_t.S index 6bc579ed5..0dc457835 100644 --- a/kernel/ia64/gemv_t.S +++ b/kernel/ia64/gemv_t.S @@ -80,7 +80,7 @@ #define AO41 loc9 #define AO61 loc10 #define AO81 loc11 - + #define PREB r8 #define WPRE r9 #define OFFSET PREB @@ -89,7 +89,7 @@ #define ARLC r29 #define PR r30 #define ARPFS r31 - + #ifdef DOUBLE #define RPREFETCH (16 * 3 + 8) #else @@ -149,7 +149,7 @@ xmpy.l f10 = f10, f11 } .body - ;; + ;; ;; { .mmi ld8 BUFFER = [r16] @@ -3541,15 +3541,15 @@ ldf.fill f16 = [SP], 32 ldf.fill f17 = [r9], 32 mov ar.lc = ARLC - ;; + ;; ldf.fill f18 = [SP], 32 ldf.fill f19 = [r9], 32 mov pr = PR, -1 - ;; + ;; ldf.fill f20 = [SP], 32 ldf.fill f21 = [r9], 32 mov ar.pfs = ARPFS - ;; + ;; ldf.fill f22 = [SP], 32 ldf.fill f23 = [r9] br.ret.sptk.many b0 diff --git a/kernel/ia64/iamax.S b/kernel/ia64/iamax.S index a091675df..57d34a331 100644 --- a/kernel/ia64/iamax.S +++ b/kernel/ia64/iamax.S @@ -636,4 +636,4 @@ } ;; EPILOGUE - + diff --git a/kernel/ia64/izamax.S b/kernel/ia64/izamax.S index c43bccaf6..6d98ce4e1 100644 --- a/kernel/ia64/izamax.S +++ b/kernel/ia64/izamax.S @@ -60,7 +60,7 @@ #define N r32 #define DX r33 #define INCX r34 - + #define PRE1 r2 #define I r14 @@ -107,7 +107,7 @@ sxt4 N = N sxt4 INCX = INCX } - ;; + ;; #endif #endif @@ -288,7 +288,7 @@ (p16) LDFD f82 = [DX], SIZE (p8 ) mov DMAX1 = DATA6 (p19) fabs f85 = f85 - } + } { .mmf nop.m 0 nop.m 0 @@ -563,7 +563,7 @@ } ;; .align 32 - + .L999: { .mmi setf.d f8 = DMAX1 diff --git a/kernel/ia64/lsame.S b/kernel/ia64/lsame.S index 3f2a7dbe0..26da80e15 100644 --- a/kernel/ia64/lsame.S +++ b/kernel/ia64/lsame.S @@ -58,7 +58,7 @@ ;; cmp4.eq p6, p7 = r15, r14 mov r8 = 1 - ;; + ;; (p7) mov r8 = 0 br.ret.sptk.many b0 diff --git a/kernel/ia64/nrm2.S b/kernel/ia64/nrm2.S index bb88cfb89..52dc3d849 100644 --- a/kernel/ia64/nrm2.S +++ b/kernel/ia64/nrm2.S @@ -153,7 +153,7 @@ .align 32 .L51: - (p16) LDFD f32 = [X], STRIDE + (p16) LDFD f32 = [X], STRIDE (p16) lfetch.nt1 [PRE1], INCX16 (p18) fma.d.s1 f8 = f34, f34, f8 @@ -199,7 +199,7 @@ (p16) LDFD f68 = [X], STRIDE (p18) fma.d.s1 f12 = f70, f70, f12 nop.b 0 - (p16) LDFD f71 = [X2], STRIDE + (p16) LDFD f71 = [X2], STRIDE (p18) fma.d.s1 f13 = f73, f73, f13 nop.b 0 ;; @@ -271,7 +271,7 @@ ;; #ifndef COMPLEX (p15) fma.d.s1 f14 = f46, f46, f14 - ;; + ;; #endif .align 32 diff --git a/kernel/ia64/qaxpy.S b/kernel/ia64/qaxpy.S index 2acb86b73..2cca4921e 100644 --- a/kernel/ia64/qaxpy.S +++ b/kernel/ia64/qaxpy.S @@ -74,7 +74,7 @@ #define PR r30 #define ARLC r31 - + #define ALPHA f8 #define SP r12 @@ -268,7 +268,7 @@ (p16) lfetch.nt1 [PRE1], INCX8 nop __LINE__ (p17) FMA f11 = ALPHA, f42, f90 - } + } ;; { .mmi (p16) LDFD f56 = [X1], INCX4 diff --git a/kernel/ia64/qgemm_kernel.S b/kernel/ia64/qgemm_kernel.S index 3c9fb6980..01209521b 100644 --- a/kernel/ia64/qgemm_kernel.S +++ b/kernel/ia64/qgemm_kernel.S @@ -115,10 +115,10 @@ stf.spill [r8] = f16, 32 stf.spill [r9] = f17, 32 mov PR = pr - ;; + ;; stf.spill [r8] = f18, 32 stf.spill [r9] = f19, 32 - ;; + ;; stf.spill [r8] = f20, 32 stf.spill [r9] = f21, 32 shr J = N, 3 @@ -126,17 +126,17 @@ stf.spill [r8] = f22, 32 stf.spill [r9] = f23, 32 mov AOFFSET = A - ;; + ;; stf.spill [r8] = f24, 32 stf.spill [r9] = f25, 32 cmp.ge p6, p0 = 0, J - ;; + ;; stf.spill [r8] = f26, 32 stf.spill [r9] = f27, 32 - ;; + ;; stf.spill [r8] = f28, 32 stf.spill [r9] = f29, 32 - ;; + ;; stf.spill [r8] = f30 stf.spill [r9] = f31 ld8 C = [r14], 8 @@ -183,7 +183,7 @@ nop __LINE__ #endif mov f80 = f0 - } + } { .mmf add C2 = LDC, C // coffset2 = c + 1 * ldc shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc @@ -568,7 +568,7 @@ FMA f106 = f34, f53, f106 // A3 * B6 nop __LINE__ } - ;; + ;; /* 12 */ { .mfb nop __LINE__ @@ -4264,7 +4264,7 @@ #else nop __LINE__ #endif - } + } { .mfi shladd C4 = LDC, 1, C2 mov f73 = f0 @@ -5893,7 +5893,7 @@ ;; { .mfi STFD [C4 ] = f89, 3 * SIZE - mov f89 = f0 + mov f89 = f0 #ifdef TRMMKERNEL shladd KK8 = KK, BASE_SHIFT, r0 #else @@ -8890,7 +8890,7 @@ ;; #endif ;; - { .mii + { .mii LDFD f32 = [AOFFSET], 1 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 @@ -8963,13 +8963,13 @@ ;; ldf.fill f16 = [SP], 32 ldf.fill f17 = [r9], 32 - ;; + ;; ldf.fill f18 = [SP], 32 ldf.fill f19 = [r9], 32 - ;; + ;; ldf.fill f20 = [SP], 32 ldf.fill f21 = [r9], 32 - ;; + ;; ldf.fill f22 = [SP], 32 ldf.fill f23 = [r9], 32 mov ar.lc = ARLC diff --git a/kernel/ia64/qgemv_n.S b/kernel/ia64/qgemv_n.S index 4eeac126c..228a00cc8 100644 --- a/kernel/ia64/qgemv_n.S +++ b/kernel/ia64/qgemv_n.S @@ -80,7 +80,7 @@ #define ARLC r30 #define PR r31 - + #define LDA7M8 r8 #define PREA r9 #define PREB r10 @@ -114,7 +114,7 @@ adds r15 = 24, SP adds r16 = 32, SP .body - ;; + ;; #ifdef XDOUBLE ld8 X = [r14], 16 @@ -179,10 +179,10 @@ .L11: shladd LDA7M8 = LDA, 3, r0 ;; - sub LDA7M8 = LDA, LDA7M8 + sub LDA7M8 = LDA, LDA7M8 ;; adds LDA7M8 = 8 * SIZE, LDA7M8 - ;; + ;; mov YLD1 = YY mov YST1 = YY adds YLD2 = 1 * SIZE, YY @@ -558,7 +558,7 @@ nop __LINE__ (p17) FMA f122 = ALPHA7, f88, f122 } - ;; + ;; { .mmf (p16) LDFD f84 = [AO5], LDA (p16) LDFD f85 = [AO6], LDA @@ -788,7 +788,7 @@ (p14) FMA f100 = ALPHA7, f84, f100 (p14) FMA f101 = ALPHA7, f85, f101 (p15) FMA f102 = ALPHA7, f86, f102 - ;; + ;; (p13) FMA f16 = ALPHA8, f88, f96 (p13) FMA f17 = ALPHA8, f89, f97 (p13) FMA f18 = ALPHA8, f90, f98 @@ -832,10 +832,10 @@ shladd LDA7M8 = LDA, 2, r0 ;; - sub LDA7M8 = LDA, LDA7M8 + sub LDA7M8 = LDA, LDA7M8 ;; adds LDA7M8 = 8 * SIZE, LDA7M8 - ;; + ;; mov YLD1 = YY mov YST1 = YY adds YLD2 = 2 * SIZE, YY @@ -1123,10 +1123,10 @@ shladd LDA7M8 = LDA, 1, r0 ;; - sub LDA7M8 = LDA, LDA7M8 + sub LDA7M8 = LDA, LDA7M8 ;; adds LDA7M8 = 8 * SIZE, LDA7M8 - ;; + ;; mov YLD1 = YY mov YST1 = YY adds YLD2 = 2 * SIZE, YY @@ -1334,7 +1334,7 @@ (p6) br.cond.dpnt .L990 ;; mov LDA7M8 = 8 * SIZE - ;; + ;; mov YLD1 = YY mov YST1 = YY adds YLD2 = 2 * SIZE, YY diff --git a/kernel/ia64/qgemv_t.S b/kernel/ia64/qgemv_t.S index f3fc693d7..5b27e0996 100644 --- a/kernel/ia64/qgemv_t.S +++ b/kernel/ia64/qgemv_t.S @@ -82,7 +82,7 @@ #define ARLC r30 #define PR r31 - + #ifdef DOUBLE #define RPREFETCH (16 * 3 + 8) #else @@ -150,7 +150,7 @@ adds YY2 = 4 * SIZE, BUFFER ;; shr I = M, 3 - ;; + ;; { .mmi adds I = -1, I cmp.eq p16, p0 = r0, r0 @@ -254,10 +254,10 @@ ;; shladd LDA7M8 = LDA, 3, r0 ;; - sub LDA7M8 = LDA, LDA7M8 + sub LDA7M8 = LDA, LDA7M8 ;; adds LDA7M8 = 8 * SIZE, LDA7M8 - ;; + ;; mov f8 = f0 mov f9 = f0 mov f10 = f0 @@ -386,8 +386,8 @@ (p16) FMA f14 = f96, f80, f14 (p16) FMA f15 = f96, f88, f15 ;; - (p16) FMA f8 = f97, f33, f8 - (p16) FMA f9 = f97, f41, f9 + (p16) FMA f8 = f97, f33, f8 + (p16) FMA f9 = f97, f41, f9 (p16) FMA f10 = f97, f49, f10 (p16) FMA f11 = f97, f57, f11 (p16) FMA f12 = f97, f65, f12 @@ -404,8 +404,8 @@ (p16) FMA f14 = f98, f82, f14 (p16) FMA f15 = f98, f90, f15 ;; - (p16) FMA f8 = f99, f35, f8 - (p16) FMA f9 = f99, f43, f9 + (p16) FMA f8 = f99, f35, f8 + (p16) FMA f9 = f99, f43, f9 (p16) FMA f10 = f99, f51, f10 (p16) FMA f11 = f99, f59, f11 (p16) FMA f12 = f99, f67, f12 @@ -422,8 +422,8 @@ (p16) FMA f14 = f100, f84, f14 (p16) FMA f15 = f100, f92, f15 ;; - (p16) FMA f8 = f101, f37, f8 - (p16) FMA f9 = f101, f45, f9 + (p16) FMA f8 = f101, f37, f8 + (p16) FMA f9 = f101, f45, f9 (p16) FMA f10 = f101, f53, f10 (p16) FMA f11 = f101, f61, f11 (p16) FMA f12 = f101, f69, f12 @@ -440,8 +440,8 @@ (p16) FMA f14 = f102, f86, f14 (p16) FMA f15 = f102, f94, f15 ;; - (p16) FMA f8 = f103, f39, f8 - (p16) FMA f9 = f103, f47, f9 + (p16) FMA f8 = f103, f39, f8 + (p16) FMA f9 = f103, f47, f9 (p16) FMA f10 = f103, f55, f10 (p16) FMA f11 = f103, f63, f11 (p16) FMA f12 = f103, f71, f12 @@ -563,8 +563,8 @@ (p13) FMA f14 = f96, f80, f14 (p13) FMA f15 = f96, f88, f15 ;; - (p13) FMA f8 = f97, f33, f8 - (p13) FMA f9 = f97, f41, f9 + (p13) FMA f8 = f97, f33, f8 + (p13) FMA f9 = f97, f41, f9 (p13) FMA f10 = f97, f49, f10 (p13) FMA f11 = f97, f57, f11 (p13) FMA f12 = f97, f65, f12 @@ -581,8 +581,8 @@ (p13) FMA f14 = f98, f82, f14 (p13) FMA f15 = f98, f90, f15 ;; - (p13) FMA f8 = f99, f35, f8 - (p13) FMA f9 = f99, f43, f9 + (p13) FMA f8 = f99, f35, f8 + (p13) FMA f9 = f99, f43, f9 (p13) FMA f10 = f99, f51, f10 (p13) FMA f11 = f99, f59, f11 (p13) FMA f12 = f99, f67, f12 @@ -599,8 +599,8 @@ (p14) FMA f14 = f100, f84, f14 (p14) FMA f15 = f100, f92, f15 ;; - (p14) FMA f8 = f101, f37, f8 - (p14) FMA f9 = f101, f45, f9 + (p14) FMA f8 = f101, f37, f8 + (p14) FMA f9 = f101, f45, f9 (p14) FMA f10 = f101, f53, f10 (p14) FMA f11 = f101, f61, f11 (p14) FMA f12 = f101, f69, f12 @@ -690,10 +690,10 @@ ;; shladd LDA7M8 = LDA, 2, r0 ;; - sub LDA7M8 = LDA, LDA7M8 + sub LDA7M8 = LDA, LDA7M8 ;; adds LDA7M8 = 8 * SIZE, LDA7M8 - ;; + ;; mov f8 = f0 mov f9 = f0 mov f10 = f0 @@ -778,8 +778,8 @@ (p16) FMA f10 = f96, f48, f10 (p16) FMA f11 = f96, f56, f11 ;; - (p16) FMA f8 = f97, f33, f8 - (p16) FMA f9 = f97, f41, f9 + (p16) FMA f8 = f97, f33, f8 + (p16) FMA f9 = f97, f41, f9 (p16) FMA f10 = f97, f49, f10 (p16) FMA f11 = f97, f57, f11 ;; @@ -788,8 +788,8 @@ (p16) FMA f10 = f98, f50, f10 (p16) FMA f11 = f98, f58, f11 ;; - (p16) FMA f8 = f99, f35, f8 - (p16) FMA f9 = f99, f43, f9 + (p16) FMA f8 = f99, f35, f8 + (p16) FMA f9 = f99, f43, f9 (p16) FMA f10 = f99, f51, f10 (p16) FMA f11 = f99, f59, f11 ;; @@ -799,8 +799,8 @@ (p16) FMA f11 = f100, f60, f11 ;; - (p16) FMA f8 = f101, f37, f8 - (p16) FMA f9 = f101, f45, f9 + (p16) FMA f8 = f101, f37, f8 + (p16) FMA f9 = f101, f45, f9 (p16) FMA f10 = f101, f53, f10 (p16) FMA f11 = f101, f61, f11 ;; @@ -809,8 +809,8 @@ (p16) FMA f10 = f102, f54, f10 (p16) FMA f11 = f102, f62, f11 ;; - (p16) FMA f8 = f103, f39, f8 - (p16) FMA f9 = f103, f47, f9 + (p16) FMA f8 = f103, f39, f8 + (p16) FMA f9 = f103, f47, f9 (p16) FMA f10 = f103, f55, f10 (p16) FMA f11 = f103, f63, f11 br.ctop.sptk.few .L22 @@ -888,8 +888,8 @@ (p13) FMA f10 = f96, f48, f10 (p13) FMA f11 = f96, f56, f11 ;; - (p13) FMA f8 = f97, f33, f8 - (p13) FMA f9 = f97, f41, f9 + (p13) FMA f8 = f97, f33, f8 + (p13) FMA f9 = f97, f41, f9 (p13) FMA f10 = f97, f49, f10 (p13) FMA f11 = f97, f57, f11 ;; @@ -898,8 +898,8 @@ (p13) FMA f10 = f98, f50, f10 (p13) FMA f11 = f98, f58, f11 ;; - (p13) FMA f8 = f99, f35, f8 - (p13) FMA f9 = f99, f43, f9 + (p13) FMA f8 = f99, f35, f8 + (p13) FMA f9 = f99, f43, f9 (p13) FMA f10 = f99, f51, f10 (p13) FMA f11 = f99, f59, f11 ;; @@ -908,8 +908,8 @@ (p14) FMA f10 = f100, f52, f10 (p14) FMA f11 = f100, f60, f11 ;; - (p14) FMA f8 = f101, f37, f8 - (p14) FMA f9 = f101, f45, f9 + (p14) FMA f8 = f101, f37, f8 + (p14) FMA f9 = f101, f45, f9 (p14) FMA f10 = f101, f53, f10 (p14) FMA f11 = f101, f61, f11 ;; @@ -962,10 +962,10 @@ ;; shladd LDA7M8 = LDA, 1, r0 ;; - sub LDA7M8 = LDA, LDA7M8 + sub LDA7M8 = LDA, LDA7M8 ;; adds LDA7M8 = 8 * SIZE, LDA7M8 - ;; + ;; mov f8 = f0 mov f9 = f0 mov f10 = f0 @@ -1028,26 +1028,26 @@ (p16) FMA f8 = f96, f32, f8 (p16) FMA f9 = f96, f40, f9 ;; - (p16) FMA f8 = f97, f33, f8 - (p16) FMA f9 = f97, f41, f9 + (p16) FMA f8 = f97, f33, f8 + (p16) FMA f9 = f97, f41, f9 ;; (p16) FMA f8 = f98, f34, f8 (p16) FMA f9 = f98, f42, f9 ;; - (p16) FMA f8 = f99, f35, f8 - (p16) FMA f9 = f99, f43, f9 + (p16) FMA f8 = f99, f35, f8 + (p16) FMA f9 = f99, f43, f9 ;; (p16) FMA f8 = f100, f36, f8 (p16) FMA f9 = f100, f44, f9 ;; - (p16) FMA f8 = f101, f37, f8 - (p16) FMA f9 = f101, f45, f9 + (p16) FMA f8 = f101, f37, f8 + (p16) FMA f9 = f101, f45, f9 ;; (p16) FMA f8 = f102, f38, f8 (p16) FMA f9 = f102, f46, f9 ;; - (p16) FMA f8 = f103, f39, f8 - (p16) FMA f9 = f103, f47, f9 + (p16) FMA f8 = f103, f39, f8 + (p16) FMA f9 = f103, f47, f9 br.ctop.sptk.few .L32 ;; .align 16 @@ -1103,20 +1103,20 @@ (p13) FMA f8 = f96, f32, f8 (p13) FMA f9 = f96, f40, f9 ;; - (p13) FMA f8 = f97, f33, f8 - (p13) FMA f9 = f97, f41, f9 + (p13) FMA f8 = f97, f33, f8 + (p13) FMA f9 = f97, f41, f9 ;; (p13) FMA f8 = f98, f34, f8 (p13) FMA f9 = f98, f42, f9 ;; - (p13) FMA f8 = f99, f35, f8 - (p13) FMA f9 = f99, f43, f9 + (p13) FMA f8 = f99, f35, f8 + (p13) FMA f9 = f99, f43, f9 ;; (p14) FMA f8 = f100, f36, f8 (p14) FMA f9 = f100, f44, f9 ;; - (p14) FMA f8 = f101, f37, f8 - (p14) FMA f9 = f101, f45, f9 + (p14) FMA f8 = f101, f37, f8 + (p14) FMA f9 = f101, f45, f9 ;; (p15) FMA f8 = f102, f38, f8 (p15) FMA f9 = f102, f46, f9 @@ -1202,19 +1202,19 @@ ;; (p16) FMA f8 = f96, f32, f8 ;; - (p16) FMA f8 = f97, f33, f8 + (p16) FMA f8 = f97, f33, f8 ;; (p16) FMA f8 = f98, f34, f8 ;; - (p16) FMA f8 = f99, f35, f8 + (p16) FMA f8 = f99, f35, f8 ;; (p16) FMA f8 = f100, f36, f8 ;; - (p16) FMA f8 = f101, f37, f8 + (p16) FMA f8 = f101, f37, f8 ;; (p16) FMA f8 = f102, f38, f8 ;; - (p16) FMA f8 = f103, f39, f8 + (p16) FMA f8 = f103, f39, f8 br.ctop.sptk.few .L42 ;; .align 16 @@ -1260,15 +1260,15 @@ ;; (p13) FMA f8 = f96, f32, f8 ;; - (p13) FMA f8 = f97, f33, f8 + (p13) FMA f8 = f97, f33, f8 ;; (p13) FMA f8 = f98, f34, f8 ;; - (p13) FMA f8 = f99, f35, f8 + (p13) FMA f8 = f99, f35, f8 ;; (p14) FMA f8 = f100, f36, f8 ;; - (p14) FMA f8 = f101, f37, f8 + (p14) FMA f8 = f101, f37, f8 ;; (p15) FMA f8 = f102, f38, f8 ;; diff --git a/kernel/ia64/qscal.S b/kernel/ia64/qscal.S index 3f978afde..7a45d9a67 100644 --- a/kernel/ia64/qscal.S +++ b/kernel/ia64/qscal.S @@ -75,7 +75,7 @@ } ;; { .mmi - mov XX = X1 + mov XX = X1 mov PR = pr } { .mmi diff --git a/kernel/ia64/saxpy.S b/kernel/ia64/saxpy.S index c3b2c1b04..fb8f9ff6e 100644 --- a/kernel/ia64/saxpy.S +++ b/kernel/ia64/saxpy.S @@ -64,7 +64,7 @@ #define XB r29 #define PR r30 #define ARLC r31 - + #define ALPHA f8 #define ALPHA_P f9 @@ -92,7 +92,7 @@ } { .mib (p10) adds N = -1, N - mov YYY = Y + mov YYY = Y (p7) br.ret.sptk.many b0 } ;; @@ -548,7 +548,7 @@ (p18) fpma f14 = ALPHA_P, f52, f100 } { .mmi - (p17) ldf8 f66 = [X], 2 * SIZE + (p17) ldf8 f66 = [X], 2 * SIZE (p16) ldf8 f86 = [Y], 2 * SIZE } ;; @@ -1485,7 +1485,7 @@ (p16) LDFD f71 = [X], INCX (p16) LDFD f119 = [Y], INCY (p17) FMA f13 = ALPHA, f48, f96 - } + } ;; { .mmi (p18) STFD [Y1] = f14 diff --git a/kernel/ia64/scal.S b/kernel/ia64/scal.S index e3d93ddc5..ad59b5808 100644 --- a/kernel/ia64/scal.S +++ b/kernel/ia64/scal.S @@ -81,7 +81,7 @@ .body ;; { .mmi - mov XX = X1 + mov XX = X1 (p10) LDFD f32 = [X1], INCX mov PR = pr } diff --git a/kernel/ia64/sdot.S b/kernel/ia64/sdot.S index 5a058e708..c611c113c 100644 --- a/kernel/ia64/sdot.S +++ b/kernel/ia64/sdot.S @@ -343,7 +343,7 @@ (p12) FMA f13 = f41, f43, f13 (p12) FMA f14 = f44, f46, f14 (p12) FMA f15 = f45, f47, f15 - ;; + ;; (p13) FMA f8 = f48, f50, f8 (p13) FMA f9 = f49, f51, f9 (p13) FMA f10 = f52, f54, f10 @@ -514,7 +514,7 @@ (p12) FMA f13 = f41, f43, f13 (p12) FMA f14 = f44, f46, f14 (p12) FMA f15 = f45, f47, f15 - ;; + ;; (p13) FMA f8 = f48, f50, f8 (p13) FMA f9 = f49, f51, f9 (p13) FMA f10 = f52, f54, f10 @@ -745,7 +745,7 @@ (p12) FMA f13 = f41, f43, f13 (p12) FMA f14 = f44, f46, f14 (p12) FMA f15 = f45, f47, f15 - ;; + ;; (p13) FMA f8 = f48, f50, f8 (p13) FMA f9 = f49, f51, f9 (p13) FMA f10 = f52, f54, f10 @@ -916,7 +916,7 @@ (p12) FMA f13 = f41, f43, f13 (p12) FMA f14 = f44, f46, f14 (p12) FMA f15 = f45, f47, f15 - ;; + ;; (p13) FMA f8 = f48, f50, f8 (p13) FMA f9 = f49, f51, f9 (p13) FMA f10 = f52, f54, f10 @@ -1148,7 +1148,7 @@ (p12) FMA f13 = f41, f43, f13 (p12) FMA f14 = f44, f46, f14 (p12) FMA f15 = f45, f47, f15 - ;; + ;; (p13) FMA f8 = f48, f50, f8 (p13) FMA f9 = f49, f51, f9 (p13) FMA f10 = f52, f54, f10 diff --git a/kernel/ia64/sgemv_n.S b/kernel/ia64/sgemv_n.S index f5949e608..e44a8eced 100644 --- a/kernel/ia64/sgemv_n.S +++ b/kernel/ia64/sgemv_n.S @@ -84,13 +84,13 @@ #define AO61 loc13 #define AO71 loc14 #define AO81 loc15 - + #define PREB r8 #define ARLC r29 #define PR r30 #define ARPFS r31 - + #ifdef DOUBLE #define RPREFETCH (16 * 3 + 8) #else @@ -120,17 +120,17 @@ ;; stf.spill [r8] = f16, 32 stf.spill [r9] = f17, 32 - ;; + ;; stf.spill [r8] = f18, 32 stf.spill [r9] = f19, 32 - ;; + ;; stf.spill [r8] = f20, 32 stf.spill [r9] = f21, 32 ;; stf.spill [r8] = f22 stf.spill [r9] = f23 .body - ;; + ;; ld8 Y = [r14] ld8 INCY = [r15] @@ -3225,15 +3225,15 @@ ldf.fill f16 = [SP], 32 ldf.fill f17 = [r9], 32 mov ar.lc = ARLC - ;; + ;; ldf.fill f18 = [SP], 32 ldf.fill f19 = [r9], 32 mov pr = PR, -1 - ;; + ;; ldf.fill f20 = [SP], 32 ldf.fill f21 = [r9], 32 mov ar.pfs = ARPFS - ;; + ;; ldf.fill f22 = [SP], 32 ldf.fill f23 = [r9] br.ret.sptk.many b0 diff --git a/kernel/ia64/symv_U.S b/kernel/ia64/symv_U.S index 4f6c45143..aa125d5bb 100644 --- a/kernel/ia64/symv_U.S +++ b/kernel/ia64/symv_U.S @@ -73,14 +73,14 @@ #define A21 loc5 #define A31 loc6 #define A41 loc7 - + #define PREX r8 #define PREY r9 #define ARLC r29 #define PR r30 #define ARPFS r31 - + #ifdef DOUBLE #define RPREFETCH (16 * 3 + 4) #else @@ -119,17 +119,17 @@ ;; stf.spill [r8] = f16, 32 stf.spill [r9] = f17, 32 - ;; + ;; stf.spill [r8] = f18, 32 stf.spill [r9] = f19, 32 - ;; + ;; stf.spill [r8] = f20, 32 stf.spill [r9] = f21, 32 ;; stf.spill [r8] = f22 stf.spill [r9] = f23 .body - ;; + ;; ld8 BUFFER = [r14] ;; shladd LDA = LDA, BASE_SHIFT, r0 @@ -447,15 +447,15 @@ ldf.fill f16 = [SP], 32 ldf.fill f17 = [r9], 32 mov ar.lc = ARLC - ;; + ;; ldf.fill f18 = [SP], 32 ldf.fill f19 = [r9], 32 mov pr = PR, -1 - ;; + ;; ldf.fill f20 = [SP], 32 ldf.fill f21 = [r9], 32 mov ar.pfs = ARPFS - ;; + ;; ldf.fill f22 = [SP], 32 ldf.fill f23 = [r9] br.ret.sptk.many b0 diff --git a/kernel/ia64/trsm_kernel_LN.S b/kernel/ia64/trsm_kernel_LN.S index 9b1f2b269..6c18b7205 100644 --- a/kernel/ia64/trsm_kernel_LN.S +++ b/kernel/ia64/trsm_kernel_LN.S @@ -365,7 +365,7 @@ ;; add AOFFSET = r2, AORIG shladd BOFFSET = r2, 3, B - ;; + ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET @@ -673,7 +673,7 @@ FNMA f112 = f104, f17, f112 ;; FNMA f120 = f104, f18, f120 - ;; + ;; FMPY f112 = f112, f19 ;; FNMA f120 = f112, f20, f120 @@ -1207,7 +1207,7 @@ ;; shladd AOFFSET = r2, 1, AORIG shladd BOFFSET = r2, 3, B - ;; + ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET @@ -1750,7 +1750,7 @@ ;; FNMA f120 = f104, f18, f120 FNMA f121 = f105, f18, f121 - ;; + ;; FMPY f112 = f112, f19 FMPY f113 = f113, f19 ;; @@ -2584,7 +2584,7 @@ ;; shladd AOFFSET = r2, 2, AORIG shladd BOFFSET = r2, 3, B - ;; + ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET @@ -3561,7 +3561,7 @@ FNMA f121 = f105, f18, f121 FNMA f122 = f106, f18, f122 FNMA f123 = f107, f18, f123 - ;; + ;; FMPY f112 = f112, f19 FMPY f113 = f113, f19 FMPY f114 = f114, f19 @@ -4487,7 +4487,7 @@ FMA f106 = f34, f53, f106 // A3 * B6 nop __LINE__ } - ;; + ;; /* 12 */ { .mfb FMA f114 = f34, f54, f114 // A3 * B7 @@ -5127,7 +5127,7 @@ ;; shladd AOFFSET = r2, 3, AORIG shladd BOFFSET = r2, 3, B - ;; + ;; LDFPD f32, f33 = [BOFFSET], 2 * SIZE ;; @@ -6628,7 +6628,7 @@ ;; add AOFFSET = r2, AORIG shladd BOFFSET = r2, 2, B - ;; + ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET @@ -7055,7 +7055,7 @@ ;; shladd AOFFSET = r2, 1, AORIG shladd BOFFSET = r2, 2, B - ;; + ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET @@ -7699,7 +7699,7 @@ ;; shladd AOFFSET = r2, 2, AORIG shladd BOFFSET = r2, 2, B - ;; + ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET @@ -8837,7 +8837,7 @@ ;; shladd AOFFSET = r2, 3, AORIG shladd BOFFSET = r2, 2, B - ;; + ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET @@ -9228,7 +9228,7 @@ FMPY f80 = f80, f21 FMPY f88 = f88, f21 ;; - + adds BOFFSET = 24 * SIZE, BOFFSET adds BOFFSET2 = 24 * SIZE, BOFFSET2 ;; @@ -9579,7 +9579,7 @@ ;; LDFPD f37, f38 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET - ;; + ;; LDFPD f39, f40 = [BOFFSET] adds BOFFSET = 5 * SIZE, BOFFSET ;; @@ -10276,7 +10276,7 @@ ;; add AOFFSET = r2, AORIG shladd BOFFSET = r2, 1, B - ;; + ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET @@ -10556,7 +10556,7 @@ ;; shladd AOFFSET = r2, 1, AORIG shladd BOFFSET = r2, 1, B - ;; + ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET @@ -10954,7 +10954,7 @@ ;; shladd AOFFSET = r2, 2, AORIG shladd BOFFSET = r2, 1, B - ;; + ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET @@ -11612,7 +11612,7 @@ ;; shladd AOFFSET = r2, 3, AORIG shladd BOFFSET = r2, 1, B - ;; + ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET @@ -12525,7 +12525,7 @@ ;; add AOFFSET = r2, AORIG add BOFFSET = r2, B - ;; + ;; #endif #if defined(LN) || defined(LT) @@ -12721,7 +12721,7 @@ ;; shladd AOFFSET = r2, 1, AORIG add BOFFSET = r2, B - ;; + ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET @@ -13019,7 +13019,7 @@ ;; shladd AOFFSET = r2, 2, AORIG add BOFFSET = r2, B - ;; + ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET @@ -13493,7 +13493,7 @@ ;; shladd AOFFSET = r2, 3, AORIG add BOFFSET = r2, B - ;; + ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET @@ -14011,13 +14011,13 @@ ;; ldf.fill f16 = [SP], 32 ldf.fill f17 = [r9], 32 - ;; + ;; ldf.fill f18 = [SP], 32 ldf.fill f19 = [r9], 32 - ;; + ;; ldf.fill f20 = [SP], 32 ldf.fill f21 = [r9], 32 - ;; + ;; mov ar.lc = ARLC ;; mov pr = PR, -1 diff --git a/kernel/ia64/trsm_kernel_LT.S b/kernel/ia64/trsm_kernel_LT.S index eef4e000c..c11167eb1 100644 --- a/kernel/ia64/trsm_kernel_LT.S +++ b/kernel/ia64/trsm_kernel_LT.S @@ -171,7 +171,7 @@ cmp.eq p6, p7 = 0, I mov AOFFSET = A mov f80 = f0 - } + } { .mmf add C2 = LDC, C // coffset2 = c + 1 * ldc shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc @@ -496,7 +496,7 @@ FMA f106 = f34, f53, f106 // A3 * B6 nop __LINE__ } - ;; + ;; /* 12 */ { .mfb FMA f114 = f34, f54, f114 // A3 * B7 @@ -4472,7 +4472,7 @@ LDFD f21 = [BOFFSET] adds BOFFSET = -63 * SIZE, BOFFSET ;; - + FMPY f64 = f64, f32 FMPY f65 = f65, f32 @@ -5497,7 +5497,7 @@ ;; FNMA f112 = f104, f17, f112 FNMA f113 = f105, f17, f113 - ;; + ;; { .mfi STFD [AOFFSET] = f64, SIZE FNMA f120 = f104, f18, f120 @@ -6029,7 +6029,7 @@ FNMA f112 = f104, f17, f112 ;; FNMA f120 = f104, f18, f120 - ;; + ;; FMPY f112 = f112, f19 ;; FNMA f120 = f112, f20, f120 @@ -6118,7 +6118,7 @@ setf.d f72 = r0 mov f80 = f0 shr I = M, 3 - } + } { .mfi mov C1 = C // coffset1 = c + 0 * ldc mov f88 = f0 @@ -7059,7 +7059,7 @@ ;; LDFPD f37, f38 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET - ;; + ;; LDFPD f39, f40 = [BOFFSET] adds BOFFSET = 5 * SIZE, BOFFSET ;; @@ -8548,7 +8548,7 @@ ;; { .mfi shr I = M, 3 - } + } { .mfi mov C1 = C // coffset1 = c + 0 * ldc #ifdef LT @@ -9987,7 +9987,7 @@ { .mfi shr I = M, 3 - } + } { .mfi mov C1 = C // coffset1 = c + 0 * ldc #ifdef LT @@ -11010,13 +11010,13 @@ ;; ldf.fill f16 = [SP], 32 ldf.fill f17 = [r9], 32 - ;; + ;; ldf.fill f18 = [SP], 32 ldf.fill f19 = [r9], 32 - ;; + ;; ldf.fill f20 = [SP], 32 ldf.fill f21 = [r9], 32 - ;; + ;; mov ar.lc = ARLC ;; mov pr = PR, -1 diff --git a/kernel/ia64/trsm_kernel_RT.S b/kernel/ia64/trsm_kernel_RT.S index f3482aecd..5e0911275 100644 --- a/kernel/ia64/trsm_kernel_RT.S +++ b/kernel/ia64/trsm_kernel_RT.S @@ -234,7 +234,7 @@ { .mfi shr I = M, 3 - } + } { .mfi mov C1 = C // coffset1 = c + 0 * ldc #ifdef LN @@ -467,7 +467,7 @@ ;; shladd AOFFSET = r2, 3, AORIG add BOFFSET = r2, B - ;; + ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET @@ -1103,7 +1103,7 @@ ;; shladd AOFFSET = r2, 2, AORIG add BOFFSET = r2, B - ;; + ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET @@ -1463,7 +1463,7 @@ ;; shladd AOFFSET = r2, 1, AORIG add BOFFSET = r2, B - ;; + ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET @@ -1711,7 +1711,7 @@ ;; add AOFFSET = r2, AORIG add BOFFSET = r2, B - ;; + ;; #endif #if defined(LN) || defined(LT) @@ -1846,7 +1846,7 @@ ;; { .mfi shr I = M, 3 - } + } { .mfi mov C1 = C // coffset1 = c + 0 * ldc #ifdef LN @@ -2184,7 +2184,7 @@ ;; shladd AOFFSET = r2, 3, AORIG shladd BOFFSET = r2, 1, B - ;; + ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET @@ -3097,7 +3097,7 @@ ;; shladd AOFFSET = r2, 2, AORIG shladd BOFFSET = r2, 1, B - ;; + ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET @@ -3585,7 +3585,7 @@ ;; shladd AOFFSET = r2, 1, AORIG shladd BOFFSET = r2, 1, B - ;; + ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET @@ -3914,7 +3914,7 @@ ;; add AOFFSET = r2, AORIG shladd BOFFSET = r2, 1, B - ;; + ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET @@ -4109,7 +4109,7 @@ setf.d f72 = r0 mov f80 = f0 shr I = M, 3 - } + } { .mfi mov C1 = C // coffset1 = c + 0 * ldc mov f88 = f0 @@ -4656,7 +4656,7 @@ ;; shladd AOFFSET = r2, 3, AORIG shladd BOFFSET = r2, 2, B - ;; + ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET @@ -5047,7 +5047,7 @@ FMPY f80 = f80, f21 FMPY f88 = f88, f21 ;; - + adds BOFFSET = 24 * SIZE, BOFFSET adds BOFFSET2 = 24 * SIZE, BOFFSET2 ;; @@ -5398,7 +5398,7 @@ ;; LDFPD f37, f38 = [BOFFSET] adds BOFFSET = 4 * SIZE, BOFFSET - ;; + ;; LDFPD f39, f40 = [BOFFSET] adds BOFFSET = 5 * SIZE, BOFFSET ;; @@ -6175,7 +6175,7 @@ ;; shladd AOFFSET = r2, 2, AORIG shladd BOFFSET = r2, 2, B - ;; + ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET @@ -6981,7 +6981,7 @@ ;; shladd AOFFSET = r2, 1, AORIG shladd BOFFSET = r2, 2, B - ;; + ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET @@ -7469,7 +7469,7 @@ ;; add AOFFSET = r2, AORIG shladd BOFFSET = r2, 2, B - ;; + ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET @@ -7786,7 +7786,7 @@ mov AOFFSET = A #endif mov f80 = f0 - } + } { .mmf add C2 = LDC, C // coffset2 = c + 1 * ldc shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc @@ -8153,7 +8153,7 @@ FMA f106 = f34, f53, f106 // A3 * B6 nop __LINE__ } - ;; + ;; /* 12 */ { .mfb FMA f114 = f34, f54, f114 // A3 * B7 @@ -8798,7 +8798,7 @@ ;; shladd AOFFSET = r2, 3, AORIG shladd BOFFSET = r2, 3, B - ;; + ;; #endif #if defined(LN) || defined(LT) @@ -13126,7 +13126,7 @@ ;; shladd AOFFSET = r2, 2, AORIG shladd BOFFSET = r2, 3, B - ;; + ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET @@ -13937,7 +13937,7 @@ LDFD f21 = [BOFFSET] adds BOFFSET = -63 * SIZE, BOFFSET ;; - + FMPY f64 = f64, f32 FMPY f65 = f65, f32 @@ -14103,7 +14103,7 @@ FNMA f121 = f105, f18, f121 FNMA f122 = f106, f18, f122 FNMA f123 = f107, f18, f123 - ;; + ;; FMPY f112 = f112, f19 FMPY f113 = f113, f19 FMPY f114 = f114, f19 @@ -14972,7 +14972,7 @@ ;; shladd AOFFSET = r2, 1, AORIG shladd BOFFSET = r2, 3, B - ;; + ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET @@ -15515,7 +15515,7 @@ ;; FNMA f120 = f104, f18, f120 FNMA f121 = f105, f18, f121 - ;; + ;; FMPY f112 = f112, f19 FMPY f113 = f113, f19 ;; @@ -16061,7 +16061,7 @@ ;; add AOFFSET = r2, AORIG shladd BOFFSET = r2, 3, B - ;; + ;; #endif adds AOFFSET2 = 4 * SIZE, AOFFSET adds BOFFSET2 = 4 * SIZE, BOFFSET @@ -16369,7 +16369,7 @@ FNMA f112 = f104, f17, f112 ;; FNMA f120 = f104, f18, f120 - ;; + ;; FMPY f112 = f112, f19 ;; FNMA f120 = f112, f20, f120 @@ -16671,13 +16671,13 @@ ;; ldf.fill f16 = [SP], 32 ldf.fill f17 = [r9], 32 - ;; + ;; ldf.fill f18 = [SP], 32 ldf.fill f19 = [r9], 32 - ;; + ;; ldf.fill f20 = [SP], 32 ldf.fill f21 = [r9], 32 - ;; + ;; mov ar.lc = ARLC ;; mov pr = PR, -1 diff --git a/kernel/ia64/xdot.S b/kernel/ia64/xdot.S index 9322b4bc0..cdf85cc02 100644 --- a/kernel/ia64/xdot.S +++ b/kernel/ia64/xdot.S @@ -112,7 +112,7 @@ ;; shl r26 = r26, ZBASE_SHIFT shl r27 = r27, ZBASE_SHIFT - ;; + ;; (p6) add X1 = r26, X1 (p7) add Y1 = r27, Y1 ;; @@ -481,12 +481,12 @@ (p13) FMA f13 = f43, f58, f13 (p13) FMA f14 = f42, f59, f14 (p13) FMA f15 = f43, f59, f15 - ;; + ;; (p14) FMA f8 = f44, f60, f8 (p14) FMA f9 = f45, f60, f9 (p14) FMA f10 = f44, f61, f10 (p14) FMA f11 = f45, f61, f11 - ;; + ;; .align 32 .L999: @@ -505,7 +505,7 @@ #endif ;; .align 32 - + .L1000: #ifdef F_INTERFACE STFD [r32] = f8, SIZE diff --git a/kernel/ia64/zcopy.S b/kernel/ia64/zcopy.S index 91d90e0a8..90c09bbfe 100644 --- a/kernel/ia64/zcopy.S +++ b/kernel/ia64/zcopy.S @@ -75,7 +75,7 @@ PROLOGUE .prologue PROFCODE - + { .mmi shladd INCX = INCX, ZBASE_SHIFT, r0 shladd INCY = INCY, ZBASE_SHIFT, r0 diff --git a/kernel/ia64/zdot.S b/kernel/ia64/zdot.S index 5c77ce6ef..35032b767 100644 --- a/kernel/ia64/zdot.S +++ b/kernel/ia64/zdot.S @@ -98,7 +98,7 @@ LDINT N = [N] LDINT INCX = [INCX] LDINT INCY = [INCY] - ;; + ;; #ifndef USE64BITINT sxt4 N = N sxt4 INCX = INCX @@ -442,7 +442,7 @@ (p12) FMA f13 = f44, f47, f13 (p12) FMA f14 = f45, f46, f14 (p12) FMA f15 = f45, f47, f15 - ;; + ;; (p13) FMA f8 = f48, f50, f8 (p13) FMA f9 = f48, f51, f9 (p13) FMA f10 = f49, f50, f10 @@ -451,7 +451,7 @@ (p13) FMA f13 = f52, f55, f13 (p13) FMA f14 = f53, f54, f14 (p13) FMA f15 = f53, f55, f15 - ;; + ;; (p14) FMA f8 = f56, f58, f8 (p14) FMA f9 = f56, f59, f9 (p14) FMA f10 = f57, f58, f10 @@ -474,7 +474,7 @@ #endif ;; .align 32 - + .L1000: #if defined(F_INTERFACE) && defined(RETURN_BY_STACK) STFD [r32] = f8, SIZE diff --git a/kernel/ia64/zgemm3m_kernel.S b/kernel/ia64/zgemm3m_kernel.S index 5adb66a3c..dc6d252b0 100644 --- a/kernel/ia64/zgemm3m_kernel.S +++ b/kernel/ia64/zgemm3m_kernel.S @@ -117,11 +117,11 @@ nop __LINE__ nop __LINE__ } - ;; + ;; stf.spill [r8] = f18, 32 stf.spill [r9] = f19, 32 shr J = N, 3 - ;; + ;; stf.spill [r8] = f20, 32 stf.spill [r9] = f21, 32 shladd LDC = LDC, ZBASE_SHIFT, r0 @@ -129,17 +129,17 @@ stf.spill [r8] = f22, 32 stf.spill [r9] = f23, 32 mov AOFFSET = A - ;; + ;; stf.spill [r8] = f24, 32 stf.spill [r9] = f25, 32 cmp.ge p6, p0 = 0, J - ;; + ;; stf.spill [r8] = f26, 32 stf.spill [r9] = f27, 32 - ;; + ;; stf.spill [r8] = f28, 32 stf.spill [r9] = f29, 32 - ;; + ;; stf.spill [r8] = f30 stf.spill [r9] = f31 (p6) br.cond.dpnt .L050 @@ -162,7 +162,7 @@ cmp.eq p6, p7 = 0, I nop __LINE__ mov f80 = f0 - } + } { .mmf add C2 = LDC, C // coffset2 = c + 1 * ldc shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc @@ -492,7 +492,7 @@ FMA f106 = f34, f53, f106 // A3 * B6 nop __LINE__ } - ;; + ;; /* 12 */ { .mfi FMA f114 = f34, f54, f114 // A3 * B7 @@ -3450,7 +3450,7 @@ cmp.eq p6, p7 = 0, I mov f65 = f0 nop __LINE__ - } + } { .mfi shladd C4 = LDC, 1, C2 mov f73 = f0 @@ -6705,7 +6705,7 @@ adds L = 1, K } ;; - { .mii + { .mii LDFD f32 = [AOFFSET], 1 * SIZE tbit.z p12, p0 = L, 0 shr L = L, 1 @@ -6774,13 +6774,13 @@ ;; ldf.fill f16 = [SP], 32 ldf.fill f17 = [r9], 32 - ;; + ;; ldf.fill f18 = [SP], 32 ldf.fill f19 = [r9], 32 - ;; + ;; ldf.fill f20 = [SP], 32 ldf.fill f21 = [r9], 32 - ;; + ;; ldf.fill f22 = [SP], 32 ldf.fill f23 = [r9], 32 mov ar.lc = ARLC diff --git a/kernel/ia64/zgemm_beta.S b/kernel/ia64/zgemm_beta.S index 00cf3e95f..654cb86e0 100644 --- a/kernel/ia64/zgemm_beta.S +++ b/kernel/ia64/zgemm_beta.S @@ -77,7 +77,7 @@ { .mfb cmp.ge p6, p0 = 0, N fcmp.eq p0, p14 = BETA_R, f0 - (p6) br.ret.sptk.many b0 + (p6) br.ret.sptk.many b0 } ;; .body @@ -95,7 +95,7 @@ { .mmb cmp.ge p6, p0 = 0, M adds I = -1, I - (p6) br.ret.sptk.many b0 + (p6) br.ret.sptk.many b0 } ;; { .mbb @@ -199,7 +199,7 @@ { .mmi (p12) STFD [CO1] = f0, 1 * SIZE (p12) STFD [CO2] = f0, 1 * SIZE - (p12) adds CO3 = 8 * SIZE, CO3 + (p12) adds CO3 = 8 * SIZE, CO3 } ;; { .mmi @@ -397,7 +397,7 @@ { .mmi (p12) LDFD f34 = [CO1], 1 * SIZE (p12) LDFD f38 = [CO2], 1 * SIZE - (p12) adds CO3 = 8 * SIZE, CO3 + (p12) adds CO3 = 8 * SIZE, CO3 } ;; { .mmi @@ -462,7 +462,7 @@ (p12) STFD [DO2] = f36, 1 * SIZE } { .mmf - (p12) adds DO3 = 8 * SIZE, DO3 + (p12) adds DO3 = 8 * SIZE, DO3 } ;; { .mmf @@ -470,7 +470,7 @@ (p12) STFD [DO2] = f37, 1 * SIZE } { .mmf - (p13) adds DO3 = 4 * SIZE, DO3 + (p13) adds DO3 = 4 * SIZE, DO3 } ;; { .mmf diff --git a/kernel/ia64/zgemm_kernel.S b/kernel/ia64/zgemm_kernel.S index bfdb92cb8..34207c50b 100644 --- a/kernel/ia64/zgemm_kernel.S +++ b/kernel/ia64/zgemm_kernel.S @@ -462,7 +462,7 @@ FMA_B f99 = f34, f53, f99 // A3 * B6 nop __LINE__ } - ;; + ;; /* 12 */ { .mfb FMA f114 = f34, f54, f114 // A3 * B7 @@ -2112,7 +2112,7 @@ mov f82 = f0 tbit.z p12, p0 = L, 0 } - { .mfi + { .mfi LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f83 = f0 shr L = L, 1 @@ -2134,7 +2134,7 @@ mov f114 = f0 mov ar.lc = L } - { .mfi + { .mfi adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET mov f115 = f0 nop __LINE__ @@ -2650,7 +2650,7 @@ FMA f120 = ALPHA_R, f112, f120 nop __LINE__ } - ;; + ;; { .mfb STFD [C1] = f73, SIZE FCALC_C f105 = ALPHA_R, f97, f105 @@ -2661,7 +2661,7 @@ FCALC_C f121 = ALPHA_R, f113, f121 nop __LINE__ } - ;; + ;; { .mfb STFD [C1] = f74, SIZE FMA f106 = ALPHA_R, f98, f106 @@ -2672,7 +2672,7 @@ FMA f122 = ALPHA_R, f114, f122 nop __LINE__ } - ;; + ;; { .mfb STFD [C1] = f75, SIZE FCALC_C f107 = ALPHA_R, f99, f107 @@ -3108,7 +3108,7 @@ } ;; { .mfi - LDFPD f54, f55 = [BOFFSET], 2 * SIZE + LDFPD f54, f55 = [BOFFSET], 2 * SIZE mov f120 = f0 mov ar.lc = L } @@ -5490,7 +5490,7 @@ STFD [C2] = f88, SIZE mov f80 = f0 } - ;; + ;; { .mmi STFD [C1] = f73, SIZE STFD [C2] = f89, SIZE @@ -5585,7 +5585,7 @@ nop __LINE__ #endif } - ;; + ;; { .mmi STFD [C1] = f73, SIZE STFD [C2] = f89, SIZE @@ -6772,7 +6772,7 @@ setf.d f64 = r0 mov f80 = f0 } - ;; + ;; { .mmf STFD [C1] = f73, SIZE setf.d f65 = r0 @@ -6807,7 +6807,7 @@ setf.d f64 = r0 mov f80 = f0 } - ;; + ;; { .mmf STFD [C1] = f73, SIZE setf.d f65 = r0 diff --git a/kernel/ia64/zgemm_ncopy.S b/kernel/ia64/zgemm_ncopy.S index e7950e990..e62a2d8e3 100644 --- a/kernel/ia64/zgemm_ncopy.S +++ b/kernel/ia64/zgemm_ncopy.S @@ -44,7 +44,7 @@ #define LD LDF8 #define ST STF8_NTA - + #define TEMP r2 #define I r14 @@ -77,7 +77,7 @@ .prologue PROFCODE - .body + .body { .mii shladd LDA= LDA, ZBASE_SHIFT, r0 mov PR = pr diff --git a/kernel/ia64/zgemv_n.S b/kernel/ia64/zgemv_n.S index b3027a68d..92294ebe1 100644 --- a/kernel/ia64/zgemv_n.S +++ b/kernel/ia64/zgemv_n.S @@ -67,7 +67,7 @@ #define YST2 r27 #define YY r28 #define XX r9 - + #define RPRE1 loc0 #define RPRE2 loc1 #define RPRE3 loc2 @@ -94,7 +94,7 @@ #define ARLC r29 #define PR r30 #define ARPFS r31 - + #ifdef DOUBLE #define RPREFETCH (16 * 2 + 8) #else @@ -148,16 +148,16 @@ ;; stf.spill [r8] = f16, 32 stf.spill [r9] = f17, 32 - ;; + ;; stf.spill [r8] = f18, 32 stf.spill [r9] = f19, 32 - ;; + ;; stf.spill [r8] = f20, 32 stf.spill [r9] = f21, 32 ;; stf.spill [r8] = f22 stf.spill [r9] = f23 - ;; + ;; ld8 INCX = [r14] ld8 Y = [r15] ld8 INCY = [r16] @@ -2277,15 +2277,15 @@ ldf.fill f16 = [SP], 32 ldf.fill f17 = [r9], 32 mov ar.lc = ARLC - ;; + ;; ldf.fill f18 = [SP], 32 ldf.fill f19 = [r9], 32 mov pr = PR, -1 - ;; + ;; ldf.fill f20 = [SP], 32 ldf.fill f21 = [r9], 32 mov ar.pfs = ARPFS - ;; + ;; ldf.fill f22 = [SP], 32 ldf.fill f23 = [r9] br.ret.sptk.many b0 diff --git a/kernel/ia64/zgemv_t.S b/kernel/ia64/zgemv_t.S index 73e6df04b..831bc5013 100644 --- a/kernel/ia64/zgemv_t.S +++ b/kernel/ia64/zgemv_t.S @@ -81,7 +81,7 @@ #define CLD2 loc13 #define CST1 loc14 #define CST2 loc15 - + #define PREB r8 #define WPRE r9 #define OFFSET PREB @@ -91,7 +91,7 @@ #define ARLC r29 #define PR r30 #define ARPFS r31 - + #ifdef DOUBLE #define RPREFETCH (16 * 2 + 8) #else @@ -145,7 +145,7 @@ stf.spill [r9] = f17, 32 mov PR = pr } - ;; + ;; { .mmi stf.spill [r8] = f18, 32 stf.spill [r9] = f19, 32 @@ -164,7 +164,7 @@ adds r17 = 168, SP } .body - ;; + ;; { .mmf ld8 INCX = [r14] ld8 Y = [r15] @@ -2001,15 +2001,15 @@ ldf.fill f16 = [SP], 32 ldf.fill f17 = [r9], 32 mov ar.lc = ARLC - ;; + ;; ldf.fill f18 = [SP], 32 ldf.fill f19 = [r9], 32 mov pr = PR, -1 - ;; + ;; ldf.fill f20 = [SP], 32 ldf.fill f21 = [r9], 32 mov ar.pfs = ARPFS - ;; + ;; ldf.fill f22 = [SP], 32 ldf.fill f23 = [r9] br.ret.sptk.many b0 diff --git a/kernel/ia64/zscal.S b/kernel/ia64/zscal.S index e97fedaee..1acc0ed4f 100644 --- a/kernel/ia64/zscal.S +++ b/kernel/ia64/zscal.S @@ -58,7 +58,7 @@ #define X1 r37 #define INCX r38 #endif - + #define X2 r16 #define Y1 r17 #define INCX3 r18 diff --git a/kernel/ia64/zswap.S b/kernel/ia64/zswap.S index 8251b14a8..165f3872d 100644 --- a/kernel/ia64/zswap.S +++ b/kernel/ia64/zswap.S @@ -93,7 +93,7 @@ cmp.gt p15, p0 = r0, N (p15) br.ret.sptk.many b0 } - ;; + ;; #ifdef XDOUBLE { .mmi ld8 X = [r14] @@ -152,7 +152,7 @@ ;; { .mmi adds PRE1 = PREFETCH_SIZE * SIZE, X - adds PRE2 = PREFETCH_SIZE * SIZE, Y + adds PRE2 = PREFETCH_SIZE * SIZE, Y mov ar.lc = I } { .mib @@ -323,7 +323,7 @@ (p16) LDFD f125 = [Y], INCYM1 (p18) add YY = YY, INCYM1 } - { .mmb + { .mmb (p16) lfetch.excl.nt1 [PRE1], INCX8 (p16) lfetch.excl.nt1 [PRE2], INCY8 br.ctop.sptk.few .L52 diff --git a/kernel/ia64/ztrsm_kernel_LN.S b/kernel/ia64/ztrsm_kernel_LN.S index ef903e35a..c8461a20f 100644 --- a/kernel/ia64/ztrsm_kernel_LN.S +++ b/kernel/ia64/ztrsm_kernel_LN.S @@ -362,7 +362,7 @@ } ;; { .mfi - (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE mov f120 = f0 mov ar.lc = L } @@ -566,7 +566,7 @@ ;; add AOFFSET = r2, AORIG shladd BOFFSET = r2, 2, B - ;; + ;; #endif #if defined(LN) || defined(LT) @@ -1009,7 +1009,7 @@ mov f82 = f0 tbit.z p12, p0 = L, 0 } - { .mfi + { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f83 = f0 shr L = L, 1 @@ -1031,7 +1031,7 @@ mov f114 = f0 mov ar.lc = L } - { .mfi + { .mfi adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET mov f115 = f0 nop __LINE__ @@ -1404,7 +1404,7 @@ ;; shladd AOFFSET = r2, 1, AORIG shladd BOFFSET = r2, 2, B - ;; + ;; #endif #if defined(LN) || defined(LT) @@ -2427,7 +2427,7 @@ FMA_B f99 = f34, f53, f99 // A3 * B6 nop __LINE__ } - ;; + ;; /* 12 */ { .mfb FMA f114 = f34, f54, f114 // A3 * B7 @@ -3072,7 +3072,7 @@ ;; shladd AOFFSET = r2, 2, AORIG shladd BOFFSET = r2, 2, B - ;; + ;; #endif #if defined(LN) || defined(LT) @@ -5364,7 +5364,7 @@ nop __LINE__ } ;; - { .mfi + { .mfi STFD [C2 ] = f87, SIZE mov f112 = f0 adds I = -1, I @@ -7542,7 +7542,7 @@ ;; add AOFFSET = r2, AORIG shladd BOFFSET = r2, 1, B - ;; + ;; #endif #if defined(LN) || defined(LT) @@ -8003,7 +8003,7 @@ ;; shladd AOFFSET = r2, 1, AORIG shladd BOFFSET = r2, 1, B - ;; + ;; #endif #if defined(LN) || defined(LT) @@ -8787,7 +8787,7 @@ ;; shladd AOFFSET = r2, 2, AORIG shladd BOFFSET = r2, 1, B - ;; + ;; #endif #if defined(LN) || defined(LT) @@ -9622,7 +9622,7 @@ ;; add AOFFSET = r2, AORIG add BOFFSET = r2, B - ;; + ;; #endif #if defined(LN) || defined(LT) @@ -9951,7 +9951,7 @@ ;; shladd AOFFSET = r2, 1, AORIG add BOFFSET = r2, B - ;; + ;; #endif #if defined(LN) || defined(LT) @@ -10432,7 +10432,7 @@ ;; shladd AOFFSET = r2, 2, AORIG add BOFFSET = r2, B - ;; + ;; #endif #if defined(LN) || defined(LT) diff --git a/kernel/ia64/ztrsm_kernel_LT.S b/kernel/ia64/ztrsm_kernel_LT.S index 6c7a8ca5b..88d69e23e 100644 --- a/kernel/ia64/ztrsm_kernel_LT.S +++ b/kernel/ia64/ztrsm_kernel_LT.S @@ -548,7 +548,7 @@ FMA_B f99 = f34, f53, f99 // A3 * B6 nop __LINE__ } - ;; + ;; /* 12 */ { .mfb FMA f114 = f34, f54, f114 // A3 * B7 @@ -1193,7 +1193,7 @@ ;; shladd AOFFSET = r2, 2, AORIG shladd BOFFSET = r2, 2, B - ;; + ;; #endif #if defined(LN) || defined(LT) @@ -3485,7 +3485,7 @@ nop __LINE__ } ;; - { .mfi + { .mfi STFD [C2 ] = f87, SIZE mov f112 = f0 adds I = -1, I @@ -5453,7 +5453,7 @@ mov f82 = f0 tbit.z p12, p0 = L, 0 } - { .mfi + { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f83 = f0 shr L = L, 1 @@ -5475,7 +5475,7 @@ mov f114 = f0 mov ar.lc = L } - { .mfi + { .mfi adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET mov f115 = f0 nop __LINE__ @@ -5848,7 +5848,7 @@ ;; shladd AOFFSET = r2, 1, AORIG shladd BOFFSET = r2, 2, B - ;; + ;; #endif #if defined(LN) || defined(LT) @@ -6687,7 +6687,7 @@ } ;; { .mfi - (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE mov f120 = f0 mov ar.lc = L } @@ -6891,7 +6891,7 @@ ;; add AOFFSET = r2, AORIG shladd BOFFSET = r2, 2, B - ;; + ;; #endif #if defined(LN) || defined(LT) @@ -7819,7 +7819,7 @@ ;; shladd AOFFSET = r2, 2, AORIG shladd BOFFSET = r2, 1, B - ;; + ;; #endif #if defined(LN) || defined(LT) @@ -8692,7 +8692,7 @@ ;; shladd AOFFSET = r2, 1, AORIG shladd BOFFSET = r2, 1, B - ;; + ;; #endif #if defined(LN) || defined(LT) @@ -9199,7 +9199,7 @@ ;; add AOFFSET = r2, AORIG shladd BOFFSET = r2, 1, B - ;; + ;; #endif #if defined(LN) || defined(LT) @@ -9750,7 +9750,7 @@ ;; shladd AOFFSET = r2, 2, AORIG add BOFFSET = r2, B - ;; + ;; #endif #if defined(LN) || defined(LT) @@ -10312,7 +10312,7 @@ ;; shladd AOFFSET = r2, 1, AORIG add BOFFSET = r2, B - ;; + ;; #endif #if defined(LN) || defined(LT) @@ -10662,7 +10662,7 @@ ;; add AOFFSET = r2, AORIG add BOFFSET = r2, B - ;; + ;; #endif #if defined(LN) || defined(LT) diff --git a/kernel/ia64/ztrsm_kernel_RT.S b/kernel/ia64/ztrsm_kernel_RT.S index 582e2e5bf..c1c0ffcd9 100644 --- a/kernel/ia64/ztrsm_kernel_RT.S +++ b/kernel/ia64/ztrsm_kernel_RT.S @@ -550,7 +550,7 @@ ;; shladd AOFFSET = r2, 2, AORIG add BOFFSET = r2, B - ;; + ;; #endif #if defined(LN) || defined(LT) @@ -1112,7 +1112,7 @@ ;; shladd AOFFSET = r2, 1, AORIG add BOFFSET = r2, B - ;; + ;; #endif #if defined(LN) || defined(LT) @@ -1462,7 +1462,7 @@ ;; add AOFFSET = r2, AORIG add BOFFSET = r2, B - ;; + ;; #endif #if defined(LN) || defined(LT) @@ -2133,7 +2133,7 @@ ;; shladd AOFFSET = r2, 2, AORIG shladd BOFFSET = r2, 1, B - ;; + ;; #endif #if defined(LN) || defined(LT) @@ -3006,7 +3006,7 @@ ;; shladd AOFFSET = r2, 1, AORIG shladd BOFFSET = r2, 1, B - ;; + ;; #endif #if defined(LN) || defined(LT) @@ -3513,7 +3513,7 @@ ;; add AOFFSET = r2, AORIG shladd BOFFSET = r2, 1, B - ;; + ;; #endif #if defined(LN) || defined(LT) @@ -4064,7 +4064,7 @@ FMA_B f99 = f34, f53, f99 // A3 * B6 nop __LINE__ } - ;; + ;; /* 12 */ { .mfb FMA f114 = f34, f54, f114 // A3 * B7 @@ -4709,7 +4709,7 @@ ;; shladd AOFFSET = r2, 2, AORIG shladd BOFFSET = r2, 2, B - ;; + ;; #endif #if defined(LN) || defined(LT) @@ -7001,7 +7001,7 @@ nop __LINE__ } ;; - { .mfi + { .mfi STFD [C2 ] = f87, SIZE mov f112 = f0 adds I = -1, I @@ -8969,7 +8969,7 @@ mov f82 = f0 tbit.z p12, p0 = L, 0 } - { .mfi + { .mfi (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE mov f83 = f0 shr L = L, 1 @@ -8991,7 +8991,7 @@ mov f114 = f0 mov ar.lc = L } - { .mfi + { .mfi adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET mov f115 = f0 nop __LINE__ @@ -9364,7 +9364,7 @@ ;; shladd AOFFSET = r2, 1, AORIG shladd BOFFSET = r2, 2, B - ;; + ;; #endif #if defined(LN) || defined(LT) @@ -10203,7 +10203,7 @@ } ;; { .mfi - (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE mov f120 = f0 mov ar.lc = L } @@ -10407,7 +10407,7 @@ ;; add AOFFSET = r2, AORIG shladd BOFFSET = r2, 2, B - ;; + ;; #endif #if defined(LN) || defined(LT) diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3A index fc247e473..2d03ad7fa 100644 --- a/kernel/mips64/KERNEL.LOONGSON3A +++ b/kernel/mips64/KERNEL.LOONGSON3A @@ -11,7 +11,7 @@ ZGEMVNKERNEL = zgemv_n_loongson3a.c ZGEMVTKERNEL = zgemv_t_loongson3a.c -SGEMMKERNEL = sgemm_kernel_8x4_ps.S +SGEMMKERNEL = sgemm_kernel_8x4_ps.S SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c diff --git a/kernel/mips64/KERNEL.LOONGSON3B b/kernel/mips64/KERNEL.LOONGSON3B index df4380d11..e476c631e 100644 --- a/kernel/mips64/KERNEL.LOONGSON3B +++ b/kernel/mips64/KERNEL.LOONGSON3B @@ -15,13 +15,13 @@ DTRMMKERNEL = ../generic/trmmkernel_2x2.c CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMKERNEL = ../generic/gemmkernel_2x2.c SGEMMONCOPY = ../generic/gemm_ncopy_2.c SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o -DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMKERNEL = ../generic/gemmkernel_2x2.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPYOBJ = dgemm_oncopy.o diff --git a/kernel/mips64/amax.S b/kernel/mips64/amax.S index 30c35ba47..4467879d0 100644 --- a/kernel/mips64/amax.S +++ b/kernel/mips64/amax.S @@ -42,7 +42,7 @@ #define N $4 #define X $5 #define INCX $6 - + #define I $2 #define TEMP $3 @@ -66,7 +66,7 @@ #define s4 $f3 PROLOGUE - + #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) diff --git a/kernel/mips64/amin.S b/kernel/mips64/amin.S index 47108b1e4..c7d41a175 100644 --- a/kernel/mips64/amin.S +++ b/kernel/mips64/amin.S @@ -42,7 +42,7 @@ #define N $4 #define X $5 #define INCX $6 - + #define I $2 #define TEMP $3 @@ -66,7 +66,7 @@ #define s4 $f3 PROLOGUE - + #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) diff --git a/kernel/mips64/asum.S b/kernel/mips64/asum.S index 447c2f73d..2bf95c65d 100644 --- a/kernel/mips64/asum.S +++ b/kernel/mips64/asum.S @@ -42,7 +42,7 @@ #define N $4 #define X $5 #define INCX $6 - + #define I $2 #define TEMP $3 @@ -64,7 +64,7 @@ #define s2 $f1 PROLOGUE - + #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) diff --git a/kernel/mips64/axpy.S b/kernel/mips64/axpy.S index f7d888743..32694a99d 100644 --- a/kernel/mips64/axpy.S +++ b/kernel/mips64/axpy.S @@ -78,7 +78,7 @@ #define t4 $f21 PROLOGUE - + #ifndef __64BIT__ daddiu $sp, $sp, -16 sdc1 $f20, 0($sp) diff --git a/kernel/mips64/axpy_loongson3a.S b/kernel/mips64/axpy_loongson3a.S index 2e9361241..801885e7e 100644 --- a/kernel/mips64/axpy_loongson3a.S +++ b/kernel/mips64/axpy_loongson3a.S @@ -13,19 +13,19 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ @@ -71,9 +71,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ASSEMBLER #include "common.h" - + #define PREFETCH_DISTANCE 48 - + #define N $4 #define X $8 @@ -113,7 +113,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define t4 $f21 PROLOGUE - + #ifndef __64BIT__ daddiu $sp, $sp, -16 sdc1 $f20, 0($sp) @@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LD a7, 6 * SIZE(X) LD a8, 7 * SIZE(X) - + LD b1, 0 * SIZE(Y) LD b2, 1 * SIZE(Y) LD b3, 2 * SIZE(Y) @@ -152,7 +152,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LD b6, 5 * SIZE(Y) LD b7, 6 * SIZE(Y) LD b8, 7 * SIZE(Y) - + blez I, .L13 NOP .align 5 @@ -160,17 +160,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L12: PREFETCHD(PREFETCH_DISTANCE*SIZE(X)) PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X)) - - MADD t1, b1, ALPHA, a1 + + MADD t1, b1, ALPHA, a1 MADD t2, b2, ALPHA, a2 LD b1, 8 * SIZE(Y) LD b2, 9 * SIZE(Y) - + MADD t3, b3, ALPHA, a3 MADD t4, b4, ALPHA, a4 LD b3, 10 * SIZE(Y) LD b4, 11 * SIZE(Y) - + LD a1, 8 * SIZE(X) LD a2, 9 * SIZE(X) LD a3, 10 * SIZE(X) @@ -190,12 +190,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MADD t2, b6, ALPHA, a6 LD b5, 12 * SIZE(Y) LD b6, 13 * SIZE(Y) - + MADD t3, b7, ALPHA, a7 MADD t4, b8, ALPHA, a8 LD b7, 14 * SIZE(Y) - LD b8, 15 * SIZE(Y) - + LD b8, 15 * SIZE(Y) + LD a5, 12 * SIZE(X) LD a6, 13 * SIZE(X) LD a7, 14 * SIZE(X) diff --git a/kernel/mips64/cgemm_kernel_loongson3a_2x2.S b/kernel/mips64/cgemm_kernel_loongson3a_2x2.S index 5ded7aed0..675cad054 100644 --- a/kernel/mips64/cgemm_kernel_loongson3a_2x2.S +++ b/kernel/mips64/cgemm_kernel_loongson3a_2x2.S @@ -144,7 +144,7 @@ #endif PROLOGUE - + LDARG LDC, 0($sp) daddiu $sp, $sp, -STACKSIZE @@ -190,7 +190,7 @@ move KK, OFFSET #endif - daddiu J, J, -1 + daddiu J, J, -1 dsra I, M, 1 # I=M/2 dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 @@ -228,7 +228,7 @@ LD a3, 2 * SIZE(AO) MOV c22, c11 LD a4, 3 * SIZE(AO) - + MOV c23, c11 LD b3, 2 * SIZE(BO) MOV c24, c11 @@ -241,7 +241,7 @@ FETCH $0, 0 * SIZE(CO1) MOV c33, c11 MOV c34, c11 - + FETCH $0, 4 * SIZE(CO2) MOV c41, c11 MOV c42, c11 @@ -264,7 +264,7 @@ #else - dsra L, K, 2 # Unroll K 4 times + dsra L, K, 2 # Unroll K 4 times move BO, B MTC $0, c11 # Clear results regs @@ -281,7 +281,7 @@ LD a3, 2 * SIZE(AO) MOV c22, c11 LD a4, 3 * SIZE(AO) - + MOV c23, c11 LD b3, 2 * SIZE(BO) MOV c24, c11 @@ -294,7 +294,7 @@ MOV c33, c11 MOV c34, c11 FETCH $0, 0 * SIZE(CO1) - + MOV c41, c11 MOV c42, c11 FETCH $0, 4 * SIZE(CO2) @@ -313,7 +313,7 @@ .L12: LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd LD b5, 4 * SIZE(BO) @@ -346,7 +346,7 @@ LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) - MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd LD b1, 8 * SIZE(BO) @@ -355,7 +355,7 @@ MADD4 c14, c14, a6, b6 # bxd LD a3, 10 * SIZE(AO) - LD a4, 11 * SIZE(AO) + LD a4, 11 * SIZE(AO) MADD1 c21, c21, a7, b5 # A2xB1 MADD3 c23, c23, a7, b6 @@ -379,7 +379,7 @@ LD a5, 12 * SIZE(AO) LD a6, 13 * SIZE(AO) - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd LD b5, 12 * SIZE(BO) @@ -418,7 +418,7 @@ LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) - MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd LD b1, 0 * SIZE(BO) @@ -469,17 +469,17 @@ .L16: daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd - daddiu PREA, PREA, 4 * SIZE - daddiu PREB, PREB, 4 * SIZE + daddiu PREA, PREA, 4 * SIZE + daddiu PREB, PREB, 4 * SIZE MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 - + MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 @@ -624,9 +624,9 @@ #endif dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 - daddiu CO1,CO1, 4 * SIZE + daddiu CO1,CO1, 4 * SIZE bgtz I, .L11 - daddiu CO2,CO2, 4 * SIZE + daddiu CO2,CO2, 4 * SIZE .align 5 .L30: @@ -652,7 +652,7 @@ LD a2, 1 * SIZE(AO) MTC $0, c11 # Clear results regs MOV c12, c11 - + LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MOV c13, c11 @@ -676,7 +676,7 @@ dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 1 # MR=1 -#else +#else daddiu TEMP, KK, 2 # NR=2 #endif dsra L, TEMP, 2 @@ -687,14 +687,14 @@ LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) - dsra L, K, 2 # Unroll K 4 times + dsra L, K, 2 # Unroll K 4 times move BO, B LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MTC $0, c11 # Clear results regs MOV c12, c11 - + LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MOV c13, c11 @@ -719,19 +719,19 @@ .L32: LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd - + LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 - + FETCH $0, 4 * SIZE(PREB) MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 @@ -739,14 +739,14 @@ LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) - MADD1 c11, c11, a3, b5 # axc A1xB1 + MADD1 c11, c11, a3, b5 # axc A1xB1 MADD3 c13, c13, a3, b6 # axd LD b1, 8 * SIZE(BO) LD b2, 9 * SIZE(BO) MADD2 c12, c12, a4, b5 # bxc MADD4 c14, c14, a4, b6 # bxd - + LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) MADD1 c31, c31, a3, b7 # A1xB2 @@ -759,7 +759,7 @@ LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) - MADD1 c11, c11, a5, b1 # axc A1xB1 + MADD1 c11, c11, a5, b1 # axc A1xB1 MADD3 c13, c13, a5, b2 # axd LD b5, 12 * SIZE(BO) @@ -782,7 +782,7 @@ LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) - MADD1 c11, c11, a7, b5 # axc A1xB1 + MADD1 c11, c11, a7, b5 # axc A1xB1 MADD3 c13, c13, a7, b6 # axd LD b1, 0 * SIZE(BO) @@ -818,7 +818,7 @@ .L36: daddiu L, L, -1 - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx @@ -828,8 +828,8 @@ daddiu AO, AO, 2 * SIZE # 2mr*1kr*cmpx MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 - - daddiu PREB, PREB, 4 * SIZE + + daddiu PREB, PREB, 4 * SIZE MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 @@ -873,8 +873,8 @@ ST a3, 0 * SIZE(CO2) ST a4, 1 * SIZE(CO2) - daddiu CO1,CO1, 2 * SIZE - daddiu CO2,CO2, 2 * SIZE + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE #else ADD c11, c14, c11 @@ -901,8 +901,8 @@ ST a3, 0 * SIZE(CO2) ST a4, 1 * SIZE(CO2) - daddiu CO1,CO1, 2 * SIZE - daddiu CO2,CO2, 2 * SIZE + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) @@ -935,7 +935,7 @@ move B, BO .align 5 - + .L20: andi J, N, 1 blez J, .L999 @@ -998,7 +998,7 @@ NOP #else - dsra L, K, 2 # Unroll K 4 times + dsra L, K, 2 # Unroll K 4 times move BO, B LD a1, 0 * SIZE(AO) @@ -1032,7 +1032,7 @@ .L22: LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd LD b3, 2 * SIZE(BO) @@ -1044,14 +1044,14 @@ LD a8, 7 * SIZE(AO) MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 - + FETCH $0, 4 * SIZE(PREA) MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) - MADD1 c11, c11, a5, b3 # axc A1xB1 + MADD1 c11, c11, a5, b3 # axc A1xB1 MADD3 c13, c13, a5, b4 # axd LD b5, 4 * SIZE(BO) @@ -1071,7 +1071,7 @@ LD a5, 12 * SIZE(AO) LD a6, 13 * SIZE(AO) - MADD1 c11, c11, a1, b5 # axc A1xB1 + MADD1 c11, c11, a1, b5 # axc A1xB1 MADD3 c13, c13, a1, b6 # axd LD b7, 6 * SIZE(BO) @@ -1090,11 +1090,11 @@ FETCH $0, 12 * SIZE(PREA) MADD2 c22, c22, a4, b5 MADD4 c24, c24, a4, b6 - daddiu PREA, PREA, 16 * SIZE + daddiu PREA, PREA, 16 * SIZE LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) - MADD1 c11, c11, a5, b7 # axc A1xB1 + MADD1 c11, c11, a5, b7 # axc A1xB1 MADD3 c13, c13, a5, b8 # axd LD b1, 0 * SIZE(BO) @@ -1127,7 +1127,7 @@ .L26: daddiu L, L, -1 - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd daddiu BO, BO, 2 * SIZE # 2nr*1kr*cmpx @@ -1224,7 +1224,7 @@ daddiu KK, KK, 2 #endif #endif - daddiu CO1,CO1, 4 * SIZE + daddiu CO1,CO1, 4 * SIZE bgtz I, .L21 NOP @@ -1270,7 +1270,7 @@ NOP #else - dsra L, K, 2 # Unroll K 4 times + dsra L, K, 2 # Unroll K 4 times move BO, B # gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 @@ -1297,7 +1297,7 @@ # gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd # gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 @@ -1306,27 +1306,27 @@ MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd -# gsLQC1(R12, F9, F8, 2) # Unroll K=1 +# gsLQC1(R12, F9, F8, 2) # Unroll K=1 LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) - MADD1 c11, c11, a3, b3 # axc A1xB1 + MADD1 c11, c11, a3, b3 # axc A1xB1 MADD3 c13, c13, a3, b4 # axd -# gsLQC1(R13, F13, F12, 2) +# gsLQC1(R13, F13, F12, 2) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) MADD2 c12, c12, a4, b3 # bxc MADD4 c14, c14, a4, b4 # bxd -# gsLQC1(R12, F11, F10, 3) +# gsLQC1(R12, F11, F10, 3) LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) - MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd daddiu L, L, -1 -# gsLQC1(R13, F16, F15, 3) +# gsLQC1(R13, F16, F15, 3) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD2 c12, c12, a6, b5 # bxc @@ -1338,7 +1338,7 @@ # gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) - MADD1 c11, c11, a7, b7 # axc A1xB1 + MADD1 c11, c11, a7, b7 # axc A1xB1 MADD3 c13, c13, a7, b8 # axd # gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 @@ -1369,7 +1369,7 @@ daddiu BO, BO, 1 * SIZE * COMPSIZE # 2nr*1kr*cmpx daddiu AO, AO, 1 * SIZE * COMPSIZE # 2mr*1kr*cmpx - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd @@ -1432,7 +1432,7 @@ daddiu KK, KK, 1 #endif - daddiu CO1,CO1, 2 * SIZE + daddiu CO1,CO1, 2 * SIZE #endif diff --git a/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S b/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S index e78ad209f..489b12445 100644 --- a/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S +++ b/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S @@ -1,4 +1,4 @@ -##define REALNAME gemm +##define REALNAME gemm #define ASSEMBLER #include "common.h" @@ -77,7 +77,7 @@ #define F27 27 #define F26 26 #define F25 25 -#define F24 24 +#define F24 24 #define F23 23 #define F22 22 #define F21 21 @@ -85,7 +85,7 @@ #define F19 19 #define F18 18 #define F17 17 -#define F16 16 +#define F16 16 #define F15 15 #define F14 14 #define F13 13 @@ -97,10 +97,10 @@ #define F7 7 #define F6 6 #define F5 5 -#define F4 4 -#define F3 3 -#define F2 2 -#define F1 1 +#define F4 4 +#define F3 3 +#define F2 2 +#define F1 1 #define F0 0 #define R12 12 @@ -195,12 +195,12 @@ daddu BO, B, TEMP #endif MTC $0, C11 # CLEAR REAULTS REGISTERS - MOV C12, C11 + MOV C12, C11 dsll PREB, K, ZBASE_SHIFT MOV C21, C11 MOV C22, C11 - + gsLQC1(R13, F9, F8, 0) # B1 B2 MOV C31, C11 MOV C32, C11 @@ -218,7 +218,7 @@ MOV C33, C11 MOV C34, C11 - + MOV C43, C11 MOV C44, C11 @@ -246,12 +246,12 @@ move BO, B # Reset B dsra L, K, 2 # UnRoll K=64 MTC $0, C11 # CLEAR REAULTS REGISTERS - MOV C12, C11 + MOV C12, C11 dsll PREB, K, ZBASE_SHIFT MOV C21, C11 MOV C22, C11 - + gsLQC1(R13, F9, F8, 0) # B1 B2 MOV C31, C11 MOV C32, C11 @@ -278,7 +278,7 @@ PLU B3, B1, B1 PLU B4, B2, B2 - + FETCH $0, 8 * SIZE(CO1) blez L, .L242 FETCH $0, 8 * SIZE(CO2) @@ -349,7 +349,7 @@ MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 - + gsLQC1(R13, F13, F12, 3) # B3 B4 MADPS C11, C11, A1, B1 MADPS C21, C21, A2, B1 @@ -488,7 +488,7 @@ MADPS C34, C34, A7, B8 MADPS C44, C44, A8, B8 - + .align 4 .L247: #ifndef TRMMKERNEL @@ -644,7 +644,7 @@ MADD C11, C11, C12, A2 MADD C21, C21, C22, A2 - + MADD C31, C31, C32, A2 MADD C41, C41, C42, A2 @@ -748,7 +748,7 @@ MADD C11, C11, C12, A2 MADD C21, C21, C22, A2 - + MADD C31, C31, C32, A2 MADD C41, C41, C42, A2 @@ -853,7 +853,7 @@ MADD C11, C11, C12, A2 MADD C21, C21, C22, A2 - + MADD C31, C31, C32, A2 MADD C41, C41, C42, A2 @@ -1045,7 +1045,7 @@ MADD B4, B4, C21, A2 MADD B6, B6, C31, A2 MADD B8, B8, C41, A2 - + ST B1, 0 * SIZE(CO1) MUL C13, C12, A1 MUL C23, C22, A1 @@ -1073,7 +1073,7 @@ ST B6, 5 * SIZE(CO1) MADD C11, C11, C12, A2 MADD C21, C21, C22, A2 - + ST B8, 7 * SIZE(CO1) MADD C31, C31, C32, A2 MADD C41, C41, C42, A2 @@ -1391,7 +1391,7 @@ daddu BO, B, TEMP #endif MTC $0, C11 # CLEAR REAULTS REGISTERS - MOV C12, C11 + MOV C12, C11 MOV C21, C11 MOV C22, C11 @@ -1406,7 +1406,7 @@ FETCH $0, 8 * SIZE(CO1) MOV C24, C11 - + FETCH $0, 0 * SIZE(CO2) FETCH $0, 8 * SIZE(CO2) @@ -1416,7 +1416,7 @@ dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 2 # MR=2 -#else +#else daddiu TEMP, KK, 2 # NR=2 #endif dsra L, TEMP, 2 @@ -1428,7 +1428,7 @@ dsra L, K, 2 # UnRoll K=64 MTC $0, C11 # CLEAR REAULTS REGISTERS - MOV C12, C11 + MOV C12, C11 MOV C21, C11 MOV C22, C11 @@ -1443,7 +1443,7 @@ FETCH $0, 8 * SIZE(CO1) MOV C24, C11 - + FETCH $0, 0 * SIZE(CO2) FETCH $0, 8 * SIZE(CO2) @@ -1665,7 +1665,7 @@ MADD B6, B6, C12, A2 MADD B8, B8, C22, A2 - + ST B5, 0 * SIZE(CO2) ST B7, 2 * SIZE(CO2) ST B6, 1 * SIZE(CO2) @@ -1723,7 +1723,7 @@ MADD B6, B6, C12, A2 MADD B8, B8, C22, A2 - + ST B5, 0 * SIZE(CO2) ST B7, 2 * SIZE(CO2) ST B6, 1 * SIZE(CO2) @@ -1745,7 +1745,7 @@ ADD C22, A6, C22 SUB C14, C14, A7 SUB C24, C24, A8 - + LD B1, 0 * SIZE(CO1) LD B3, 2 * SIZE(CO1) LD B2, 1 * SIZE(CO1) @@ -1782,7 +1782,7 @@ MADD B6, B6, C12, A2 MADD B8, B8, C22, A2 - + ST B5, 0 * SIZE(CO2) ST B7, 2 * SIZE(CO2) ST B6, 1 * SIZE(CO2) @@ -1910,7 +1910,7 @@ MADD B6, B6, C12, A2 MADD B8, B8, C22, A2 - + ST B5, 0 * SIZE(CO2) ST B7, 2 * SIZE(CO2) ST B6, 1 * SIZE(CO2) @@ -1958,7 +1958,7 @@ MADD B6, B6, C12, A2 MADD B8, B8, C22, A2 - + ST B5, 0 * SIZE(CO2) ST B7, 2 * SIZE(CO2) ST B6, 1 * SIZE(CO2) @@ -1980,7 +1980,7 @@ ADD C22, A6, C22 SUB C14, C14, A7 SUB C24, C24, A8 - + MUL B1, C11, A1 # A1 = alpha_r MUL B3, C21, A1 MUL B2, C13, A1 @@ -2007,7 +2007,7 @@ MADD B6, B6, C12, A2 MADD B8, B8, C22, A2 - + ST B5, 0 * SIZE(CO2) ST B7, 2 * SIZE(CO2) ST B6, 1 * SIZE(CO2) @@ -2109,7 +2109,7 @@ daddu BO, B, TEMP #endif MTC $0, C11 # CLEAR REAULTS REGISTERS - MOV C12, C11 + MOV C12, C11 gsLQC1(R13, F9, F8, 0) # B1 B2 gsLQC1(R12, F1, F0, 0) # A1 A2 @@ -2125,7 +2125,7 @@ dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 1 # MR=1 -#else +#else daddiu TEMP, KK, 2 # NR=2 #endif dsra L, TEMP, 2 @@ -2137,7 +2137,7 @@ dsra L, K, 2 # UnRoll K=64 MTC $0, C11 # CLEAR REAULTS REGISTERS - MOV C12, C11 + MOV C12, C11 gsLQC1(R13, F9, F8, 0) # B1 B2 gsLQC1(R12, F1, F0, 0) # A1 A2 @@ -2290,7 +2290,7 @@ NMSUB B5, B5, C14, A2 MADD B6, B6, C12, A2 - + ST B5, 0 * SIZE(CO2) ST B6, 1 * SIZE(CO2) #endif @@ -2324,7 +2324,7 @@ NMSUB B5, B5, C14, A2 MADD B6, B6, C12, A2 - + ST B5, 0 * SIZE(CO2) ST B6, 1 * SIZE(CO2) @@ -2340,7 +2340,7 @@ LD A2, 160($sp) # load alpha_i ADD C12, A5, C12 SUB C14, C14, A7 - + LD B1, 0 * SIZE(CO1) LD B2, 1 * SIZE(CO1) @@ -2359,7 +2359,7 @@ NMSUB B5, B5, C14, A2 MADD B6, B6, C12, A2 - + ST B5, 0 * SIZE(CO2) ST B6, 1 * SIZE(CO2) #endif @@ -2429,7 +2429,7 @@ NMSUB B5, B5, C14, A2 MADD B6, B6, C12, A2 - + ST B5, 0 * SIZE(CO2) ST B6, 1 * SIZE(CO2) #endif @@ -2457,7 +2457,7 @@ NMSUB B5, B5, C14, A2 MADD B6, B6, C12, A2 - + ST B5, 0 * SIZE(CO2) ST B6, 1 * SIZE(CO2) @@ -2473,7 +2473,7 @@ LD A2, 160($sp) # load alpha_i ADD C12, A5, C12 SUB C14, C14, A7 - + MUL B1, C11, A4 # A1 = alpha_r MUL B2, C13, A4 NMSUB B1, B1, C13, A2 # A2 = alpha_i @@ -2486,7 +2486,7 @@ NMSUB B5, B5, C14, A2 MADD B6, B6, C12, A2 - + ST B5, 0 * SIZE(CO2) ST B6, 1 * SIZE(CO2) #endif @@ -2679,7 +2679,7 @@ PLU B7, B5, B5 PLU B8, B6, B6 - + MADPS C11, C11, A1, B5 MADPS C21, C21, A2, B5 gsLQC1(R12, F5, F4, 6) # A5 A6 @@ -2757,7 +2757,7 @@ MADPS C43, C43, A8, B4 PLU B3, B1, B1 - + .align 4 .L147: #ifndef TRMMKERNEL @@ -3274,7 +3274,7 @@ FETCH $0, 0 * SIZE(CO1) FETCH $0, 8 * SIZE(CO1) - + PLU B3, B1, B1 PLU B4, B2, B2 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) @@ -3302,7 +3302,7 @@ FETCH $0, 0 * SIZE(CO1) FETCH $0, 8 * SIZE(CO1) - + PLU B3, B1, B1 blez L, .L122 PLU B4, B2, B2 @@ -3483,7 +3483,7 @@ LD A1, 152($sp) # load alpha_r # LD A2, 0 * SIZE(A) # load alpha_r LD A2, 160($sp) # load alpha_i - + LD B1, 0 * SIZE(CO1) LD B3, 2 * SIZE(CO1) LD B2, 1 * SIZE(CO1) @@ -3609,7 +3609,7 @@ LD A1, 152($sp) # load alpha_r # LD A2, 0 * SIZE(A) # load alpha_r LD A2, 160($sp) # load alpha_i - + MUL B1, C11, A1 # A1 = alpha_r MUL B3, C21, A1 MUL B2, C13, A1 @@ -3854,7 +3854,7 @@ SUB C13, C13, A3 # ad'+'cb LD A4, 152($sp) # load alpha_r LD A2, 160($sp) # load alpha_i - + LD B1, 0 * SIZE(CO1) LD B2, 1 * SIZE(CO1) diff --git a/kernel/mips64/cgemm_kernel_loongson3b_2x2.S b/kernel/mips64/cgemm_kernel_loongson3b_2x2.S index 5ded7aed0..675cad054 100644 --- a/kernel/mips64/cgemm_kernel_loongson3b_2x2.S +++ b/kernel/mips64/cgemm_kernel_loongson3b_2x2.S @@ -144,7 +144,7 @@ #endif PROLOGUE - + LDARG LDC, 0($sp) daddiu $sp, $sp, -STACKSIZE @@ -190,7 +190,7 @@ move KK, OFFSET #endif - daddiu J, J, -1 + daddiu J, J, -1 dsra I, M, 1 # I=M/2 dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 @@ -228,7 +228,7 @@ LD a3, 2 * SIZE(AO) MOV c22, c11 LD a4, 3 * SIZE(AO) - + MOV c23, c11 LD b3, 2 * SIZE(BO) MOV c24, c11 @@ -241,7 +241,7 @@ FETCH $0, 0 * SIZE(CO1) MOV c33, c11 MOV c34, c11 - + FETCH $0, 4 * SIZE(CO2) MOV c41, c11 MOV c42, c11 @@ -264,7 +264,7 @@ #else - dsra L, K, 2 # Unroll K 4 times + dsra L, K, 2 # Unroll K 4 times move BO, B MTC $0, c11 # Clear results regs @@ -281,7 +281,7 @@ LD a3, 2 * SIZE(AO) MOV c22, c11 LD a4, 3 * SIZE(AO) - + MOV c23, c11 LD b3, 2 * SIZE(BO) MOV c24, c11 @@ -294,7 +294,7 @@ MOV c33, c11 MOV c34, c11 FETCH $0, 0 * SIZE(CO1) - + MOV c41, c11 MOV c42, c11 FETCH $0, 4 * SIZE(CO2) @@ -313,7 +313,7 @@ .L12: LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd LD b5, 4 * SIZE(BO) @@ -346,7 +346,7 @@ LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) - MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd LD b1, 8 * SIZE(BO) @@ -355,7 +355,7 @@ MADD4 c14, c14, a6, b6 # bxd LD a3, 10 * SIZE(AO) - LD a4, 11 * SIZE(AO) + LD a4, 11 * SIZE(AO) MADD1 c21, c21, a7, b5 # A2xB1 MADD3 c23, c23, a7, b6 @@ -379,7 +379,7 @@ LD a5, 12 * SIZE(AO) LD a6, 13 * SIZE(AO) - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd LD b5, 12 * SIZE(BO) @@ -418,7 +418,7 @@ LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) - MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd LD b1, 0 * SIZE(BO) @@ -469,17 +469,17 @@ .L16: daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd - daddiu PREA, PREA, 4 * SIZE - daddiu PREB, PREB, 4 * SIZE + daddiu PREA, PREA, 4 * SIZE + daddiu PREB, PREB, 4 * SIZE MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 - + MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 @@ -624,9 +624,9 @@ #endif dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 - daddiu CO1,CO1, 4 * SIZE + daddiu CO1,CO1, 4 * SIZE bgtz I, .L11 - daddiu CO2,CO2, 4 * SIZE + daddiu CO2,CO2, 4 * SIZE .align 5 .L30: @@ -652,7 +652,7 @@ LD a2, 1 * SIZE(AO) MTC $0, c11 # Clear results regs MOV c12, c11 - + LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MOV c13, c11 @@ -676,7 +676,7 @@ dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 1 # MR=1 -#else +#else daddiu TEMP, KK, 2 # NR=2 #endif dsra L, TEMP, 2 @@ -687,14 +687,14 @@ LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) - dsra L, K, 2 # Unroll K 4 times + dsra L, K, 2 # Unroll K 4 times move BO, B LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MTC $0, c11 # Clear results regs MOV c12, c11 - + LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MOV c13, c11 @@ -719,19 +719,19 @@ .L32: LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd - + LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 - + FETCH $0, 4 * SIZE(PREB) MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 @@ -739,14 +739,14 @@ LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) - MADD1 c11, c11, a3, b5 # axc A1xB1 + MADD1 c11, c11, a3, b5 # axc A1xB1 MADD3 c13, c13, a3, b6 # axd LD b1, 8 * SIZE(BO) LD b2, 9 * SIZE(BO) MADD2 c12, c12, a4, b5 # bxc MADD4 c14, c14, a4, b6 # bxd - + LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) MADD1 c31, c31, a3, b7 # A1xB2 @@ -759,7 +759,7 @@ LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) - MADD1 c11, c11, a5, b1 # axc A1xB1 + MADD1 c11, c11, a5, b1 # axc A1xB1 MADD3 c13, c13, a5, b2 # axd LD b5, 12 * SIZE(BO) @@ -782,7 +782,7 @@ LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) - MADD1 c11, c11, a7, b5 # axc A1xB1 + MADD1 c11, c11, a7, b5 # axc A1xB1 MADD3 c13, c13, a7, b6 # axd LD b1, 0 * SIZE(BO) @@ -818,7 +818,7 @@ .L36: daddiu L, L, -1 - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx @@ -828,8 +828,8 @@ daddiu AO, AO, 2 * SIZE # 2mr*1kr*cmpx MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 - - daddiu PREB, PREB, 4 * SIZE + + daddiu PREB, PREB, 4 * SIZE MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 @@ -873,8 +873,8 @@ ST a3, 0 * SIZE(CO2) ST a4, 1 * SIZE(CO2) - daddiu CO1,CO1, 2 * SIZE - daddiu CO2,CO2, 2 * SIZE + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE #else ADD c11, c14, c11 @@ -901,8 +901,8 @@ ST a3, 0 * SIZE(CO2) ST a4, 1 * SIZE(CO2) - daddiu CO1,CO1, 2 * SIZE - daddiu CO2,CO2, 2 * SIZE + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) @@ -935,7 +935,7 @@ move B, BO .align 5 - + .L20: andi J, N, 1 blez J, .L999 @@ -998,7 +998,7 @@ NOP #else - dsra L, K, 2 # Unroll K 4 times + dsra L, K, 2 # Unroll K 4 times move BO, B LD a1, 0 * SIZE(AO) @@ -1032,7 +1032,7 @@ .L22: LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd LD b3, 2 * SIZE(BO) @@ -1044,14 +1044,14 @@ LD a8, 7 * SIZE(AO) MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 - + FETCH $0, 4 * SIZE(PREA) MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) - MADD1 c11, c11, a5, b3 # axc A1xB1 + MADD1 c11, c11, a5, b3 # axc A1xB1 MADD3 c13, c13, a5, b4 # axd LD b5, 4 * SIZE(BO) @@ -1071,7 +1071,7 @@ LD a5, 12 * SIZE(AO) LD a6, 13 * SIZE(AO) - MADD1 c11, c11, a1, b5 # axc A1xB1 + MADD1 c11, c11, a1, b5 # axc A1xB1 MADD3 c13, c13, a1, b6 # axd LD b7, 6 * SIZE(BO) @@ -1090,11 +1090,11 @@ FETCH $0, 12 * SIZE(PREA) MADD2 c22, c22, a4, b5 MADD4 c24, c24, a4, b6 - daddiu PREA, PREA, 16 * SIZE + daddiu PREA, PREA, 16 * SIZE LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) - MADD1 c11, c11, a5, b7 # axc A1xB1 + MADD1 c11, c11, a5, b7 # axc A1xB1 MADD3 c13, c13, a5, b8 # axd LD b1, 0 * SIZE(BO) @@ -1127,7 +1127,7 @@ .L26: daddiu L, L, -1 - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd daddiu BO, BO, 2 * SIZE # 2nr*1kr*cmpx @@ -1224,7 +1224,7 @@ daddiu KK, KK, 2 #endif #endif - daddiu CO1,CO1, 4 * SIZE + daddiu CO1,CO1, 4 * SIZE bgtz I, .L21 NOP @@ -1270,7 +1270,7 @@ NOP #else - dsra L, K, 2 # Unroll K 4 times + dsra L, K, 2 # Unroll K 4 times move BO, B # gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 @@ -1297,7 +1297,7 @@ # gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd # gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 @@ -1306,27 +1306,27 @@ MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd -# gsLQC1(R12, F9, F8, 2) # Unroll K=1 +# gsLQC1(R12, F9, F8, 2) # Unroll K=1 LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) - MADD1 c11, c11, a3, b3 # axc A1xB1 + MADD1 c11, c11, a3, b3 # axc A1xB1 MADD3 c13, c13, a3, b4 # axd -# gsLQC1(R13, F13, F12, 2) +# gsLQC1(R13, F13, F12, 2) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) MADD2 c12, c12, a4, b3 # bxc MADD4 c14, c14, a4, b4 # bxd -# gsLQC1(R12, F11, F10, 3) +# gsLQC1(R12, F11, F10, 3) LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) - MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd daddiu L, L, -1 -# gsLQC1(R13, F16, F15, 3) +# gsLQC1(R13, F16, F15, 3) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD2 c12, c12, a6, b5 # bxc @@ -1338,7 +1338,7 @@ # gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) - MADD1 c11, c11, a7, b7 # axc A1xB1 + MADD1 c11, c11, a7, b7 # axc A1xB1 MADD3 c13, c13, a7, b8 # axd # gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 @@ -1369,7 +1369,7 @@ daddiu BO, BO, 1 * SIZE * COMPSIZE # 2nr*1kr*cmpx daddiu AO, AO, 1 * SIZE * COMPSIZE # 2mr*1kr*cmpx - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd @@ -1432,7 +1432,7 @@ daddiu KK, KK, 1 #endif - daddiu CO1,CO1, 2 * SIZE + daddiu CO1,CO1, 2 * SIZE #endif diff --git a/kernel/mips64/cnrm2.S b/kernel/mips64/cnrm2.S index dd8c21090..76fa9c295 100644 --- a/kernel/mips64/cnrm2.S +++ b/kernel/mips64/cnrm2.S @@ -42,7 +42,7 @@ #define N $4 #define X $5 #define INCX $6 - + #define I $2 #define TEMP $3 @@ -65,7 +65,7 @@ PROLOGUE - + #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) @@ -118,7 +118,7 @@ cvt.d.s t1, a5 NOP - + madd.d s2, s2, t2, t2 LD a2, 1 * SIZE(X) @@ -195,7 +195,7 @@ cvt.d.s t1, a1 cvt.d.s t2, a2 - + madd.d s1, s1, t1, t1 daddu X, X, INCX @@ -210,5 +210,5 @@ j $31 cvt.s.d s1, s1 - + EPILOGUE diff --git a/kernel/mips64/copy.S b/kernel/mips64/copy.S index 7942b1890..bf7f7c79a 100644 --- a/kernel/mips64/copy.S +++ b/kernel/mips64/copy.S @@ -44,7 +44,7 @@ #define INCX $6 #define Y $7 #define INCY $8 - + #define I $2 #define TEMP $3 @@ -58,7 +58,7 @@ #define a8 $f7 PROLOGUE - + #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) diff --git a/kernel/mips64/daxpy_loongson3a_simd.S b/kernel/mips64/daxpy_loongson3a_simd.S index 8f53441dc..880a67f02 100644 --- a/kernel/mips64/daxpy_loongson3a_simd.S +++ b/kernel/mips64/daxpy_loongson3a_simd.S @@ -13,19 +13,19 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ @@ -71,9 +71,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ASSEMBLER #include "common.h" - + #define PREFETCH_DISTANCE 2016 - + #define N $4 #define X $8 @@ -158,7 +158,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define X_BASE 8 #define Y_BASE 10 - + #define gsLQC1_(base,fq,ft,offset) .word (0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) #define gsLQC1(base,fq,ft,offset) gsLQC1_((base), (fq), (ft), (offset)) @@ -166,7 +166,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define gsSQC1(base,fq,ft,offset) gsSQC1_((base), (fq), (ft), (offset)) PROLOGUE - + #ifndef __64BIT__ daddiu $sp, $sp, -40 sdc1 $f20, 0($sp) @@ -185,7 +185,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - + li TEMP, SIZE blez N, .L999 @@ -196,9 +196,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bne INCY, TEMP, .L20 - //Dose the address of Y algin 16 bytes? + //Dose the address of Y algin 16 bytes? andi TEMP, Y, 8 - beq TEMP, $0, .L10 + beq TEMP, $0, .L10 //Y unalgin. Compute this unalgined element. LD a1, 0 * SIZE(X) LD b1, 0 * SIZE(Y) @@ -208,20 +208,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MADD t1, b1, ALPHA, a1 daddiu N, N, -1 - + ST t1, -1 * SIZE(Y) blez N, .L999 .align 5 - + .L10: dsra I, N, 4 blez I, .L15 daddiu I, I, -1 - + //Y algin. We need test X address - //Dose the address of X algin 16 bytes? + //Dose the address of X algin 16 bytes? andi TEMP, X, 8 bne TEMP, $0, .L30 /// .align 5 @@ -242,16 +242,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. gsLQC1(Y_BASE,B4,B3,1) gsLQC1(Y_BASE,B6,B5,2) gsLQC1(Y_BASE,B8,B7,3) - + blez I, .L13 NOP .align 5 .L12: - - MADD t1, b1, ALPHA, a1 + + MADD t1, b1, ALPHA, a1 MADD t2, b2, ALPHA, a2 - gsSQC1(Y_BASE, T2, T1, 0) + gsSQC1(Y_BASE, T2, T1, 0) gsLQC1(Y_BASE,B2,B1,4) MADD t3, b3, ALPHA, a3 @@ -264,7 +264,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MADD t1, b5, ALPHA, a5 MADD t2, b6, ALPHA, a6 - gsSQC1(Y_BASE, T2, T1, 2) + gsSQC1(Y_BASE, T2, T1, 2) gsLQC1(Y_BASE,B6,B5,6) MADD t3, b7, ALPHA, a7 @@ -275,9 +275,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(Y)) PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(Y)) - MADD t1, b1, ALPHA, a9 + MADD t1, b1, ALPHA, a9 MADD t2, b2, ALPHA, a10 - gsSQC1(Y_BASE, T2, T1, 4) + gsSQC1(Y_BASE, T2, T1, 4) gsLQC1(Y_BASE,B2,B1,8) MADD t3, b3, ALPHA, a11 @@ -288,16 +288,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PREFETCHD(PREFETCH_DISTANCE*SIZE(X)) PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X)) - MADD t1, b5, ALPHA, a13 + MADD t1, b5, ALPHA, a13 MADD t2, b6, ALPHA, a14 - gsSQC1(Y_BASE, T2, T1, 6) + gsSQC1(Y_BASE, T2, T1, 6) gsLQC1(Y_BASE,B6,B5,10) MADD t3, b7, ALPHA, a15 MADD t4, b8, ALPHA, a16 gsSQC1(Y_BASE, T4, T3, 7) gsLQC1(Y_BASE,B8,B7,11) - + PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(X)) PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(X)) @@ -314,7 +314,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. daddiu I, I, -1 daddiu Y, Y, 16 * SIZE - + daddiu X, X, 16 * SIZE bgtz I, .L12 @@ -322,7 +322,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L13: - MADD t1, b1, ALPHA, a1 + MADD t1, b1, ALPHA, a1 MADD t2, b2, ALPHA, a2 gsSQC1(Y_BASE, T2, T1, 0) gsLQC1(Y_BASE,B2,B1,4) @@ -344,7 +344,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. gsLQC1(Y_BASE,B8,B7,7) - MADD t1, b1, ALPHA, a9 + MADD t1, b1, ALPHA, a9 MADD t2, b2, ALPHA, a10 gsSQC1(Y_BASE, T2, T1, 4) @@ -354,7 +354,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. gsSQC1(Y_BASE, T4, T3, 5) - MADD t1, b5, ALPHA, a13 + MADD t1, b5, ALPHA, a13 MADD t2, b6, ALPHA, a14 gsSQC1(Y_BASE, T2, T1, 6) @@ -413,7 +413,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L30: //Y align, X unalign, INCX==INCY==1 //unloop 16 - + LD a1, 0 * SIZE(X) daddiu X, X, SIZE gsLQC1(X_BASE,A3,A2,0) @@ -426,18 +426,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. gsLQC1(X_BASE,A15,A14,6) LD a16, 14 * SIZE(X) - + gsLQC1(Y_BASE,B2,B1,0) gsLQC1(Y_BASE,B4,B3,1) gsLQC1(Y_BASE,B6,B5,2) gsLQC1(Y_BASE,B8,B7,3) - + blez I, .L32 NOP .align 5 - + .L31: - MADD t1, b1, ALPHA, a1 + MADD t1, b1, ALPHA, a1 MADD t2, b2, ALPHA, a2 gsSQC1(Y_BASE, T2, T1, 0) gsLQC1(Y_BASE,B2,B1,4) @@ -463,7 +463,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(Y)) PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(Y)) - MADD t1, b1, ALPHA, a9 + MADD t1, b1, ALPHA, a9 MADD t2, b2, ALPHA, a10 gsSQC1(Y_BASE, T2, T1, 4) gsLQC1(Y_BASE,B2,B1,8) @@ -476,7 +476,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PREFETCHD(PREFETCH_DISTANCE*SIZE(X)) PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X)) - MADD t1, b5, ALPHA, a13 + MADD t1, b5, ALPHA, a13 MADD t2, b6, ALPHA, a14 gsSQC1(Y_BASE, T2, T1, 6) gsLQC1(Y_BASE,B6,B5,10) @@ -485,7 +485,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MADD t4, b8, ALPHA, a16 gsSQC1(Y_BASE, T4, T3, 7) gsLQC1(Y_BASE,B8,B7,11) - + PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(X)) PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(X)) @@ -502,15 +502,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. daddiu I, I, -1 daddiu Y, Y, 16 * SIZE - + daddiu X, X, 16 * SIZE bgtz I, .L31 - + .align 5 //Loop end: .L32: - - MADD t1, b1, ALPHA, a1 + + MADD t1, b1, ALPHA, a1 MADD t2, b2, ALPHA, a2 gsSQC1(Y_BASE, T2, T1, 0) gsLQC1(Y_BASE,B2,B1,4) @@ -532,7 +532,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. gsLQC1(Y_BASE,B8,B7,7) - MADD t1, b1, ALPHA, a9 + MADD t1, b1, ALPHA, a9 MADD t2, b2, ALPHA, a10 gsSQC1(Y_BASE, T2, T1, 4) @@ -542,7 +542,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. gsSQC1(Y_BASE, T4, T3, 5) - MADD t1, b5, ALPHA, a13 + MADD t1, b5, ALPHA, a13 MADD t2, b6, ALPHA, a14 gsSQC1(Y_BASE, T2, T1, 6) @@ -558,8 +558,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //jump back to the remain process. b .L15 .align 5 - -//INCX!=1 or INCY != 1 + +//INCX!=1 or INCY != 1 .L20: dsra I, N, 3 move YY, Y diff --git a/kernel/mips64/dgemm_kernel_loongson3a_4x4.S b/kernel/mips64/dgemm_kernel_loongson3a_4x4.S index 3e95a3ed4..025f256f5 100644 --- a/kernel/mips64/dgemm_kernel_loongson3a_4x4.S +++ b/kernel/mips64/dgemm_kernel_loongson3a_4x4.S @@ -109,7 +109,7 @@ #define F27 27 #define F26 26 #define F25 25 -#define F24 24 +#define F24 24 #define F23 23 #define F22 22 #define F21 21 @@ -117,7 +117,7 @@ #define F19 19 #define F18 18 #define F17 17 -#define F16 16 +#define F16 16 #define F15 15 #define F14 14 #define F13 13 @@ -129,14 +129,14 @@ #define F7 7 #define F6 6 #define F5 5 -#define F4 4 -#define F3 3 -#define F2 2 -#define F1 1 +#define F4 4 +#define F3 3 +#define F2 2 +#define F1 1 #define F0 0 PROLOGUE - + daddiu $sp, $sp, -160 sd $16, 0($sp) sd $17, 8($sp) @@ -159,7 +159,7 @@ ST $f23,144($sp) - .align 5 + .align 5 .L0_N4: # Loop N ST ALPHA,152($sp) # Backup ALPHA move MCO,M # Backup M @@ -169,26 +169,26 @@ move AO,A # Backup A_addr dsra N,NCO,2 # N=NCO/2 - + dsll LDC,LDC,BASE_SHIFT # LDC*8Byte dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*4nr*8Byte=KC*2^5 - + #if defined(TRMMKERNEL) - LDARG OFFSET,160($sp) # OFFSET is relate to the data part + LDARG OFFSET,160($sp) # OFFSET is relate to the data part #endif #if defined(TRMMKERNEL) && !defined(LEFT) - neg KK,OFFSET + neg KK,OFFSET #endif - + move BO,B # Backup B_addr beq N,$0,.L0_N2 # N=0,NCO<4 dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*2mr*8Byte .L0_N4_Lb: # mr=4,nr=4 - move CO1,C + move CO1,C dsra M,MCO,2 # M=MCO/2 - + move A,AO # Reset A daddu CO2,C,LDC @@ -199,7 +199,7 @@ daddu CO4,CO3,LDC #if defined(TRMMKERNEL) && defined(LEFT) - move KK,OFFSET + move KK,OFFSET #endif beqz M,.L14_M2 daddu C,CO4,LDC # move C to next panel Cj @@ -227,25 +227,25 @@ MOV t12,t11 MOV t22,t11 gsLQC1(R8,F3,F2,1) # a2,a3 - + MOV t32,t11 MOV t42,t11 gsLQC1(R9,F11,F10,1) # b2,b3 MOV t13,t11 MOV t23,t11 - + MOV t33,t11 MOV t43,t11 MOV t14,t11 MOV t24,t11 - + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP,KCO,KK # temp is the length of the data part #elif defined(LEFT) - daddiu TEMP, KK, 4 # S=L,U=L + daddiu TEMP, KK, 4 # S=L,U=L #else daddiu TEMP, KK, 4 # S=R,U=U,for this two situation KK is the length of the data part #endif @@ -254,7 +254,7 @@ beqz K,.L15 MOV t44,t11 -#else +#else move B,BO # Reset B MTC $0,t11 # GEMM part NR=4,MR=4 gsLQC1(R8,F1,F0,0) # a0,a1 @@ -266,42 +266,42 @@ MOV t41,t11 MOV t12,t11 gsLQC1(R8,F3,F2,1) # a2,a3 - + MOV t22,t11 MOV t32,t11 gsLQC1(R9,F11,F10,1) # b2,b3 MOV t42,t11 dsra K,KCO,2 # K=KCO/2 - + MOV t13,t11 MOV t23,t11 - + MOV t33,t11 MOV t43,t11 MOV t14,t11 MOV t24,t11 - + MOV t34,t11 beqz K,.L15 MOV t44,t11 # clear 16 results registers #endif - + .align 5 .L11: # kr=4 - gsLQC1(R8,F5,F4,2) + gsLQC1(R8,F5,F4,2) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 - gsLQC1(R9,F13,F12,2) + gsLQC1(R9,F13,F12,2) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 gsLQC1(R8,F7,F6,3) MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 - + gsLQC1(R9,F15,F14,3) MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 @@ -309,17 +309,17 @@ FETCH $0,(PREB) MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 - + MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 - + FETCH $0,(PREA) MADD t33,t33,a2,b2 MADD t43,t43,a3,b2 MADD t34,t34,a2,b3 MADD t44,t44,a3,b3 - + .L12: gsLQC1(R8,F1,F0,4) MADD t11,t11,a4,b4 @@ -347,12 +347,12 @@ FETCH $0,4*SIZE(PREA) MADD t33,t33,a6,b6 MADD t43,t43,a7,b6 - + MADD t34,t34,a6,b7 MADD t44,t44,a7,b7 .L13: - gsLQC1(R8,F5,F4,6) + gsLQC1(R8,F5,F4,6) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 @@ -383,9 +383,9 @@ MADD t34,t34,a2,b3 MADD t44,t44,a3,b3 - + .L14: - gsLQC1(R8,F1,F0,0) + gsLQC1(R8,F1,F0,0) MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 @@ -413,7 +413,7 @@ MADD t33,t33,a6,b6 MADD t43,t43,a7,b6 daddu PREB,PREB,16*SIZE - + MADD t34,t34,a6,b7 MADD t44,t44,a7,b7 bnez K,.L11 @@ -421,19 +421,19 @@ .L15: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP, 2 #endif beqz K,.L18 nop -.L16: - gsLQC1(R8,F5,F4,2) +.L16: + gsLQC1(R8,F5,F4,2) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 - gsLQC1(R9,F13,F12,2) + gsLQC1(R9,F13,F12,2) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 @@ -460,7 +460,7 @@ MADD t34,t34,a2,b3 MADD t44,t44,a3,b3 - + .L17: gsLQC1(R8,F1,F0,0) MADD t11,t11,a4,b4 @@ -490,19 +490,19 @@ MADD t33,t33,a6,b6 MADD t43,t43,a7,b6 daddu PREA,PREA,8*SIZE - + MADD t34,t34,a6,b7 MADD t44,t44,a7,b7 - + .L18: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L19 + beqz K,.L19 LD ALPHA,152($sp) # Get ALPHA - + FETCH $0,0(PREB) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 @@ -534,8 +534,8 @@ MADD t44,t44,a3,b3 .L19: # Write Back to C -#ifndef TRMMKERNEL - LD c11,0(CO1) # GEMM write part +#ifndef TRMMKERNEL + LD c11,0(CO1) # GEMM write part LD c21,1*SIZE(CO1) # get 16 C LD c31,2*SIZE(CO1) LD c41,3*SIZE(CO1) @@ -605,11 +605,11 @@ daddu CO3,CO3,4*SIZE ST t44,3*SIZE(CO4) daddu PREB,BO,SPANB - - bnez M,.L10 + + bnez M,.L10 daddu CO4,CO4,4*SIZE -#else +#else MUL t11, ALPHA, t11 # TRMM write back part MUL t21, ALPHA, t21 MUL t31, ALPHA, t31 @@ -650,7 +650,7 @@ daddiu CO1,CO1, 4 * SIZE daddiu CO2,CO2, 4 * SIZE daddiu CO3,CO3, 4 * SIZE - daddiu CO4,CO4, 4 * SIZE + daddiu CO4,CO4, 4 * SIZE FETCH $0,4*SIZE(CO1) FETCH $0,4*SIZE(CO2) @@ -663,7 +663,7 @@ FETCH $0,0(CO4) #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - dsubu TEMP,KCO,KK + dsubu TEMP,KCO,KK #ifdef LEFT daddiu TEMP,TEMP, -4 #else @@ -675,10 +675,10 @@ daddu B,B,TEMP # mov B to the end of panel Bj #endif -#ifdef LEFT +#ifdef LEFT daddiu KK, KK,4 #endif - bnez M,.L10 + bnez M,.L10 nop #endif @@ -686,7 +686,7 @@ .align 3 .L14_M2: andi M, MCO, 2 # nr=4,mr=2 - beqz M,.L14_M1 + beqz M,.L14_M1 nop .L20: @@ -694,7 +694,7 @@ #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO # Reset B #else - dsll K,KK,1 + BASE_SHIFT # mr=2 + dsll K,KK,1 + BASE_SHIFT # mr=2 dsll TEMP,KK,2 + BASE_SHIFT # nr=4 daddu A,A,K daddu B,BO,TEMP @@ -707,11 +707,11 @@ MOV t12,t11 MOV t22,t11 gsLQC1(R9,F9,F8,0) # b0,b1 - + MOV t13,t11 MOV t23,t11 gsLQC1(R9,F11,F10,1) # b2,b3 - + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP,KCO,KK @@ -726,42 +726,42 @@ MOV t24,t11 # clear 2*4=8 results registers #else - move B,BO # Reset B + move B,BO # Reset B MTC $0,t11 - gsLQC1(R8,F1,F0,0) - + gsLQC1(R8,F1,F0,0) + MOV t21,t11 MOV t12,t11 - gsLQC1(R9,F9,F8,0) + gsLQC1(R9,F9,F8,0) MOV t22,t11 - dsra K,KCO,2 - gsLQC1(R9,F11,F10,1) - + dsra K,KCO,2 + gsLQC1(R9,F11,F10,1) + MOV t13,t11 MOV t23,t11 - + MOV t14,t11 beqz K,.L25 MOV t24,t11 #endif .L21: # nr=4,mr=2,kr=4 - gsLQC1(R8,F5,F4,1) + gsLQC1(R8,F5,F4,1) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 - gsLQC1(R9,F13,F12,2) + gsLQC1(R9,F13,F12,2) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 gsLQC1(R9,F15,F14,3) MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 - + MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 - + gsLQC1(R8,F3,F2,2) MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 @@ -778,7 +778,7 @@ MADD t24,t24,a5,b7 daddiu K,K,-1 - gsLQC1(R8,F7,F6,3) + gsLQC1(R8,F7,F6,3) MADD t11,t11,a2,b0 MADD t21,t21,a3,b0 @@ -811,7 +811,7 @@ bnez K,.L21 MADD t24,t24,a7,b7 -.L25: +.L25: #ifndef TRMMKERNEL andi K,KCO,2 # kr=2 #else @@ -820,12 +820,12 @@ beqz K,.L28 nop -.L26: - gsLQC1(R8,F5,F4,1) +.L26: + gsLQC1(R8,F5,F4,1) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 - gsLQC1(R9,F13,F12,2) + gsLQC1(R9,F13,F12,2) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 @@ -833,7 +833,7 @@ MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 daddu A,A,4*SIZE # 2mr*2kr - + MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 daddu B,B,8*SIZE # 4nr*2kr @@ -853,16 +853,16 @@ MADD t14,t14,a4,b7 MADD t24,t24,a5,b7 - -.L28: # kr=1 + +.L28: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L29 + beqz K,.L29 LD ALPHA,152($sp) # Get ALPHA - + MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,2*SIZE # 2mr*kr @@ -880,11 +880,11 @@ .L29: # Write Back to C #ifndef TRMMKERNEL LD c11,0(CO1) # GEMM write back part - LD c21,1*SIZE(CO1) + LD c21,1*SIZE(CO1) LD c12,0(CO2) LD c22,1*SIZE(CO2) - + LD c13,0(CO3) MADD t11,c11,t11,ALPHA LD c23,1*SIZE(CO3) @@ -923,25 +923,25 @@ #else MUL t11, ALPHA, t11 # TRMM write back part MUL t21, ALPHA, t21 - + ST t11, 0 * SIZE(CO1) MUL t12, ALPHA, t12 ST t21, 1 * SIZE(CO1) MUL t22, ALPHA, t22 - + ST t12, 0 * SIZE(CO2) MUL t13, ALPHA, t13 ST t22, 1 * SIZE(CO2) MUL t23, ALPHA, t23 - + ST t13, 0 * SIZE(CO3) MUL t14, ALPHA, t14 ST t23, 1 * SIZE(CO3) MUL t24, ALPHA, t24 - + ST t14, 0 * SIZE(CO4) ST t24, 1 * SIZE(CO4) - + daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE daddiu CO3,CO3, 2 * SIZE @@ -974,7 +974,7 @@ .align 3 .L14_M1: - andi M,MCO,1 # mr=1 + andi M,MCO,1 # mr=1 beqz M,.L0_N4_Loop # M = 0, finishing one panel Bj nop @@ -1010,8 +1010,8 @@ nop beqz K,.L35 nop - -#else + +#else move B,BO # Reset B, GEMM part dsra K,KCO,2 # K=KCO/2 LD a0, 0 * SIZE(A) # a0 @@ -1023,28 +1023,28 @@ MOV t13,t11 MOV t14,t11 gsLQC1(R9,F11,F10,1) # b2,b3 - + beqz K,.L35 nop #endif -.L31: # nr=4,mr=1,kr=4 +.L31: # nr=4,mr=1,kr=4 LD a1, 1*SIZE(A) # load a1 MADD t11,t11,a0,b0 - + gsLQC1(R9,F13,F12,2) # b4,b5 MADD t12,t12,a0,b1 - + gsLQC1(R9,F15,F14,3) # b6,b7 MADD t13,t13,a0,b2 MADD t14,t14,a0,b3 LD a2, 2*SIZE(A) # a2 MADD t11,t11,a1,b4 - + gsLQC1(R9,F9,F8,4) MADD t12,t12,a1,b5 - + gsLQC1(R9,F11,F10,5) MADD t13,t13,a1,b6 MADD t14,t14,a1,b7 @@ -1052,11 +1052,11 @@ LD a3, 3*SIZE(A) # a3 MADD t11,t11,a2,b0 - + gsLQC1(R9,F13,F12,6) MADD t12,t12,a2,b1 daddu A,A,4*SIZE # 1mr*4kr - + gsLQC1(R9,F15,F14,7) MADD t13,t13,a2,b2 MADD t14,t14,a2,b3 @@ -1064,10 +1064,10 @@ LD a0, 0*SIZE(A) # a0 MADD t11,t11,a3,b4 - + gsLQC1(R9,F9,F8,0) MADD t12,t12,a3,b5 - + gsLQC1(R9,F11,F10,1) MADD t13,t13,a3,b6 bnez K,.L31 @@ -1075,21 +1075,21 @@ .L35: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L38 nop -.L36: +.L36: LD a1,1*SIZE(A) # load a1 MADD t11,t11,a0,b0 - - gsLQC1(R9,F13,F12,2) + + gsLQC1(R9,F13,F12,2) MADD t12,t12,a0,b1 daddu A,A,2*SIZE # mr*2kr - + gsLQC1(R9,F15,F14,3) MADD t13,t13,a0,b2 MADD t14,t14,a0,b3 @@ -1099,38 +1099,38 @@ .L37: LD a0,0(A) MADD t11,t11,a1,b4 - + gsLQC1(R9,F9,F8,0) MADD t12,t12,a1,b5 - + gsLQC1(R9,F11,F10,1) MADD t13,t13,a1,b6 MADD t14,t14,a1,b7 - + .L38: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L39 + beqz K,.L39 LD ALPHA,152($sp) # Get ALPHA - + MADD t11,t11,a0,b0 MADD t12,t12,a0,b1 - daddu A,A,1*SIZE + daddu A,A,1*SIZE daddu B,B,4*SIZE - + MADD t13,t13,a0,b2 MADD t14,t14,a0,b3 .L39: # Write Back #ifndef TRMMKERNEL - LD c11,0(CO1) + LD c11,0(CO1) LD c12,0(CO2) LD c13,0(CO3) LD c14,0(CO4) - + MADD t11,c11,t11,ALPHA MADD t12,c12,t12,ALPHA MADD t13,c13,t13,ALPHA @@ -1176,22 +1176,22 @@ .L0_N4_Loop: # mc finished daddiu N,N,-1 # N-- #if defined(TRMMKERNEL) && !defined(LEFT) - daddiu KK, KK,4 + daddiu KK, KK,4 #endif - bnez N,.L0_N4_Lb + bnez N,.L0_N4_Lb move BO,B # Set BO point to next panel Bj - .align 5 + .align 5 .L0_N2: andi N,NCO,2 # nr = 2 - beqz N,.L0_N1 + beqz N,.L0_N1 nop .L0_N2_Lb: - move CO1,C + move CO1,C daddu CO2,C,LDC - dsra M,MCO,2 + dsra M,MCO,2 move A,AO # Reset A daddu PREA,AO,SPANA @@ -1203,13 +1203,13 @@ beqz M,.L12_M2 nop -.L40: +.L40: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO # Reset B #else dsll K,KK, 2 + BASE_SHIFT - dsll TEMP, KK,1 + BASE_SHIFT + dsll TEMP, KK,1 + BASE_SHIFT daddu A,A,K daddu B,BO,TEMP @@ -1225,7 +1225,7 @@ MOV t12,t11 MOV t22,t11 gsLQC1(R8,F3,F2,1) # a2,a3 - + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP,KCO,KK #elif defined(LEFT) @@ -1233,7 +1233,7 @@ #else daddiu TEMP, KK, 2 #endif - dsra K,TEMP,2 + dsra K,TEMP,2 MOV t32,t11 beqz K,.L45 MOV t42,t11 @@ -1250,28 +1250,28 @@ MOV t41,t11 dsra K,KCO,2 # K=KCO/2 gsLQC1(R8,F3,F2,1) # a2,a3 - + MOV t12,t11 MOV t22,t11 - + MOV t32,t11 beqz K,.L45 MOV t42,t11 #endif .L41: # nr=2,mr=kr=4 - gsLQC1(R8,F5,F4,2) + gsLQC1(R8,F5,F4,2) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 - gsLQC1(R9,F13,F12,1) + gsLQC1(R9,F13,F12,1) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 gsLQC1(R8,F7,F6,3) MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 - + FETCH $0,(PREA) MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 @@ -1294,7 +1294,7 @@ MADD t42,t42,a7,b5 .L43: - gsLQC1(R8,F5,F4,6) + gsLQC1(R8,F5,F4,6) MADD t11,t11,a0,b2 MADD t21,t21,a1,b2 @@ -1305,7 +1305,7 @@ gsLQC1(R8,F7,F6,7) MADD t31,t31,a2,b2 MADD t41,t41,a3,b2 - daddu B,B,8*SIZE # 2nr*4kr + daddu B,B,8*SIZE # 2nr*4kr FETCH $0,8*SIZE(PREA) MADD t32,t32,a2,b3 @@ -1335,19 +1335,19 @@ .L45: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L48 nop -.L46: - gsLQC1(R8,F5,F4,2) +.L46: + gsLQC1(R8,F5,F4,2) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 - gsLQC1(R9,F13,F12,1) + gsLQC1(R9,F13,F12,1) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 @@ -1379,16 +1379,16 @@ MADD t42,t42,a7,b5 daddu PREA,PREA,8*SIZE - + .L48: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L49 + beqz K,.L49 LD ALPHA,152($sp) # Get ALPHA - + FETCH $0,0(PREA) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 @@ -1408,7 +1408,7 @@ .L49: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # gemm write back part Fetch 16 C - LD c21,1*SIZE(CO1) + LD c21,1*SIZE(CO1) LD c31,2*SIZE(CO1) LD c41,3*SIZE(CO1) @@ -1429,7 +1429,7 @@ MADD t32,c32,t32,ALPHA ST t41,3*SIZE(CO1) MADD t42,c42,t42,ALPHA - daddiu M,M,-1 + daddiu M,M,-1 ST t12,0(CO2) ST t22,1*SIZE(CO2) @@ -1441,8 +1441,8 @@ FETCH $0,8*SIZE(CO1) FETCH $0,8*SIZE(CO2) - daddu CO1,CO1,4*SIZE - bnez M,.L40 + daddu CO1,CO1,4*SIZE + bnez M,.L40 daddu CO2,CO2,4*SIZE #else @@ -1450,7 +1450,7 @@ MUL t21, ALPHA, t21 MUL t31, ALPHA, t31 MUL t41, ALPHA, t41 - + MUL t12, ALPHA, t12 ST t11, 0 * SIZE(CO1) MUL t22, ALPHA, t22 @@ -1459,13 +1459,13 @@ ST t31, 2 * SIZE(CO1) MUL t42, ALPHA, t42 ST t41, 3 * SIZE(CO1) - + ST t12, 0 * SIZE(CO2) daddiu M,M,-1 ST t22, 1 * SIZE(CO2) ST t32, 2 * SIZE(CO2) ST t42, 3 * SIZE(CO2) - + daddiu CO1,CO1, 4*SIZE daddiu CO2,CO2, 4*SIZE @@ -1499,7 +1499,7 @@ .align 3 .L12_M2: andi M,MCO,2 # mr = 2 - beqz M,.L12_M1 + beqz M,.L12_M1 nop .L50: @@ -1525,7 +1525,7 @@ #else daddiu TEMP, KK, 2 #endif - dsra K,TEMP,2 + dsra K,TEMP,2 MOV t12,t11 beqz K,.L55 MOV t22,t11 @@ -1538,18 +1538,18 @@ MTC $0,t11 MOV t21,t11 gsLQC1(R9,F9,F8,0) #b0,b1 - + MOV t12,t11 beqz K,.L55 MOV t22,t11 #endif .L51: # nr=2 mr=2,kr=4 - gsLQC1(R8,F5,F4,1) + gsLQC1(R8,F5,F4,1) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 - gsLQC1(R9,F13,F12,1) + gsLQC1(R9,F13,F12,1) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 @@ -1562,7 +1562,7 @@ MADD t22,t22,a5,b5 daddiu K,K,-1 - gsLQC1(R8,F7,F6,3) + gsLQC1(R8,F7,F6,3) MADD t11,t11,a2,b2 MADD t21,t21,a3,b2 daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE @@ -1583,20 +1583,20 @@ .L55: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L58 nop -.L56: - gsLQC1(R8,F5,F4,1) +.L56: + gsLQC1(R8,F5,F4,1) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 - gsLQC1(R9,F13,F12,1) + gsLQC1(R9,F13,F12,1) MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 daddu B,B,4*SIZE # 2nr*2kr @@ -1610,16 +1610,16 @@ MADD t12,t12,a4,b5 MADD t22,t22,a5,b5 - + .L58: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP, 1 #endif - beqz K,.L59 + beqz K,.L59 LD ALPHA,152($sp) # Get ALPHA - + MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 @@ -1632,10 +1632,10 @@ .L59: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # write gemm part back Fetch 16 C - LD c21,1*SIZE(CO1) + LD c21,1*SIZE(CO1) LD c12,0(CO2) LD c22,1*SIZE(CO2) - + MADD t11,c11,t11,ALPHA MADD t21,c21,t21,ALPHA MADD t12,c12,t12,ALPHA @@ -1646,7 +1646,7 @@ ST t12,0(CO2) ST t22,1*SIZE(CO2) - daddu CO1,CO1,2*SIZE + daddu CO1,CO1,2*SIZE daddu CO2,CO2,2*SIZE FETCH $0,0(CO1) @@ -1692,7 +1692,7 @@ .align 3 .L12_M1: andi M,MCO,1 # mr = 1 - beqz M,.L0_N2_Loop + beqz M,.L0_N2_Loop nop .L60: @@ -1708,10 +1708,10 @@ #endif MTC $0,t11 LD a0, 0*SIZE(A) # a0 - + MOV t21,t11 gsLQC1(R9,F9,F8,0) # b0,b1 - + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) @@ -1719,42 +1719,42 @@ #else daddiu TEMP, KK, 2 #endif - dsra K,TEMP,2 + dsra K,TEMP,2 MOV t12,t11 beqz K,.L65 MOV t22,t11 #else - dsra K,KCO,2 + dsra K,KCO,2 move B,BO # Reset B LD a0,0*SIZE(A) - + MTC $0,t11 MOV t21,t11 - gsLQC1(R9,F9,F8,0) + gsLQC1(R9,F9,F8,0) MOV t12,t11 beqz K,.L65 MOV t22,t11 #endif -.L61: # nr=2,mr=1,kr=4 +.L61: # nr=2,mr=1,kr=4 LD a4, 1*SIZE(A) # a2 MADD t11,t11,a0,b0 - - gsLQC1(R9,F13,F12,1) + + gsLQC1(R9,F13,F12,1) MADD t12,t12,a0,b1 LD a2, 2*SIZE(A) # a3 MADD t11,t11,a4,b4 - + gsLQC1(R9,F11,F10,2) MADD t12,t12,a4,b5 LD a6, 3*SIZE(A) # a4 MADD t11,t11,a2,b2 daddiu K,K,-1 - + gsLQC1(R9,F15,F14,3) MADD t12,t12,a2,b3 daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 @@ -1762,46 +1762,46 @@ LD a0, 0*SIZE(A) MADD t11,t11,a6,b6 daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE - + gsLQC1(R9,F9,F8,0) # a0 bnez K,.L61 MADD t12,t12,a6,b7 .L65: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L68 nop -.L66: +.L66: LD a4, 1*SIZE(A) # a1 MADD t11,t11,a0,b0 daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16 - - gsLQC1(R9,F13,F12,1) + + gsLQC1(R9,F13,F12,1) MADD t12,t12,a0,b1 daddu B,B,4*SIZE .L67: LD a0,0(A) # a0 MADD t11,t11,a4,b4 - + gsLQC1(R9,F9,F8,0) MADD t12,t12,a4,b5 - + .L68: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L69 + beqz K,.L69 LD ALPHA,152($sp) # Get ALPHA - + MADD t11,t11,a0,b0 MADD t12,t12,a0,b1 daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=16 @@ -1812,14 +1812,14 @@ #ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C LD c12,0(CO2) - + MADD t11,c11,t11,ALPHA MADD t12,c12,t12,ALPHA ST t11,0(CO1) ST t12,0(CO2) - daddu CO1,CO1,1*SIZE + daddu CO1,CO1,1*SIZE daddu CO2,CO2,1*SIZE #else @@ -1829,7 +1829,7 @@ ST t11, 0 * SIZE(CO1) ST t12, 0 * SIZE(CO2) - daddu CO1,CO1,1*SIZE + daddu CO1,CO1,1*SIZE daddu CO2,CO2,1*SIZE #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1859,15 +1859,15 @@ move BO, B - .align 5 + .align 5 .L0_N1: andi N,NCO,1 # nr = 1 - beqz N,.L999 + beqz N,.L999 nop - move CO1,C - dsra M,MCO,2 - + move CO1,C + dsra M,MCO,2 + move A,AO # Reset A daddu PREA,AO,SPANA #if defined(TRMMKERNEL) && defined(LEFT) @@ -1877,7 +1877,7 @@ beqz M,.L11_M2 daddu C,CO1,LDC -.L70: +.L70: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B, BO # Reset B @@ -1891,7 +1891,7 @@ MTC $0,t11 LD b0, 0*SIZE(B) - + MOV t21,t11 gsLQC1(R8,F1,F0,0) #a0,a1 @@ -1904,23 +1904,23 @@ #else daddiu TEMP, KK, 1 #endif - dsra K,TEMP,2 + dsra K,TEMP,2 MOV t41,t11 beqz K,.L75 nop #else move B, BO # Reset B - dsra K,KCO,2 + dsra K,KCO,2 LD b0, 0*SIZE(B) - + MTC $0,t11 MOV t21,t11 gsLQC1(R8,F1,F0,0) #a0,a1 - + MOV t31,t11 MOV t41,t11 gsLQC1(R8,F3,F2,1) #a2,a3 - + beqz K,.L75 nop #endif @@ -1928,8 +1928,8 @@ .L71: # nr=1,mr=kr=4 LD b4, 1*SIZE(B) # b1 MADD t11,t11,a0,b0 - - gsLQC1(R8,F5,F4,2) + + gsLQC1(R8,F5,F4,2) MADD t21,t21,a1,b0 gsLQC1(R8,F7,F6,3) @@ -1952,8 +1952,8 @@ LD b6, 3*SIZE(B) MADD t11,t11,a0,b2 daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 - - gsLQC1(R8,F5,F4,6) + + gsLQC1(R8,F5,F4,6) MADD t21,t21,a1,b2 FETCH $0,8*SIZE(PREA) @@ -1966,7 +1966,7 @@ LD b0, 0*SIZE(B) MADD t11,t11,a4,b6 daddu PREA,PREA,16*SIZE - + gsLQC1(R8,F1,F0,0) MADD t21,t21,a5,b6 daddiu K,K,-1 @@ -1980,19 +1980,19 @@ .L75: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L78 nop -.L76: +.L76: LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=32 - - gsLQC1(R8,F5,F4,2) + + gsLQC1(R8,F5,F4,2) MADD t21,t21,a1,b0 FETCH $0,0(PREA) @@ -2004,7 +2004,7 @@ .L77: LD b0,0(B) MADD t11,t11,a4,b4 - + gsLQC1(R8,F1,F0,0) MADD t21,t21,a5,b4 FETCH $0,4*SIZE(PREA) @@ -2014,16 +2014,16 @@ MADD t41,t41,a7,b4 daddu PREA,PREA,8*SIZE - + .L78: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L79 + beqz K,.L79 LD ALPHA,152($sp) # Get ALPHA - + FETCH $0,0(PREA) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 @@ -2038,7 +2038,7 @@ .L79: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C - LD c21,1*SIZE(CO1) + LD c21,1*SIZE(CO1) LD c31,2*SIZE(CO1) LD c41,3*SIZE(CO1) @@ -2073,7 +2073,7 @@ FETCH $0,4*SIZE(CO1) FETCH $0,8*SIZE(CO1) - daddu CO1,CO1,4*SIZE + daddu CO1,CO1,4*SIZE #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT @@ -2092,7 +2092,7 @@ #ifdef LEFT daddiu KK, KK, 4 #endif - bnez M,.L70 + bnez M,.L70 nop #endif @@ -2100,10 +2100,10 @@ .align 3 .L11_M2: andi M,MCO,2 # mr = 2 - beqz M,.L11_M1 + beqz M,.L11_M1 nop -.L80: +.L80: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B, BO @@ -2117,7 +2117,7 @@ LD b0, 0*SIZE(B) MTC $0,t11 - + gsLQC1(R8,F1,F0,0) #a0,a1 MOV t21,t11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) @@ -2132,20 +2132,20 @@ nop #else move B, BO - dsra K,KCO,2 + dsra K,KCO,2 LD b0, 0*SIZE(B) MTC $0,t11 MOV t21,t11 gsLQC1(R8,F1,F0,0) #a0,a1 - + beqz K,.L85 nop #endif .L81: # nr=1,mr=2,kr=4 LD b4, 1*SIZE(B) - gsLQC1(R8,F5,F4,1) + gsLQC1(R8,F5,F4,1) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 @@ -2153,7 +2153,7 @@ gsLQC1(R8,F3,F2,2) MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 - + LD b6, 3*SIZE(B) gsLQC1(R8,F7,F6,3) MADD t11,t11,a2,b2 @@ -2166,44 +2166,44 @@ gsLQC1(R8,F1,F0,0) MADD t11,t11,a6,b6 MADD t21,t21,a7,b6 - + daddiu K,K,-1 bnez K,.L81 nop .L85: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L88 nop -.L86: - gsLQC1(R8,F5,F4,1) +.L86: + gsLQC1(R8,F5,F4,1) LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 - + daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 - + gsLQC1(R8,F1,F0,0) LD b0,0(B) MADD t11,t11,a4,b4 MADD t21,t21,a5,b4 - + .L88: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L89 + beqz K,.L89 LD ALPHA,152($sp) # Get ALPHA - + MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 @@ -2213,7 +2213,7 @@ .L89: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C - LD c21,1*SIZE(CO1) + LD c21,1*SIZE(CO1) MADD t11,c11,t11,ALPHA MADD t21,c21,t21,ALPHA @@ -2222,7 +2222,7 @@ ST t21,1*SIZE(CO1) FETCH $0,2*SIZE(CO1) - + daddu CO1,CO1,2*SIZE # COx += 2*8Byte #else @@ -2257,10 +2257,10 @@ .align 3 .L11_M1: andi M,MCO,1 # mr = 1 - beqz M,.L999 + beqz M,.L999 nop -.L90: +.L90: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B, BO @@ -2289,7 +2289,7 @@ move B, BO LD a0, 0*SIZE(A) LD b0, 0*SIZE(B) - dsra K,KCO,2 + dsra K,KCO,2 beqz K,.L95 MTC $0,t11 #endif @@ -2298,7 +2298,7 @@ LD a4, 1*SIZE(A) LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 - + LD a2, 2*SIZE(A) LD b2, 2*SIZE(B) MADD t11,t11,a4,b4 @@ -2306,28 +2306,28 @@ LD a6, 3*SIZE(A) LD b6, 3*SIZE(B) MADD t11,t11,a2,b2 - + daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 LD a0, 0*SIZE(A) LD b0, 0*SIZE(B) MADD t11,t11,a6,b6 - + daddiu K,K,-1 bnez K,.L91 nop .L95: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L98 nop -.L96: +.L96: LD a4, 1*SIZE(A) LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 @@ -2337,14 +2337,14 @@ LD b0,0(B) LD a0,0(A) MADD t11,t11,a4,b4 - + .L98: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L99 + beqz K,.L99 LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 diff --git a/kernel/mips64/dgemm_kernel_loongson3b_4x4.S b/kernel/mips64/dgemm_kernel_loongson3b_4x4.S index 4a8c9b0e4..10c5f47de 100644 --- a/kernel/mips64/dgemm_kernel_loongson3b_4x4.S +++ b/kernel/mips64/dgemm_kernel_loongson3b_4x4.S @@ -110,7 +110,7 @@ #define F27 27 #define F26 26 #define F25 25 -#define F24 24 +#define F24 24 #define F23 23 #define F22 22 #define F21 21 @@ -118,7 +118,7 @@ #define F19 19 #define F18 18 #define F17 17 -#define F16 16 +#define F16 16 #define F15 15 #define F14 14 #define F13 13 @@ -130,14 +130,14 @@ #define F7 7 #define F6 6 #define F5 5 -#define F4 4 -#define F3 3 -#define F2 2 -#define F1 1 +#define F4 4 +#define F3 3 +#define F2 2 +#define F1 1 #define F0 0 PROLOGUE - + daddiu $sp, $sp, -160 sd $16, 0($sp) sd $17, 8($sp) @@ -160,7 +160,7 @@ ST $f23,144($sp) - .align 5 + .align 5 .L0_N4: # Loop N ST ALPHA,152($sp) # Backup ALPHA move MCO,M # Backup M @@ -170,26 +170,26 @@ move AO,A # Backup A_addr dsra N,NCO,2 # N=NCO/2 - + dsll LDC,LDC,BASE_SHIFT # LDC*8Byte dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*4nr*8Byte=KC*2^5 - + #if defined(TRMMKERNEL) - LDARG OFFSET,160($sp) # OFFSET is relate to the data part + LDARG OFFSET,160($sp) # OFFSET is relate to the data part #endif #if defined(TRMMKERNEL) && !defined(LEFT) - neg KK,OFFSET + neg KK,OFFSET #endif - + move BO,B # Backup B_addr beq N,$0,.L0_N2 # N=0,NCO<4 dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*2mr*8Byte .L0_N4_Lb: # mr=4,nr=4 - move CO1,C + move CO1,C dsra M,MCO,2 # M=MCO/2 - + move A,AO # Reset A daddu CO2,C,LDC @@ -200,7 +200,7 @@ daddu CO4,CO3,LDC #if defined(TRMMKERNEL) && defined(LEFT) - move KK,OFFSET + move KK,OFFSET #endif beqz M,.L14_M2 daddu C,CO4,LDC # move C to next panel Cj @@ -227,18 +227,18 @@ MOV t41,t11 MOV t12,t11 LD b0,0(B) - + MOV t22,t11 MOV t32,t11 LD b1,1*SIZE(B) MOV t42,t11 LD a2,2*SIZE(A) - + MOV t13,t11 MOV t23,t11 LD b2,2*SIZE(B) - + MOV t33,t11 MOV t43,t11 LD a3,3*SIZE(A) @@ -250,7 +250,7 @@ #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP,KCO,KK # temp is the length of the data part #elif defined(LEFT) - daddiu TEMP, KK, 4 # S=L,U=L + daddiu TEMP, KK, 4 # S=L,U=L #else daddiu TEMP, KK, 4 # S=R,U=U,for this two situation KK is the length of the data part #endif @@ -259,7 +259,7 @@ beqz K,.L15 MOV t44,t11 -#else +#else move B,BO # Reset B MTC $0,t11 # GEMM part NR=4,MR=4 LD a0,0(A) @@ -271,7 +271,7 @@ MOV t41,t11 MOV t12,t11 LD b0,0(B) - + MOV t22,t11 MOV t32,t11 LD b1,1*SIZE(B) @@ -279,11 +279,11 @@ MOV t42,t11 dsra K,KCO,2 # K=KCO/2 LD a2,2*SIZE(A) - + MOV t13,t11 MOV t23,t11 LD b2,2*SIZE(B) - + MOV t33,t11 MOV t43,t11 LD a3,3*SIZE(A) @@ -296,7 +296,7 @@ beqz K,.L15 MOV t44,t11 # clear 16 results registers #endif - + .align 5 .L11: # kr=4 MADD t11,t11,a0,b0 @@ -306,29 +306,29 @@ MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 LD a5,5*SIZE(A) - + MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 LD b4,4*SIZE(B) - + MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 LD b5,5*SIZE(B) FETCH $0,(PREB) - + MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 LD a6,6*SIZE(A) - + MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 LD b6,6*SIZE(B) FETCH $0,(PREA) - + MADD t33,t33,a2,b2 MADD t43,t43,a3,b2 LD a7,7*SIZE(A) - + MADD t34,t34,a2,b3 MADD t44,t44,a3,b3 LD b7,7*SIZE(B) @@ -447,14 +447,14 @@ .L15: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP, 2 #endif beqz K,.L18 nop -.L16: +.L16: MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 LD a4,4*SIZE(A) @@ -528,16 +528,16 @@ daddu PREB,PREB,8*SIZE LD b3,3*SIZE(B) - + .L18: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L19 + beqz K,.L19 LD ALPHA,152($sp) # Get ALPHA - + FETCH $0,0(PREB) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 @@ -569,8 +569,8 @@ MADD t44,t44,a3,b3 .L19: # Write Back to C -#ifndef TRMMKERNEL - LD c11,0(CO1) # GEMM write part +#ifndef TRMMKERNEL + LD c11,0(CO1) # GEMM write part LD c21,1*SIZE(CO1) # get 16 C LD c31,2*SIZE(CO1) LD c41,3*SIZE(CO1) @@ -640,11 +640,11 @@ daddu CO3,CO3,4*SIZE ST t44,3*SIZE(CO4) daddu PREB,BO,SPANB - - bnez M,.L10 + + bnez M,.L10 daddu CO4,CO4,4*SIZE -#else +#else MUL t11, ALPHA, t11 # TRMM write back part MUL t21, ALPHA, t21 MUL t31, ALPHA, t31 @@ -685,7 +685,7 @@ daddiu CO1,CO1, 4 * SIZE daddiu CO2,CO2, 4 * SIZE daddiu CO3,CO3, 4 * SIZE - daddiu CO4,CO4, 4 * SIZE + daddiu CO4,CO4, 4 * SIZE FETCH $0,4*SIZE(CO1) FETCH $0,4*SIZE(CO2) @@ -698,7 +698,7 @@ FETCH $0,0(CO4) #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - dsubu TEMP,KCO,KK + dsubu TEMP,KCO,KK #ifdef LEFT daddiu TEMP,TEMP, -4 #else @@ -710,10 +710,10 @@ daddu B,B,TEMP # mov B to the end of panel Bj #endif -#ifdef LEFT +#ifdef LEFT daddiu KK, KK,4 #endif - bnez M,.L10 + bnez M,.L10 nop #endif @@ -721,7 +721,7 @@ .align 3 .L14_M2: andi M, MCO, 2 # nr=4,mr=2 - beqz M,.L14_M1 + beqz M,.L14_M1 nop .L20: @@ -729,7 +729,7 @@ #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO # Reset B #else - dsll K,KK,1 + BASE_SHIFT # mr=2 + dsll K,KK,1 + BASE_SHIFT # mr=2 dsll TEMP,KK,2 + BASE_SHIFT # nr=4 daddu A,A,K daddu B,BO,TEMP @@ -738,7 +738,7 @@ LD a0,0*SIZE(A) MTC $0,t11 LD a1,1*SIZE(A) - + MOV t21,t11 LD b0,0*SIZE(B) MOV t12,t11 @@ -764,18 +764,18 @@ MOV t24,t11 # clear 2*4=8 results registers #else - move B,BO # Reset B + move B,BO # Reset B LD a0,0*SIZE(A) MTC $0,t11 LD a1,1*SIZE(A) - + MOV t21,t11 LD b0,0*SIZE(B) MOV t12,t11 LD b1,1*SIZE(B) MOV t22,t11 - dsra K,KCO,2 + dsra K,KCO,2 LD b2,2*SIZE(B) MOV t13,t11 @@ -806,7 +806,7 @@ MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 - + MADD t11,t11,a4,b4 LD a2,4*SIZE(A) MADD t21,t21,a5,b4 @@ -866,7 +866,7 @@ MADD t24,t24,a7,b7 -.L25: +.L25: #ifndef TRMMKERNEL andi K,KCO,2 # kr=2 #else @@ -875,7 +875,7 @@ beqz K,.L28 nop -.L26: +.L26: MADD t11,t11,a0,b0 LD a4,2*SIZE(A) MADD t21,t21,a1,b0 @@ -890,7 +890,7 @@ LD b6,6*SIZE(B) MADD t23,t23,a1,b2 LD b7,7*SIZE(B) - + MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 daddu A,A,4*SIZE # 2mr*2kr @@ -915,16 +915,16 @@ MADD t14,t14,a4,b7 MADD t24,t24,a5,b7 - -.L28: # kr=1 + +.L28: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L29 + beqz K,.L29 LD ALPHA,152($sp) # Get ALPHA - + MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,2*SIZE # 2mr*kr @@ -942,11 +942,11 @@ .L29: # Write Back to C #ifndef TRMMKERNEL LD c11,0(CO1) # GEMM write back part - LD c21,1*SIZE(CO1) + LD c21,1*SIZE(CO1) LD c12,0(CO2) LD c22,1*SIZE(CO2) - + LD c13,0(CO3) MADD t11,c11,t11,ALPHA LD c23,1*SIZE(CO3) @@ -985,25 +985,25 @@ #else MUL t11, ALPHA, t11 # TRMM write back part MUL t21, ALPHA, t21 - + ST t11, 0 * SIZE(CO1) MUL t12, ALPHA, t12 ST t21, 1 * SIZE(CO1) MUL t22, ALPHA, t22 - + ST t12, 0 * SIZE(CO2) MUL t13, ALPHA, t13 ST t22, 1 * SIZE(CO2) MUL t23, ALPHA, t23 - + ST t13, 0 * SIZE(CO3) MUL t14, ALPHA, t14 ST t23, 1 * SIZE(CO3) MUL t24, ALPHA, t24 - + ST t14, 0 * SIZE(CO4) ST t24, 1 * SIZE(CO4) - + daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE daddiu CO3,CO3, 2 * SIZE @@ -1036,7 +1036,7 @@ .align 3 .L14_M1: - andi M,MCO,1 # mr=1 + andi M,MCO,1 # mr=1 beqz M,.L0_N4_Loop # M = 0, finishing one panel Bj nop @@ -1056,13 +1056,13 @@ MTC $0,t11 LD b0,0*SIZE(B) - + MOV t12,t11 LD b1,1*SIZE(B) MOV t13,t11 LD b2,2*SIZE(B) - + MOV t14,t11 LD b3,3*SIZE(B) @@ -1077,35 +1077,35 @@ nop beqz K,.L35 nop - -#else + +#else move B,BO # Reset B, GEMM part dsra K,KCO,2 # K=KCO/2 LD a0, 0 * SIZE(A) # a0 MTC $0,t11 LD b0,0*SIZE(B) - + MOV t12,t11 LD b1,1*SIZE(B) MOV t13,t11 LD b2,2*SIZE(B) - + MOV t14,t11 beqz K,.L35 LD b3,3*SIZE(B) #endif -.L31: # nr=4,mr=1,kr=4 +.L31: # nr=4,mr=1,kr=4 LD a1, 1*SIZE(A) # load a1 MADD t11,t11,a0,b0 - + LD b4,4*SIZE(B) LD b5,5*SIZE(B) MADD t12,t12,a0,b1 - + LD b6,6*SIZE(B) LD b7,7*SIZE(B) MADD t13,t13,a0,b2 @@ -1113,11 +1113,11 @@ LD a2, 2*SIZE(A) # a2 MADD t11,t11,a1,b4 - + LD b0,8*SIZE(B) LD b1,9*SIZE(B) MADD t12,t12,a1,b5 - + LD b2,10*SIZE(B) LD b3,11*SIZE(B) MADD t13,t13,a1,b6 @@ -1126,12 +1126,12 @@ LD a3, 3*SIZE(A) # a3 MADD t11,t11,a2,b0 daddiu K,K,-1 - + LD b4,12*SIZE(B) LD b5,13*SIZE(B) MADD t12,t12,a2,b1 daddu A,A,4*SIZE # 1mr*4kr - + LD b6,14*SIZE(B) LD b7,15*SIZE(B) MADD t13,t13,a2,b2 @@ -1140,7 +1140,7 @@ LD a0, 0*SIZE(A) # a0 daddu B,B,16*SIZE # 4nr*4kr MADD t11,t11,a3,b4 - + LD b0,0*SIZE(B) MADD t12,t12,a3,b5 LD b1,1*SIZE(B) @@ -1154,14 +1154,14 @@ .L35: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L38 nop -.L36: +.L36: LD a1,1*SIZE(A) # load a1 MADD t11,t11,a0,b0 @@ -1169,10 +1169,10 @@ LD b5,5*SIZE(B) MADD t12,t12,a0,b1 daddu A,A,2*SIZE # mr*2kr - + LD b6,6*SIZE(B) MADD t13,t13,a0,b2 - + LD b7,7*SIZE(B) MADD t14,t14,a0,b3 daddu B,B,8*SIZE # 4nr*2kr @@ -1181,41 +1181,41 @@ .L37: LD a0,0(A) MADD t11,t11,a1,b4 - + LD b0,0*SIZE(B) LD b1,1*SIZE(B) MADD t12,t12,a1,b5 - + LD b2,2*SIZE(B) LD b3,3*SIZE(B) MADD t13,t13,a1,b6 MADD t14,t14,a1,b7 - - + + .L38: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L39 + beqz K,.L39 LD ALPHA,152($sp) # Get ALPHA - + MADD t11,t11,a0,b0 MADD t12,t12,a0,b1 - daddu A,A,1*SIZE + daddu A,A,1*SIZE daddu B,B,4*SIZE - + MADD t13,t13,a0,b2 MADD t14,t14,a0,b3 .L39: # Write Back #ifndef TRMMKERNEL - LD c11,0(CO1) + LD c11,0(CO1) LD c12,0(CO2) LD c13,0(CO3) LD c14,0(CO4) - + MADD t11,c11,t11,ALPHA MADD t12,c12,t12,ALPHA MADD t13,c13,t13,ALPHA @@ -1261,22 +1261,22 @@ .L0_N4_Loop: # mc finished daddiu N,N,-1 # N-- #if defined(TRMMKERNEL) && !defined(LEFT) - daddiu KK, KK,4 + daddiu KK, KK,4 #endif - bnez N,.L0_N4_Lb + bnez N,.L0_N4_Lb move BO,B # Set BO point to next panel Bj - .align 5 + .align 5 .L0_N2: andi N,NCO,2 # nr = 2 - beqz N,.L0_N1 + beqz N,.L0_N1 nop .L0_N2_Lb: - move CO1,C + move CO1,C daddu CO2,C,LDC - dsra M,MCO,2 + dsra M,MCO,2 move A,AO # Reset A daddu PREA,AO,SPANA @@ -1288,13 +1288,13 @@ beqz M,.L12_M2 nop -.L40: +.L40: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO # Reset B #else dsll K,KK, 2 + BASE_SHIFT - dsll TEMP, KK,1 + BASE_SHIFT + dsll TEMP, KK,1 + BASE_SHIFT daddu A,A,K daddu B,BO,TEMP @@ -1311,10 +1311,10 @@ MOV t41,t11 LD a2,2*SIZE(A) LD a3,3*SIZE(A) - + MOV t12,t11 MOV t22,t11 - + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP,KCO,KK #elif defined(LEFT) @@ -1322,7 +1322,7 @@ #else daddiu TEMP, KK, 2 #endif - dsra K,TEMP,2 + dsra K,TEMP,2 MOV t32,t11 beqz K,.L45 MOV t42,t11 @@ -1342,10 +1342,10 @@ LD a2,2*SIZE(A) dsra K,KCO,2 # K=KCO/2 LD a3,3*SIZE(A) - + MOV t12,t11 MOV t22,t11 - + MOV t32,t11 beqz K,.L45 MOV t42,t11 @@ -1411,9 +1411,9 @@ FETCH $0,8*SIZE(PREA) MADD t32,t32,a2,b3 MADD t42,t42,a3,b3 - + daddu A,A,16*SIZE # 4mr*4kr - daddu B,B,8*SIZE # 2nr*4kr + daddu B,B,8*SIZE # 2nr*4kr .L44: MADD t11,t11,a4,b6 @@ -1443,14 +1443,14 @@ .L45: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L48 nop -.L46: +.L46: MADD t11,t11,a0,b0 LD a4,4*SIZE(A) MADD t21,t21,a1,b0 @@ -1469,7 +1469,7 @@ FETCH $0,0(PREA) MADD t32,t32,a2,b1 daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32 - + MADD t42,t42,a3,b1 daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE @@ -1495,16 +1495,16 @@ daddu PREA,PREA,8*SIZE - + .L48: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L49 + beqz K,.L49 LD ALPHA,152($sp) # Get ALPHA - + FETCH $0,0(PREA) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 @@ -1524,7 +1524,7 @@ .L49: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # gemm write back part Fetch 16 C - LD c21,1*SIZE(CO1) + LD c21,1*SIZE(CO1) LD c31,2*SIZE(CO1) LD c41,3*SIZE(CO1) @@ -1545,7 +1545,7 @@ MADD t32,c32,t32,ALPHA ST t41,3*SIZE(CO1) MADD t42,c42,t42,ALPHA - daddiu M,M,-1 + daddiu M,M,-1 ST t12,0(CO2) ST t22,1*SIZE(CO2) @@ -1557,8 +1557,8 @@ FETCH $0,8*SIZE(CO1) FETCH $0,8*SIZE(CO2) - daddu CO1,CO1,4*SIZE - bnez M,.L40 + daddu CO1,CO1,4*SIZE + bnez M,.L40 daddu CO2,CO2,4*SIZE #else @@ -1566,7 +1566,7 @@ MUL t21, ALPHA, t21 MUL t31, ALPHA, t31 MUL t41, ALPHA, t41 - + MUL t12, ALPHA, t12 ST t11, 0 * SIZE(CO1) MUL t22, ALPHA, t22 @@ -1575,13 +1575,13 @@ ST t31, 2 * SIZE(CO1) MUL t42, ALPHA, t42 ST t41, 3 * SIZE(CO1) - + ST t12, 0 * SIZE(CO2) daddiu M,M,-1 ST t22, 1 * SIZE(CO2) ST t32, 2 * SIZE(CO2) ST t42, 3 * SIZE(CO2) - + daddiu CO1,CO1, 4*SIZE daddiu CO2,CO2, 4*SIZE @@ -1615,7 +1615,7 @@ .align 3 .L12_M2: andi M,MCO,2 # mr = 2 - beqz M,.L12_M1 + beqz M,.L12_M1 nop .L50: @@ -1636,7 +1636,7 @@ LD b0,0*SIZE(B) MOV t21,t11 LD b1,1*SIZE(B) - + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) @@ -1644,7 +1644,7 @@ #else daddiu TEMP, KK, 2 #endif - dsra K,TEMP,2 + dsra K,TEMP,2 MOV t12,t11 beqz K,.L55 MOV t22,t11 @@ -1659,7 +1659,7 @@ LD b0,0*SIZE(B) MOV t21,t11 LD b1,1*SIZE(B) - + MOV t12,t11 beqz K,.L55 MOV t22,t11 @@ -1715,14 +1715,14 @@ .L55: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L58 nop -.L56: +.L56: MADD t11,t11,a0,b0 LD a4,2*SIZE(A) MADD t21,t21,a1,b0 @@ -1752,9 +1752,9 @@ #else andi K,TEMP, 1 #endif - beqz K,.L59 + beqz K,.L59 LD ALPHA,152($sp) # Get ALPHA - + MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 @@ -1767,10 +1767,10 @@ .L59: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # write gemm part back Fetch 16 C - LD c21,1*SIZE(CO1) + LD c21,1*SIZE(CO1) LD c12,0(CO2) LD c22,1*SIZE(CO2) - + MADD t11,c11,t11,ALPHA MADD t21,c21,t21,ALPHA MADD t12,c12,t12,ALPHA @@ -1781,7 +1781,7 @@ ST t12,0(CO2) ST t22,1*SIZE(CO2) - daddu CO1,CO1,2*SIZE + daddu CO1,CO1,2*SIZE daddu CO2,CO2,2*SIZE FETCH $0,0(CO1) @@ -1827,7 +1827,7 @@ .align 3 .L12_M1: andi M,MCO,1 # mr = 1 - beqz M,.L0_N2_Loop + beqz M,.L0_N2_Loop nop .L60: @@ -1842,7 +1842,7 @@ daddu B, BO, TEMP #endif LD a0,0*SIZE(A) - + MTC $0,t11 MOV t21,t11 LD b0,0*SIZE(B) @@ -1857,16 +1857,16 @@ #else daddiu TEMP, KK, 2 #endif - dsra K,TEMP,2 + dsra K,TEMP,2 MOV t22,t11 beqz K,.L65 nop #else - dsra K,KCO,2 + dsra K,KCO,2 move B,BO # Reset B LD a0,0*SIZE(A) - + MTC $0,t11 MOV t21,t11 LD b0,0*SIZE(B) @@ -1878,18 +1878,18 @@ #endif -.L61: # nr=2,mr=1,kr=4 +.L61: # nr=2,mr=1,kr=4 LD a4, 1*SIZE(A) # a2 LD b4, 2*SIZE(B) MADD t11,t11,a0,b0 - + LD b5,3*SIZE(B) MADD t12,t12,a0,b1 LD a2, 2*SIZE(A) # a3 LD b2,4*SIZE(B) MADD t11,t11,a4,b4 - + LD b3,5*SIZE(B) MADD t12,t12,a4,b5 @@ -1897,17 +1897,17 @@ daddiu K,K,-1 LD b6,6*SIZE(B) MADD t11,t11,a2,b2 - + LD b7,7*SIZE(B) MADD t12,t12,a2,b3 daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 LD a0, 0*SIZE(A) daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE - - LD b0,0*SIZE(B) + + LD b0,0*SIZE(B) MADD t11,t11,a6,b6 - + LD b1,1*SIZE(B) bnez K,.L61 MADD t12,t12,a6,b7 @@ -1916,19 +1916,19 @@ .L65: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L68 nop -.L66: +.L66: LD a4, 1*SIZE(A) # a1 MADD t11,t11,a0,b0 LD b4,2*SIZE(B) daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16 - + LD b5,3*SIZE(B) MADD t12,t12,a0,b1 daddu B,B,4*SIZE @@ -1937,7 +1937,7 @@ LD a0,0(A) # a0 LD b0,0*SIZE(B) MADD t11,t11,a4,b4 - + LD b1,1*SIZE(B) MADD t12,t12,a4,b5 @@ -1948,9 +1948,9 @@ #else andi K,TEMP,1 #endif - beqz K,.L69 + beqz K,.L69 LD ALPHA,152($sp) # Get ALPHA - + MADD t11,t11,a0,b0 MADD t12,t12,a0,b1 daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=16 @@ -1961,14 +1961,14 @@ #ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C LD c12,0(CO2) - + MADD t11,c11,t11,ALPHA MADD t12,c12,t12,ALPHA ST t11,0(CO1) ST t12,0(CO2) - daddu CO1,CO1,1*SIZE + daddu CO1,CO1,1*SIZE daddu CO2,CO2,1*SIZE #else @@ -1978,7 +1978,7 @@ ST t11, 0 * SIZE(CO1) ST t12, 0 * SIZE(CO2) - daddu CO1,CO1,1*SIZE + daddu CO1,CO1,1*SIZE daddu CO2,CO2,1*SIZE #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -2008,15 +2008,15 @@ move BO, B - .align 5 + .align 5 .L0_N1: andi N,NCO,1 # nr = 1 - beqz N,.L999 + beqz N,.L999 nop - move CO1,C - dsra M,MCO,2 - + move CO1,C + dsra M,MCO,2 + move A,AO # Reset A daddu PREA,AO,SPANA #if defined(TRMMKERNEL) && defined(LEFT) @@ -2026,7 +2026,7 @@ beqz M,.L11_M2 daddu C,CO1,LDC -.L70: +.L70: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B, BO # Reset B @@ -2038,12 +2038,12 @@ daddu B, BO, TEMP #endif LD b0, 0*SIZE(B) - + MTC $0,t11 LD a0,0*SIZE(A) MOV t21,t11 LD a1,1*SIZE(A) - + MOV t31,t11 LD a2,2*SIZE(A) MOV t41,t11 @@ -2057,19 +2057,19 @@ #else daddiu TEMP, KK, 1 #endif - dsra K,TEMP,2 + dsra K,TEMP,2 beqz K,.L75 nop #else move B, BO # Reset B - dsra K,KCO,2 + dsra K,KCO,2 LD b0, 0*SIZE(B) - + MTC $0,t11 LD a0,0*SIZE(A) MOV t21,t11 LD a1,1*SIZE(A) - + MOV t31,t11 LD a2,2*SIZE(A) MOV t41,t11 @@ -2081,7 +2081,7 @@ .L71: # nr=1,mr=kr=4 LD b4, 1*SIZE(B) # b1 MADD t11,t11,a0,b0 - + LD a4, 4*SIZE(A) MADD t21,t21,a1,b0 @@ -2097,7 +2097,7 @@ .L72: LD b2, 2*SIZE(B) # b2 MADD t11,t11,a4,b4 - + LD a0,8*SIZE(A) MADD t21,t21,a5,b4 @@ -2106,17 +2106,17 @@ LD a2,10*SIZE(A) MADD t31,t31,a6,b4 - + LD a3,11*SIZE(A) MADD t41,t41,a7,b4 .L73: LD b6, 3*SIZE(B) MADD t11,t11,a0,b2 - + LD a4,12*SIZE(A) daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 - + LD a5,13*SIZE(A) MADD t21,t21,a1,b2 @@ -2131,7 +2131,7 @@ .L74: LD b0, 0*SIZE(B) MADD t11,t11,a4,b6 - + LD a0,0*SIZE(A) daddu PREA,PREA,16*SIZE @@ -2150,20 +2150,20 @@ .L75: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L78 nop -.L76: +.L76: LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 - + LD a4,4*SIZE(A) daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=32 - + LD a5,5*SIZE(A) MADD t21,t21,a1,b0 FETCH $0,0(PREA) @@ -2193,16 +2193,16 @@ daddu PREA,PREA,8*SIZE - + .L78: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L79 + beqz K,.L79 LD ALPHA,152($sp) # Get ALPHA - + FETCH $0,0(PREA) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 @@ -2217,7 +2217,7 @@ .L79: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C - LD c21,1*SIZE(CO1) + LD c21,1*SIZE(CO1) LD c31,2*SIZE(CO1) LD c41,3*SIZE(CO1) @@ -2252,7 +2252,7 @@ FETCH $0,4*SIZE(CO1) FETCH $0,8*SIZE(CO1) - daddu CO1,CO1,4*SIZE + daddu CO1,CO1,4*SIZE #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT @@ -2271,7 +2271,7 @@ #ifdef LEFT daddiu KK, KK, 4 #endif - bnez M,.L70 + bnez M,.L70 nop #endif @@ -2279,10 +2279,10 @@ .align 3 .L11_M2: andi M,MCO,2 # mr = 2 - beqz M,.L11_M1 + beqz M,.L11_M1 nop -.L80: +.L80: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B, BO @@ -2312,13 +2312,13 @@ nop #else move B, BO - dsra K,KCO,2 + dsra K,KCO,2 LD b0, 0*SIZE(B) MTC $0,t11 MOV t21,t11 LD a0,0*SIZE(A) - + beqz K,.L85 LD a1,1*SIZE(A) @@ -2336,7 +2336,7 @@ MADD t11,t11,a4,b4 LD a3,5*SIZE(A) MADD t21,t21,a5,b4 - + LD b6, 3*SIZE(B) LD a6,6*SIZE(A) MADD t11,t11,a2,b2 @@ -2358,23 +2358,23 @@ .L85: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L88 nop -.L86: +.L86: LD b4, 1*SIZE(B) LD a4,2*SIZE(A) MADD t11,t11,a0,b0 LD a5,3*SIZE(A) MADD t21,t21,a1,b0 - + daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 - + LD b0,0(B) LD a0,0*SIZE(A) MADD t11,t11,a4,b4 @@ -2382,16 +2382,16 @@ MADD t21,t21,a5,b4 - + .L88: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L89 + beqz K,.L89 LD ALPHA,152($sp) # Get ALPHA - + MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 @@ -2401,7 +2401,7 @@ .L89: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C - LD c21,1*SIZE(CO1) + LD c21,1*SIZE(CO1) MADD t11,c11,t11,ALPHA MADD t21,c21,t21,ALPHA @@ -2410,7 +2410,7 @@ ST t21,1*SIZE(CO1) FETCH $0,2*SIZE(CO1) - + daddu CO1,CO1,2*SIZE # COx += 2*8Byte #else @@ -2445,10 +2445,10 @@ .align 3 .L11_M1: andi M,MCO,1 # mr = 1 - beqz M,.L999 + beqz M,.L999 nop -.L90: +.L90: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B, BO @@ -2478,7 +2478,7 @@ move B, BO LD a0, 0*SIZE(A) LD b0, 0*SIZE(B) - dsra K,KCO,2 + dsra K,KCO,2 beqz K,.L95 MTC $0,t11 #endif @@ -2487,7 +2487,7 @@ LD a4, 1*SIZE(A) LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 - + LD a2, 2*SIZE(A) LD b2, 2*SIZE(B) MADD t11,t11,a4,b4 @@ -2495,28 +2495,28 @@ LD a6, 3*SIZE(A) LD b6, 3*SIZE(B) MADD t11,t11,a2,b2 - + daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 LD a0, 0*SIZE(A) LD b0, 0*SIZE(B) MADD t11,t11,a6,b6 - + daddiu K,K,-1 bnez K,.L91 nop .L95: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L98 nop -.L96: +.L96: LD a4, 1*SIZE(A) LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 @@ -2526,14 +2526,14 @@ LD b0,0(B) LD a0,0(A) MADD t11,t11,a4,b4 - + .L98: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L99 + beqz K,.L99 LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 diff --git a/kernel/mips64/dnrm2.S b/kernel/mips64/dnrm2.S index 595eb9620..a095e0541 100644 --- a/kernel/mips64/dnrm2.S +++ b/kernel/mips64/dnrm2.S @@ -43,7 +43,7 @@ #define X $5 #define INCX $6 #define XX $7 - + #define I $2 #define TEMP $3 @@ -71,7 +71,7 @@ PROLOGUE - + #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) diff --git a/kernel/mips64/dot.S b/kernel/mips64/dot.S index 6220b6ac9..cb6fbe99c 100644 --- a/kernel/mips64/dot.S +++ b/kernel/mips64/dot.S @@ -44,7 +44,7 @@ #define INCX $6 #define Y $7 #define INCY $8 - + #define I $2 #define TEMP $3 @@ -61,7 +61,7 @@ #define s2 $f1 PROLOGUE - + #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) @@ -306,5 +306,5 @@ #endif j $31 NOP - + EPILOGUE diff --git a/kernel/mips64/gemm_beta.S b/kernel/mips64/gemm_beta.S index 2e0b24171..648d1b831 100644 --- a/kernel/mips64/gemm_beta.S +++ b/kernel/mips64/gemm_beta.S @@ -62,7 +62,7 @@ #define ALPHA $f15 PROLOGUE - + LDARG C, 0($sp) MTC $0, FZERO LDARG LDC, 8($sp) diff --git a/kernel/mips64/gemm_kernel.S b/kernel/mips64/gemm_kernel.S index 8ee32d529..14057118a 100644 --- a/kernel/mips64/gemm_kernel.S +++ b/kernel/mips64/gemm_kernel.S @@ -55,7 +55,7 @@ #define L $7 #define PREFETCHSIZE (4 * 10) - + #define CO1 $14 #define CO2 $15 #define CO3 $16 @@ -109,7 +109,7 @@ #define ALPHA $f15 PROLOGUE - + daddiu $sp, $sp, -160 SDARG $16, 0($sp) @@ -1028,7 +1028,7 @@ bgtz J, .L10 move B, BO .align 3 - + .L30: andi J, N, 4 blez J, .L50 diff --git a/kernel/mips64/gemv_n.S b/kernel/mips64/gemv_n.S index 908f97347..dd0b6066c 100644 --- a/kernel/mips64/gemv_n.S +++ b/kernel/mips64/gemv_n.S @@ -89,7 +89,7 @@ PROLOGUE - + LDARG Y, 0($sp) LDARG INCY, 8($sp) LDARG BUFFER, 16($sp) @@ -109,7 +109,7 @@ sdc1 $f21, 24($sp) sdc1 $f22, 32($sp) #endif - + blez M, .L999 dsll INCX, INCX, BASE_SHIFT diff --git a/kernel/mips64/gemv_n_loongson3a.c b/kernel/mips64/gemv_n_loongson3a.c index 7db595449..d06b58f97 100644 --- a/kernel/mips64/gemv_n_loongson3a.c +++ b/kernel/mips64/gemv_n_loongson3a.c @@ -1,6 +1,6 @@ #include "common.h" -//These are auto-tuning codes on Loongson-3A platform. +//These are auto-tuning codes on Loongson-3A platform. //#define prefetch(x) __builtin_prefetch(x) //#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0) @@ -13,7 +13,7 @@ #define norm_loop_alpha1 do {Y[h] += A[LDA * j + i] * X[k]; i++; h += INCY;} while(0) #define norm_loop do {Y[h] += ALPHA * A[LDA * j + i] * X[k]; i++; h += INCY;} while(0) -int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) { BLASLONG kx=0, ky=0; diff --git a/kernel/mips64/gemv_t.S b/kernel/mips64/gemv_t.S index 2808756d4..ae00feaf7 100644 --- a/kernel/mips64/gemv_t.S +++ b/kernel/mips64/gemv_t.S @@ -85,7 +85,7 @@ #define x8 $f20 PROLOGUE - + LDARG Y, 0($sp) LDARG INCY, 8($sp) LDARG BUFFER, 16($sp) @@ -104,7 +104,7 @@ #ifndef __64BIT__ sdc1 $f20, 16($sp) #endif - + blez M, .L999 dsll INCX, INCX, BASE_SHIFT @@ -353,9 +353,9 @@ .L19: LD a1, 0 * SIZE(Y) - daddu Y, Y, INCY + daddu Y, Y, INCY LD a2, 0 * SIZE(Y) - daddu Y, Y, INCY + daddu Y, Y, INCY MADD a1, a1, ALPHA, y1 daddiu J, J, -1 @@ -363,11 +363,11 @@ MTC $0, y1 ST a1, 0 * SIZE(YY) - daddu YY, YY, INCY + daddu YY, YY, INCY ST a2, 0 * SIZE(YY) bgtz J, .L11 - daddu YY, YY, INCY + daddu YY, YY, INCY .align 3 .L20: @@ -504,13 +504,13 @@ .L29: LD a1, 0 * SIZE(Y) - daddu Y, Y, INCY + daddu Y, Y, INCY MADD a1, a1, ALPHA, y1 NOP ST a1, 0 * SIZE(YY) - daddu YY, YY, INCY + daddu YY, YY, INCY .align 3 .L999: diff --git a/kernel/mips64/gemv_t_loongson3a.c b/kernel/mips64/gemv_t_loongson3a.c index 51f035d8e..a6b4154f6 100644 --- a/kernel/mips64/gemv_t_loongson3a.c +++ b/kernel/mips64/gemv_t_loongson3a.c @@ -1,6 +1,6 @@ #include "common.h" -//These are auto-tuning codes on Loongson-3A platform. +//These are auto-tuning codes on Loongson-3A platform. //#define prefetch(x) __builtin_prefetch(x) //#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0) diff --git a/kernel/mips64/iamax.S b/kernel/mips64/iamax.S index ff6c2157e..61e3514f4 100644 --- a/kernel/mips64/iamax.S +++ b/kernel/mips64/iamax.S @@ -42,7 +42,7 @@ #define N $4 #define X $5 #define INCX $6 - + #define I $3 #define TEMP $7 @@ -69,9 +69,9 @@ #define x2 $8 #define x3 $9 #define x4 $10 - + PROLOGUE - + #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) diff --git a/kernel/mips64/iamin.S b/kernel/mips64/iamin.S index 131aa881b..ff05b9981 100644 --- a/kernel/mips64/iamin.S +++ b/kernel/mips64/iamin.S @@ -42,7 +42,7 @@ #define N $4 #define X $5 #define INCX $6 - + #define I $3 #define TEMP $7 @@ -69,9 +69,9 @@ #define x2 $8 #define x3 $9 #define x4 $10 - + PROLOGUE - + #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) diff --git a/kernel/mips64/imax.S b/kernel/mips64/imax.S index ec9d3fcdf..e0d358bca 100644 --- a/kernel/mips64/imax.S +++ b/kernel/mips64/imax.S @@ -42,7 +42,7 @@ #define N $4 #define X $5 #define INCX $6 - + #define I $3 #define TEMP $7 @@ -69,9 +69,9 @@ #define x2 $8 #define x3 $9 #define x4 $10 - + PROLOGUE - + #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) diff --git a/kernel/mips64/imin.S b/kernel/mips64/imin.S index a247c833c..b41f7661c 100644 --- a/kernel/mips64/imin.S +++ b/kernel/mips64/imin.S @@ -42,7 +42,7 @@ #define N $4 #define X $5 #define INCX $6 - + #define I $3 #define TEMP $7 @@ -69,9 +69,9 @@ #define x2 $8 #define x3 $9 #define x4 $10 - + PROLOGUE - + #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) diff --git a/kernel/mips64/izamax.S b/kernel/mips64/izamax.S index 12e26c9e1..c7c8a5b52 100644 --- a/kernel/mips64/izamax.S +++ b/kernel/mips64/izamax.S @@ -42,7 +42,7 @@ #define N $4 #define X $5 #define INCX $6 - + #define I $3 #define TEMP $7 @@ -73,9 +73,9 @@ #define x2 $8 #define x3 $9 #define x4 $10 - + PROLOGUE - + #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) @@ -92,7 +92,7 @@ FABS t1, a1 FABS t2, a2 - + ADD s1, t1, t2 ADD s2, t1, t2 ADD s3, t1, t2 diff --git a/kernel/mips64/izamin.S b/kernel/mips64/izamin.S index af3d75056..e65ac8543 100644 --- a/kernel/mips64/izamin.S +++ b/kernel/mips64/izamin.S @@ -42,7 +42,7 @@ #define N $4 #define X $5 #define INCX $6 - + #define I $3 #define TEMP $7 @@ -73,9 +73,9 @@ #define x2 $8 #define x3 $9 #define x4 $10 - + PROLOGUE - + #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) @@ -92,7 +92,7 @@ FABS t1, a1 FABS t2, a2 - + ADD s1, t1, t2 ADD s2, t1, t2 ADD s3, t1, t2 diff --git a/kernel/mips64/max.S b/kernel/mips64/max.S index a432f1225..0616c92ce 100644 --- a/kernel/mips64/max.S +++ b/kernel/mips64/max.S @@ -42,7 +42,7 @@ #define N $4 #define X $5 #define INCX $6 - + #define I $2 #define TEMP $3 @@ -61,7 +61,7 @@ #define s4 $f3 PROLOGUE - + #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) diff --git a/kernel/mips64/min.S b/kernel/mips64/min.S index 33cfc81f3..cf2e24b42 100644 --- a/kernel/mips64/min.S +++ b/kernel/mips64/min.S @@ -42,7 +42,7 @@ #define N $4 #define X $5 #define INCX $6 - + #define I $2 #define TEMP $3 @@ -61,7 +61,7 @@ #define s4 $f3 PROLOGUE - + #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) diff --git a/kernel/mips64/rot.S b/kernel/mips64/rot.S index b94a59c98..c72d3812e 100644 --- a/kernel/mips64/rot.S +++ b/kernel/mips64/rot.S @@ -44,7 +44,7 @@ #define INCX $6 #define Y $7 #define INCY $8 - + #define XX $9 #define YY $10 @@ -70,7 +70,7 @@ #define t4 $f3 PROLOGUE - + dsll INCX, INCX, BASE_SHIFT li TEMP, SIZE diff --git a/kernel/mips64/scal.S b/kernel/mips64/scal.S index f544914d5..b28b8a309 100644 --- a/kernel/mips64/scal.S +++ b/kernel/mips64/scal.S @@ -66,7 +66,7 @@ #define t4 $f11 PROLOGUE - + li TEMP, SIZE MTC $0, a1 @@ -166,7 +166,7 @@ NOP .align 3 -.L50: +.L50: bne INCX, TEMP, .L60 dsra I, N, 3 @@ -397,7 +397,7 @@ LD a1, 0 * SIZE(X) MUL t1, ALPHA, a1 - + daddiu I, I, -1 ST t1, 0 * SIZE(X) diff --git a/kernel/mips64/sgemm_kernel_8x4_ps.S b/kernel/mips64/sgemm_kernel_8x4_ps.S index bc81d0eb5..37b20a880 100644 --- a/kernel/mips64/sgemm_kernel_8x4_ps.S +++ b/kernel/mips64/sgemm_kernel_8x4_ps.S @@ -80,7 +80,7 @@ #define F27 27 #define F26 26 #define F25 25 -#define F24 24 +#define F24 24 #define F23 23 #define F22 22 #define F21 21 @@ -88,7 +88,7 @@ #define F19 19 #define F18 18 #define F17 17 -#define F16 16 +#define F16 16 #define F15 15 #define F14 14 #define F13 13 @@ -100,10 +100,10 @@ #define F7 7 #define F6 6 #define F5 5 -#define F4 4 -#define F3 3 -#define F2 2 -#define F1 1 +#define F4 4 +#define F3 3 +#define F2 2 +#define F1 1 #define F0 0 #define R12 12 @@ -132,7 +132,7 @@ # .fmask 0x00000000,0 # .set noreorder # .set nomacro - + PROLOGUE @@ -213,12 +213,12 @@ daddu BO, B, TEMP #endif MTC $0, C11 # CLEAR REAULTS REGISTERS - MOV C12, C11 + MOV C12, C11 dsll PREB, K, BASE_SHIFT MOV C21, C11 MOV C22, C11 - + MOV C31, C11 MOV C32, C11 gsLQC1(R13, F9, F8, 0) # B1 B2 @@ -235,13 +235,13 @@ FETCH $0, 0 * SIZE(CO1) MOV C24, C11 FETCH $0, 4 * SIZE(CO1) - + MOV C33, C11 FETCH $0, 0 * SIZE(CO2) MOV C34, C11 FETCH $0, 4 * SIZE(CO2) - - daddu PREB, B, PREB + + daddu PREB, B, PREB MOV C43, C11 FETCH $0, 0 * SIZE(CO3) @@ -271,12 +271,12 @@ dsra L, K, 6 # UnRoll K=64 MTC $0, C11 # CLEAR REAULTS REGISTERS - MOV C12, C11 + MOV C12, C11 dsll PREB, K, BASE_SHIFT MOV C21, C11 MOV C22, C11 - + MOV C31, C11 MOV C32, C11 gsLQC1(R13, F9, F8, 0) # B1 B2 @@ -293,13 +293,13 @@ FETCH $0, 0 * SIZE(CO1) MOV C24, C11 FETCH $0, 4 * SIZE(CO1) - + MOV C33, C11 FETCH $0, 0 * SIZE(CO2) MOV C34, C11 FETCH $0, 4 * SIZE(CO2) - - daddu PREB, B, PREB + + daddu PREB, B, PREB MOV C43, C11 FETCH $0, 0 * SIZE(CO3) @@ -435,7 +435,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -571,7 +571,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -707,7 +707,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -843,7 +843,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -979,7 +979,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -1115,7 +1115,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -1251,7 +1251,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -1387,7 +1387,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -1523,7 +1523,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -1659,7 +1659,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -1795,7 +1795,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -1931,7 +1931,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -2067,7 +2067,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -2203,7 +2203,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -2339,7 +2339,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -2475,7 +2475,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -2622,7 +2622,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -2758,7 +2758,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -2894,7 +2894,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -3030,7 +3030,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -3166,7 +3166,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -3302,7 +3302,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -3438,7 +3438,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -3574,7 +3574,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -3721,7 +3721,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -3857,7 +3857,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -3993,7 +3993,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -4129,7 +4129,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -4148,7 +4148,7 @@ .align 4 -.L484: +.L484: #ifndef TRMMKERNEL andi L, K, 8 #else @@ -4276,7 +4276,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -4412,7 +4412,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -4559,7 +4559,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -4640,7 +4640,7 @@ MADPS C13, C13, A5, B7 MADPS C23, C23, A6, B7 - daddiu PREB, PREB, 8 * SIZE + daddiu PREB, PREB, 8 * SIZE MADPS C33, C33, A7, B7 MADPS C43, C43, A8, B7 @@ -4721,7 +4721,7 @@ CVTU A8, C41 # A8=C41.upper=c28 LD B6, 5 * SIZE(CO2) - MADD A1, B1, A1, ALPHA # c12 + MADD A1, B1, A1, ALPHA # c12 LD B7, 7 * SIZE(CO1) MADD A2, B2, A2, ALPHA # c22 @@ -4732,7 +4732,7 @@ MADD A4, B4, A4, ALPHA # c24 LD B3, 0 * SIZE(CO2) - + MADD A5, B5, A5, ALPHA # c16 LD B4, 2 * SIZE(CO1) @@ -4759,7 +4759,7 @@ MADD C31, B6, C31, ALPHA # c16 LD A2, 6 * SIZE(CO2) - + MADD C33, B7, C33, ALPHA # c26 ST A4, 3 * SIZE(CO2) @@ -4773,7 +4773,7 @@ MADD C43, A2, C43, ALPHA # c28 ST C13, 0 * SIZE(CO2) - + ST C21, 2 * SIZE(CO1) ST C23, 2 * SIZE(CO2) ST C31, 4 * SIZE(CO1) @@ -4801,58 +4801,58 @@ CVTU A7, C44 # B7=C42.upper=c48 LD B6, 5 * SIZE(CO4) - CVTU A8, C42 # A1=C44.upper=c38 + CVTU A8, C42 # A1=C44.upper=c38 LD B7, 7 * SIZE(CO3) MADD A1, B1, A1, ALPHA # c31 LD C11, 7 * SIZE(CO4) - MADD A2, B2, A2, ALPHA + MADD A2, B2, A2, ALPHA LD C13, 0 * SIZE(CO3) - - MADD A3, B3, A3, ALPHA + + MADD A3, B3, A3, ALPHA LD C21, 0 * SIZE(CO4) - - MADD A4, B4, A4, ALPHA + + MADD A4, B4, A4, ALPHA LD C23, 2 * SIZE(CO3) - MADD A5, B5, A5, ALPHA + MADD A5, B5, A5, ALPHA LD C31, 2 * SIZE(CO4) - - MADD A6, B6, A6, ALPHA + + MADD A6, B6, A6, ALPHA LD C33, 4 * SIZE(CO3) - - MADD A7, B7, A7, ALPHA + + MADD A7, B7, A7, ALPHA LD C41, 4 * SIZE(CO4) - - MADD A8, C11, A8, ALPHA + + MADD A8, C11, A8, ALPHA ST A1, 1 * SIZE(CO3) - MADD C12, C13, C12, ALPHA + MADD C12, C13, C12, ALPHA LD C43, 6 * SIZE(CO3) - - MADD C14, C21, C14, ALPHA + + MADD C14, C21, C14, ALPHA ST A2, 1 * SIZE(CO4) - MADD C22, C23, C22, ALPHA + MADD C22, C23, C22, ALPHA LD B1, 6 * SIZE(CO4) - - MADD C24, C31, C24, ALPHA + + MADD C24, C31, C24, ALPHA ST A3, 3 * SIZE(CO3) - MADD C32, C33, C32, ALPHA + MADD C32, C33, C32, ALPHA ST A4, 3 * SIZE(CO4) - MADD C34, C41, C34, ALPHA + MADD C34, C41, C34, ALPHA ST A5, 5 * SIZE(CO3) - MADD C42, C43, C42, ALPHA + MADD C42, C43, C42, ALPHA ST A6, 5 * SIZE(CO4) ST A7, 7 * SIZE(CO3) NOP - MADD C44, B1, C44, ALPHA + MADD C44, B1, C44, ALPHA ST A8, 7 * SIZE(CO4) ST C12, 0 * SIZE(CO3) @@ -4880,7 +4880,7 @@ CVTU A7, C43 # A7=C43.upper=c18 CVTU A8, C41 # A8=C41.upper=c28 - MUL A1, A1, ALPHA # c12 + MUL A1, A1, ALPHA # c12 MUL A2, A2, ALPHA # c22 MUL A3, A3, ALPHA # c14 MUL A4, A4, ALPHA # c24 @@ -4903,7 +4903,7 @@ MUL C31, C31, ALPHA # c16 ST A5, 5 * SIZE(CO1) - + MUL C33, C33, ALPHA # c26 ST A6, 5 * SIZE(CO2) @@ -4921,7 +4921,7 @@ CVTU A3, C24 # B3=C22.upper=c44 ST C21, 2 * SIZE(CO1) - + CVTU A4, C22 # B4=C24.upper=c34 ST C23, 2 * SIZE(CO2) @@ -4934,40 +4934,40 @@ CVTU A7, C44 # B7=C42.upper=c48 ST C41, 6 * SIZE(CO1) - CVTU A8, C42 # A1=C44.upper=c38 + CVTU A8, C42 # A1=C44.upper=c38 ST C43, 6 * SIZE(CO2) MUL A1, A1, ALPHA # c31 - MUL A2, A2, ALPHA - MUL A3, A3, ALPHA - MUL A4, A4, ALPHA - MUL A5, A5, ALPHA - MUL A6, A6, ALPHA - MUL A7, A7, ALPHA - MUL A8, A8, ALPHA - - MUL C12, C12, ALPHA + MUL A2, A2, ALPHA + MUL A3, A3, ALPHA + MUL A4, A4, ALPHA + MUL A5, A5, ALPHA + MUL A6, A6, ALPHA + MUL A7, A7, ALPHA + MUL A8, A8, ALPHA + + MUL C12, C12, ALPHA ST A1, 1 * SIZE(CO3) - MUL C14, C14, ALPHA + MUL C14, C14, ALPHA ST A2, 1 * SIZE(CO4) - MUL C22, C22, ALPHA + MUL C22, C22, ALPHA ST A3, 3 * SIZE(CO3) - MUL C24, C24, ALPHA + MUL C24, C24, ALPHA ST A4, 3 * SIZE(CO4) - MUL C32, C32, ALPHA + MUL C32, C32, ALPHA ST A5, 5 * SIZE(CO3) - MUL C34, C34, ALPHA + MUL C34, C34, ALPHA ST A6, 5 * SIZE(CO4) - MUL C42, C42, ALPHA + MUL C42, C42, ALPHA ST A7, 7 * SIZE(CO3) - MUL C44, C44, ALPHA + MUL C44, C44, ALPHA ST A8, 7 * SIZE(CO4) ST C12, 0 * SIZE(CO3) @@ -5025,12 +5025,12 @@ daddu BO, B, TEMP #endif MTC $0, C11 # CLEAR REAULTS REGISTERS - MOV C12, C11 + MOV C12, C11 dsll PREB, K, BASE_SHIFT MOV C21, C11 MOV C22, C11 - + MOV C31, C11 MOV C32, C11 gsLQC1(R13, F9, F8, 0) # B1 B2 @@ -5045,12 +5045,12 @@ MOV C23, C11 FETCH $0, 0 * SIZE(CO1) MOV C24, C11 - + MOV C33, C11 FETCH $0, 0 * SIZE(CO2) MOV C34, C11 - - daddu PREB, B, PREB + + daddu PREB, B, PREB MOV C43, C11 FETCH $0, 0 * SIZE(CO3) @@ -5077,12 +5077,12 @@ dsra L, K, 2 # UnRoll K=4 MTC $0, C11 # CLEAR REAULTS REGISTERS - MOV C12, C11 + MOV C12, C11 dsll PREB, K, BASE_SHIFT MOV C21, C11 MOV C22, C11 - + MOV C31, C11 MOV C32, C11 gsLQC1(R13, F9, F8, 0) # B1 B2 @@ -5097,12 +5097,12 @@ MOV C23, C11 FETCH $0, 0 * SIZE(CO1) MOV C24, C11 - + MOV C33, C11 FETCH $0, 0 * SIZE(CO2) MOV C34, C11 - - daddu PREB, B, PREB + + daddu PREB, B, PREB MOV C43, C11 FETCH $0, 0 * SIZE(CO3) @@ -5114,7 +5114,7 @@ PLU B4, B2, B2 #endif -.L4410: # +.L4410: # daddiu L, L, -1 MADPS C11, C11, A1, B1 gsLQC1(R13, F13, F12, 1) # B3 B4 @@ -5196,7 +5196,7 @@ MADPS C13, C13, A7, B7 daddiu PREA, PREA, 16 * SIZE MADPS C23, C23, A8, B7 - daddiu PREB, PREB, 16 * SIZE + daddiu PREB, PREB, 16 * SIZE MADPS C14, C14, A7, B8 MADPS C24, C24, A8, B8 @@ -5303,7 +5303,7 @@ LD B4, 3 * SIZE(CO2) - MADD A1, B1, A1, ALPHA # c12 + MADD A1, B1, A1, ALPHA # c12 LD B5, 0 * SIZE(CO1) MADD A2, B2, A2, ALPHA # c22 @@ -5314,7 +5314,7 @@ MADD A4, B4, A4, ALPHA # c24 LD B1, 2 * SIZE(CO2) - + MADD C11, B5, C11, ALPHA # c12 ST A1, 1 * SIZE(CO1) @@ -5347,25 +5347,25 @@ MADD A1, B1, A1, ALPHA # c31 LD A5, 0 * SIZE(CO3) - MADD A2, B2, A2, ALPHA + MADD A2, B2, A2, ALPHA LD A6, 0 * SIZE(CO4) - - MADD A3, B3, A3, ALPHA + + MADD A3, B3, A3, ALPHA LD A7, 2 * SIZE(CO3) - - MADD A4, B4, A4, ALPHA + + MADD A4, B4, A4, ALPHA LD A8, 2 * SIZE(CO4) - MADD C12, A5, C12, ALPHA + MADD C12, A5, C12, ALPHA ST A1, 1 * SIZE(CO3) - MADD C14, A6, C14, ALPHA + MADD C14, A6, C14, ALPHA ST A2, 1 * SIZE(CO4) - MADD C22, A7, C22, ALPHA + MADD C22, A7, C22, ALPHA ST A3, 3 * SIZE(CO3) - - MADD C24, A8, C24, ALPHA + + MADD C24, A8, C24, ALPHA ST A4, 3 * SIZE(CO4) ST C12, 0 * SIZE(CO3) @@ -5384,11 +5384,11 @@ CVTU A3, C23 # A3=C23.upper=c14 CVTU A4, C21 # A4=C21.upper=c24 - MUL A1, A1, ALPHA # c12 + MUL A1, A1, ALPHA # c12 MUL A2, A2, ALPHA # c22 MUL A3, A3, ALPHA # c14 MUL A4, A4, ALPHA # c24 - + MUL C11, C11, ALPHA # c12 ST A1, 1 * SIZE(CO1) @@ -5409,25 +5409,25 @@ CVTU A7, C24 # B3=C22.upper=c44 ST C21, 2 * SIZE(CO1) - + CVTU A8, C22 # B4=C24.upper=c34 ST C23, 2 * SIZE(CO2) MUL A5, A5, ALPHA # c31 - MUL A6, A6, ALPHA - MUL A7, A7, ALPHA - MUL A8, A8, ALPHA + MUL A6, A6, ALPHA + MUL A7, A7, ALPHA + MUL A8, A8, ALPHA - MUL C12, C12, ALPHA + MUL C12, C12, ALPHA ST A5, 1 * SIZE(CO3) - MUL C14, C14, ALPHA + MUL C14, C14, ALPHA ST A6, 1 * SIZE(CO4) - MUL C22, C22, ALPHA + MUL C22, C22, ALPHA ST A7, 3 * SIZE(CO3) - - MUL C24, C24, ALPHA + + MUL C24, C24, ALPHA ST A8, 3 * SIZE(CO4) ST C12, 0 * SIZE(CO3) @@ -5478,11 +5478,11 @@ daddu BO, B, TEMP #endif MTC $0, C11 # CLEAR REAULTS REGISTERS - MOV C12, C11 + MOV C12, C11 MOV C21, C11 MOV C22, C11 - + MOV C31, C11 MOV C32, C11 gsLQC1(R13, F9, F8, 0) # B1 B2 @@ -5497,11 +5497,11 @@ MOV C23, C11 FETCH $0, 0 * SIZE(CO1) MOV C24, C11 - + MOV C33, C11 FETCH $0, 0 * SIZE(CO2) MOV C34, C11 - + MOV C43, C11 FETCH $0, 0 * SIZE(CO3) @@ -5527,11 +5527,11 @@ dsra L, K, 2 # UnRoll K=4 MTC $0, C11 # CLEAR REAULTS REGISTERS - MOV C12, C11 + MOV C12, C11 MOV C21, C11 MOV C22, C11 - + MOV C31, C11 MOV C32, C11 gsLQC1(R13, F9, F8, 0) # B1 B2 @@ -5546,11 +5546,11 @@ MOV C23, C11 FETCH $0, 0 * SIZE(CO1) MOV C24, C11 - + MOV C33, C11 FETCH $0, 0 * SIZE(CO2) MOV C34, C11 - + MOV C43, C11 FETCH $0, 0 * SIZE(CO3) @@ -5669,7 +5669,7 @@ CVTU A2, C11 # A2=C11.upper=c22 LD B2, 1 * SIZE(CO2) - MADD A1, B1, A1, ALPHA # c12 + MADD A1, B1, A1, ALPHA # c12 LD B5, 0 * SIZE(CO1) MADD A2, B2, A2, ALPHA # c22 @@ -5693,13 +5693,13 @@ MADD A1, B1, A1, ALPHA # c31 LD A5, 0 * SIZE(CO3) - MADD A2, B2, A2, ALPHA + MADD A2, B2, A2, ALPHA LD A6, 0 * SIZE(CO4) - - MADD C12, A5, C12, ALPHA + + MADD C12, A5, C12, ALPHA ST A1, 1 * SIZE(CO3) - MADD C14, A6, C14, ALPHA + MADD C14, A6, C14, ALPHA ST A2, 1 * SIZE(CO4) ST C12, 0 * SIZE(CO3) @@ -5713,7 +5713,7 @@ CVTU A1, C13 # A1=C13.upper=c12 CVTU A2, C11 # A2=C11.upper=c22 - MUL A1, A1, ALPHA # c12 + MUL A1, A1, ALPHA # c12 MUL A2, A2, ALPHA # c22 MUL C11, C11, ALPHA # c12 @@ -5725,13 +5725,13 @@ MUL A3, A3, ALPHA # c31 ST A1, 1 * SIZE(CO1) - MUL A4, A4, ALPHA + MUL A4, A4, ALPHA ST A2, 1 * SIZE(CO2) - MUL C12, C12, ALPHA + MUL C12, C12, ALPHA ST C11, 0 * SIZE(CO1) - - MUL C14, C14, ALPHA + + MUL C14, C14, ALPHA ST C13, 0 * SIZE(CO2) ST A3, 1 * SIZE(CO3) @@ -5784,7 +5784,7 @@ daddu BO, B, TEMP #endif MTC $0, C11 # CLEAR REAULTS REGISTERS - MOV C12, C11 + MOV C12, C11 LD B1, 0 * SIZE(BO) MOV C21, C11 @@ -5805,10 +5805,10 @@ MOV C23, C11 MOV C24, C11 - + MOV C33, C11 MOV C34, C11 - + MOV C43, C11 MOV C44, C11 #if (defined(LEFT) && !defined(TRANSA))||\ @@ -5827,7 +5827,7 @@ dsra L, K, 2 # UnRoll K=4 MTC $0, C11 # CLEAR REAULTS REGISTERS - MOV C12, C11 + MOV C12, C11 LD B1, 0 * SIZE(BO) MOV C21, C11 @@ -5848,10 +5848,10 @@ MOV C23, C11 MOV C24, C11 - + MOV C33, C11 MOV C34, C11 - + MOV C43, C11 blez L, .L412 MOV C44, C11 @@ -5860,7 +5860,7 @@ .L4110: daddiu L, L, -1 LD A2, 1 * SIZE(AO) - + MADD C11, C11, A1, B1 LD B5, 4 * SIZE(BO) @@ -5875,7 +5875,7 @@ LD A3, 2 * SIZE(AO) NOP - + MADD C11, C11, A2, B5 LD B1, 8 * SIZE(BO) @@ -5890,7 +5890,7 @@ LD A4, 3 * SIZE(AO) daddiu AO, AO, 4 * SIZE - + MADD C11, C11, A3, B1 LD B5, 12 * SIZE(BO) @@ -5930,7 +5930,7 @@ LD A2, 1 * SIZE(AO) daddiu AO, AO, 2 * SIZE - + MADD C11, C11, A1, B1 LD B5, 4 * SIZE(BO) @@ -5945,7 +5945,7 @@ LD A1, 0 * SIZE(AO) daddiu BO, BO, 8 * SIZE - + MADD C11, C11, A2, B5 LD B1, 0 * SIZE(BO) @@ -6046,7 +6046,7 @@ .align 4 .L2: # Nr=2 - andi J, N, 2 + andi J, N, 2 blez J, .L1 NOP @@ -6078,7 +6078,7 @@ MTC $0, C11 # CLEAR REAULTS REGISTERS LD A1, 0 * SIZE(AO) - MOV C12, C11 + MOV C12, C11 LD A2, 1 * SIZE(AO) MOV C21, C11 @@ -6107,10 +6107,10 @@ MOV C23, C11 MOV C24, C11 - + MOV C33, C11 MOV C34, C11 - + MOV C43, C11 MOV C44, C11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) @@ -6131,7 +6131,7 @@ MTC $0, C11 # CLEAR REAULTS REGISTERS LD A1, 0 * SIZE(AO) - MOV C12, C11 + MOV C12, C11 LD A2, 1 * SIZE(AO) MOV C21, C11 @@ -6160,10 +6160,10 @@ MOV C23, C11 MOV C24, C11 - + MOV C33, C11 MOV C34, C11 - + MOV C43, C11 blez L, .L282 MOV C44, C11 @@ -6293,8 +6293,8 @@ LD A8, 7 * SIZE(CO1) MADD A1, A1, C11, ALPHA - LD B1, 0 * SIZE(CO2) - + LD B1, 0 * SIZE(CO2) + MADD A2, A2, C21, ALPHA LD B2, 1 * SIZE(CO2) @@ -6439,7 +6439,7 @@ daddu BO, B, TEMP #endif MTC $0, C11 # CLEAR REAULTS REGISTERS - MOV C12, C11 + MOV C12, C11 LD A1, 0 * SIZE(AO) MOV C21, C11 @@ -6464,7 +6464,7 @@ MOV C33, C11 MOV C34, C11 - + MOV C43, C11 MOV C44, C11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) @@ -6483,7 +6483,7 @@ dsra L, K, 1 # UnRoll K=4 MTC $0, C11 # CLEAR REAULTS REGISTERS - MOV C12, C11 + MOV C12, C11 LD A1, 0 * SIZE(AO) MOV C21, C11 @@ -6508,7 +6508,7 @@ MOV C33, C11 MOV C34, C11 - + MOV C43, C11 blez L, .L242 MOV C44, C11 @@ -6593,8 +6593,8 @@ LD A4, 3 * SIZE(CO1) MADD A1, A1, C11, ALPHA - LD B1, 0 * SIZE(CO2) - + LD B1, 0 * SIZE(CO2) + MADD A2, A2, C21, ALPHA LD B2, 1 * SIZE(CO2) @@ -6687,7 +6687,7 @@ daddu BO, B, TEMP #endif MTC $0, C11 # CLEAR REAULTS REGISTERS - MOV C12, C11 + MOV C12, C11 LD A1, 0 * SIZE(AO) MOV C21, C11 @@ -6720,7 +6720,7 @@ dsra L, K, 1 # UnRoll K=4 MTC $0, C11 # CLEAR REAULTS REGISTERS - MOV C12, C11 + MOV C12, C11 LD A1, 0 * SIZE(AO) MOV C21, C11 @@ -6797,8 +6797,8 @@ LD A2, 1 * SIZE(CO1) MADD A1, A1, C11, ALPHA - LD B1, 0 * SIZE(CO2) - + LD B1, 0 * SIZE(CO2) + MADD A2, A2, C21, ALPHA LD B2, 1 * SIZE(CO2) @@ -6867,7 +6867,7 @@ #endif MTC $0, C11 # CLEAR REAULTS REGISTERS - MOV C12, C11 + MOV C12, C11 LD A1, 0 * SIZE(AO) MOV C21, C11 @@ -6899,7 +6899,7 @@ dsra L, K, 1 # UnRoll K=4 MTC $0, C11 # CLEAR REAULTS REGISTERS - MOV C12, C11 + MOV C12, C11 LD A1, 0 * SIZE(AO) MOV C21, C11 @@ -6963,8 +6963,8 @@ LD A1, 0 * SIZE(CO1) MADD A1, A1, C11, ALPHA - LD B1, 0 * SIZE(CO2) - + LD B1, 0 * SIZE(CO2) + MADD B1, B1, C12, ALPHA ST A1, 0 * SIZE(CO1) @@ -7044,7 +7044,7 @@ MTC $0, C11 # CLEAR REAULTS REGISTERS LD A1, 0 * SIZE(AO) - MOV C12, C11 + MOV C12, C11 LD A2, 1 * SIZE(AO) MOV C21, C11 @@ -7072,10 +7072,10 @@ MOV C23, C11 MOV C24, C11 - + MOV C33, C11 MOV C34, C11 - + MOV C43, C11 MOV C44, C11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) @@ -7096,7 +7096,7 @@ MTC $0, C11 # CLEAR REAULTS REGISTERS LD A1, 0 * SIZE(AO) - MOV C12, C11 + MOV C12, C11 LD A2, 1 * SIZE(AO) MOV C21, C11 @@ -7124,10 +7124,10 @@ MOV C23, C11 MOV C24, C11 - + MOV C33, C11 MOV C34, C11 - + MOV C43, C11 blez L, .L182 MOV C44, C11 @@ -7315,7 +7315,7 @@ daddu BO, B, TEMP #endif MTC $0, C11 # CLEAR REAULTS REGISTERS - MOV C12, C11 + MOV C12, C11 LD A1, 0 * SIZE(AO) MOV C21, C11 @@ -7339,7 +7339,7 @@ MOV C33, C11 MOV C34, C11 - + MOV C43, C11 MOV C44, C11 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) @@ -7358,7 +7358,7 @@ dsra L, K, 1 # UnRoll K=4 MTC $0, C11 # CLEAR REAULTS REGISTERS - MOV C12, C11 + MOV C12, C11 LD A1, 0 * SIZE(AO) MOV C21, C11 @@ -7382,7 +7382,7 @@ MOV C33, C11 MOV C34, C11 - + MOV C43, C11 blez L, .L142 MOV C44, C11 @@ -7511,7 +7511,7 @@ #endif MTC $0, C11 # CLEAR REAULTS REGISTERS - MOV C12, C11 + MOV C12, C11 LD A1, 0 * SIZE(AO) MOV C21, C11 @@ -7544,7 +7544,7 @@ dsra L, K, 1 # UnRoll K=4 MTC $0, C11 # CLEAR REAULTS REGISTERS - MOV C12, C11 + MOV C12, C11 LD A1, 0 * SIZE(AO) MOV C21, C11 @@ -7660,7 +7660,7 @@ daddu BO, B, L #endif MTC $0, C11 # CLEAR REAULTS REGISTERS - MOV C12, C11 + MOV C12, C11 LD A1, 0 * SIZE(AO) MOV C21, C11 @@ -7686,7 +7686,7 @@ dsra L, K, 1 # UnRoll K=4 MTC $0, C11 # CLEAR REAULTS REGISTERS - MOV C12, C11 + MOV C12, C11 LD A1, 0 * SIZE(AO) MOV C21, C11 @@ -7739,13 +7739,13 @@ LD A1, 0 * SIZE(C) MADD A1, A1, C11, ALPHA - + ST A1, 0 * SIZE(C) daddiu C, C, 1 * SIZE #else MUL A1, C11, ALPHA - + ST A1, 0 * SIZE(C) daddiu C, C, 1 * SIZE diff --git a/kernel/mips64/sgemm_kernel_loongson3a_4x4.S b/kernel/mips64/sgemm_kernel_loongson3a_4x4.S index 4a8c9b0e4..10c5f47de 100644 --- a/kernel/mips64/sgemm_kernel_loongson3a_4x4.S +++ b/kernel/mips64/sgemm_kernel_loongson3a_4x4.S @@ -110,7 +110,7 @@ #define F27 27 #define F26 26 #define F25 25 -#define F24 24 +#define F24 24 #define F23 23 #define F22 22 #define F21 21 @@ -118,7 +118,7 @@ #define F19 19 #define F18 18 #define F17 17 -#define F16 16 +#define F16 16 #define F15 15 #define F14 14 #define F13 13 @@ -130,14 +130,14 @@ #define F7 7 #define F6 6 #define F5 5 -#define F4 4 -#define F3 3 -#define F2 2 -#define F1 1 +#define F4 4 +#define F3 3 +#define F2 2 +#define F1 1 #define F0 0 PROLOGUE - + daddiu $sp, $sp, -160 sd $16, 0($sp) sd $17, 8($sp) @@ -160,7 +160,7 @@ ST $f23,144($sp) - .align 5 + .align 5 .L0_N4: # Loop N ST ALPHA,152($sp) # Backup ALPHA move MCO,M # Backup M @@ -170,26 +170,26 @@ move AO,A # Backup A_addr dsra N,NCO,2 # N=NCO/2 - + dsll LDC,LDC,BASE_SHIFT # LDC*8Byte dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*4nr*8Byte=KC*2^5 - + #if defined(TRMMKERNEL) - LDARG OFFSET,160($sp) # OFFSET is relate to the data part + LDARG OFFSET,160($sp) # OFFSET is relate to the data part #endif #if defined(TRMMKERNEL) && !defined(LEFT) - neg KK,OFFSET + neg KK,OFFSET #endif - + move BO,B # Backup B_addr beq N,$0,.L0_N2 # N=0,NCO<4 dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*2mr*8Byte .L0_N4_Lb: # mr=4,nr=4 - move CO1,C + move CO1,C dsra M,MCO,2 # M=MCO/2 - + move A,AO # Reset A daddu CO2,C,LDC @@ -200,7 +200,7 @@ daddu CO4,CO3,LDC #if defined(TRMMKERNEL) && defined(LEFT) - move KK,OFFSET + move KK,OFFSET #endif beqz M,.L14_M2 daddu C,CO4,LDC # move C to next panel Cj @@ -227,18 +227,18 @@ MOV t41,t11 MOV t12,t11 LD b0,0(B) - + MOV t22,t11 MOV t32,t11 LD b1,1*SIZE(B) MOV t42,t11 LD a2,2*SIZE(A) - + MOV t13,t11 MOV t23,t11 LD b2,2*SIZE(B) - + MOV t33,t11 MOV t43,t11 LD a3,3*SIZE(A) @@ -250,7 +250,7 @@ #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP,KCO,KK # temp is the length of the data part #elif defined(LEFT) - daddiu TEMP, KK, 4 # S=L,U=L + daddiu TEMP, KK, 4 # S=L,U=L #else daddiu TEMP, KK, 4 # S=R,U=U,for this two situation KK is the length of the data part #endif @@ -259,7 +259,7 @@ beqz K,.L15 MOV t44,t11 -#else +#else move B,BO # Reset B MTC $0,t11 # GEMM part NR=4,MR=4 LD a0,0(A) @@ -271,7 +271,7 @@ MOV t41,t11 MOV t12,t11 LD b0,0(B) - + MOV t22,t11 MOV t32,t11 LD b1,1*SIZE(B) @@ -279,11 +279,11 @@ MOV t42,t11 dsra K,KCO,2 # K=KCO/2 LD a2,2*SIZE(A) - + MOV t13,t11 MOV t23,t11 LD b2,2*SIZE(B) - + MOV t33,t11 MOV t43,t11 LD a3,3*SIZE(A) @@ -296,7 +296,7 @@ beqz K,.L15 MOV t44,t11 # clear 16 results registers #endif - + .align 5 .L11: # kr=4 MADD t11,t11,a0,b0 @@ -306,29 +306,29 @@ MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 LD a5,5*SIZE(A) - + MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 LD b4,4*SIZE(B) - + MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 LD b5,5*SIZE(B) FETCH $0,(PREB) - + MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 LD a6,6*SIZE(A) - + MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 LD b6,6*SIZE(B) FETCH $0,(PREA) - + MADD t33,t33,a2,b2 MADD t43,t43,a3,b2 LD a7,7*SIZE(A) - + MADD t34,t34,a2,b3 MADD t44,t44,a3,b3 LD b7,7*SIZE(B) @@ -447,14 +447,14 @@ .L15: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP, 2 #endif beqz K,.L18 nop -.L16: +.L16: MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 LD a4,4*SIZE(A) @@ -528,16 +528,16 @@ daddu PREB,PREB,8*SIZE LD b3,3*SIZE(B) - + .L18: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L19 + beqz K,.L19 LD ALPHA,152($sp) # Get ALPHA - + FETCH $0,0(PREB) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 @@ -569,8 +569,8 @@ MADD t44,t44,a3,b3 .L19: # Write Back to C -#ifndef TRMMKERNEL - LD c11,0(CO1) # GEMM write part +#ifndef TRMMKERNEL + LD c11,0(CO1) # GEMM write part LD c21,1*SIZE(CO1) # get 16 C LD c31,2*SIZE(CO1) LD c41,3*SIZE(CO1) @@ -640,11 +640,11 @@ daddu CO3,CO3,4*SIZE ST t44,3*SIZE(CO4) daddu PREB,BO,SPANB - - bnez M,.L10 + + bnez M,.L10 daddu CO4,CO4,4*SIZE -#else +#else MUL t11, ALPHA, t11 # TRMM write back part MUL t21, ALPHA, t21 MUL t31, ALPHA, t31 @@ -685,7 +685,7 @@ daddiu CO1,CO1, 4 * SIZE daddiu CO2,CO2, 4 * SIZE daddiu CO3,CO3, 4 * SIZE - daddiu CO4,CO4, 4 * SIZE + daddiu CO4,CO4, 4 * SIZE FETCH $0,4*SIZE(CO1) FETCH $0,4*SIZE(CO2) @@ -698,7 +698,7 @@ FETCH $0,0(CO4) #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - dsubu TEMP,KCO,KK + dsubu TEMP,KCO,KK #ifdef LEFT daddiu TEMP,TEMP, -4 #else @@ -710,10 +710,10 @@ daddu B,B,TEMP # mov B to the end of panel Bj #endif -#ifdef LEFT +#ifdef LEFT daddiu KK, KK,4 #endif - bnez M,.L10 + bnez M,.L10 nop #endif @@ -721,7 +721,7 @@ .align 3 .L14_M2: andi M, MCO, 2 # nr=4,mr=2 - beqz M,.L14_M1 + beqz M,.L14_M1 nop .L20: @@ -729,7 +729,7 @@ #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO # Reset B #else - dsll K,KK,1 + BASE_SHIFT # mr=2 + dsll K,KK,1 + BASE_SHIFT # mr=2 dsll TEMP,KK,2 + BASE_SHIFT # nr=4 daddu A,A,K daddu B,BO,TEMP @@ -738,7 +738,7 @@ LD a0,0*SIZE(A) MTC $0,t11 LD a1,1*SIZE(A) - + MOV t21,t11 LD b0,0*SIZE(B) MOV t12,t11 @@ -764,18 +764,18 @@ MOV t24,t11 # clear 2*4=8 results registers #else - move B,BO # Reset B + move B,BO # Reset B LD a0,0*SIZE(A) MTC $0,t11 LD a1,1*SIZE(A) - + MOV t21,t11 LD b0,0*SIZE(B) MOV t12,t11 LD b1,1*SIZE(B) MOV t22,t11 - dsra K,KCO,2 + dsra K,KCO,2 LD b2,2*SIZE(B) MOV t13,t11 @@ -806,7 +806,7 @@ MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 - + MADD t11,t11,a4,b4 LD a2,4*SIZE(A) MADD t21,t21,a5,b4 @@ -866,7 +866,7 @@ MADD t24,t24,a7,b7 -.L25: +.L25: #ifndef TRMMKERNEL andi K,KCO,2 # kr=2 #else @@ -875,7 +875,7 @@ beqz K,.L28 nop -.L26: +.L26: MADD t11,t11,a0,b0 LD a4,2*SIZE(A) MADD t21,t21,a1,b0 @@ -890,7 +890,7 @@ LD b6,6*SIZE(B) MADD t23,t23,a1,b2 LD b7,7*SIZE(B) - + MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 daddu A,A,4*SIZE # 2mr*2kr @@ -915,16 +915,16 @@ MADD t14,t14,a4,b7 MADD t24,t24,a5,b7 - -.L28: # kr=1 + +.L28: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L29 + beqz K,.L29 LD ALPHA,152($sp) # Get ALPHA - + MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,2*SIZE # 2mr*kr @@ -942,11 +942,11 @@ .L29: # Write Back to C #ifndef TRMMKERNEL LD c11,0(CO1) # GEMM write back part - LD c21,1*SIZE(CO1) + LD c21,1*SIZE(CO1) LD c12,0(CO2) LD c22,1*SIZE(CO2) - + LD c13,0(CO3) MADD t11,c11,t11,ALPHA LD c23,1*SIZE(CO3) @@ -985,25 +985,25 @@ #else MUL t11, ALPHA, t11 # TRMM write back part MUL t21, ALPHA, t21 - + ST t11, 0 * SIZE(CO1) MUL t12, ALPHA, t12 ST t21, 1 * SIZE(CO1) MUL t22, ALPHA, t22 - + ST t12, 0 * SIZE(CO2) MUL t13, ALPHA, t13 ST t22, 1 * SIZE(CO2) MUL t23, ALPHA, t23 - + ST t13, 0 * SIZE(CO3) MUL t14, ALPHA, t14 ST t23, 1 * SIZE(CO3) MUL t24, ALPHA, t24 - + ST t14, 0 * SIZE(CO4) ST t24, 1 * SIZE(CO4) - + daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE daddiu CO3,CO3, 2 * SIZE @@ -1036,7 +1036,7 @@ .align 3 .L14_M1: - andi M,MCO,1 # mr=1 + andi M,MCO,1 # mr=1 beqz M,.L0_N4_Loop # M = 0, finishing one panel Bj nop @@ -1056,13 +1056,13 @@ MTC $0,t11 LD b0,0*SIZE(B) - + MOV t12,t11 LD b1,1*SIZE(B) MOV t13,t11 LD b2,2*SIZE(B) - + MOV t14,t11 LD b3,3*SIZE(B) @@ -1077,35 +1077,35 @@ nop beqz K,.L35 nop - -#else + +#else move B,BO # Reset B, GEMM part dsra K,KCO,2 # K=KCO/2 LD a0, 0 * SIZE(A) # a0 MTC $0,t11 LD b0,0*SIZE(B) - + MOV t12,t11 LD b1,1*SIZE(B) MOV t13,t11 LD b2,2*SIZE(B) - + MOV t14,t11 beqz K,.L35 LD b3,3*SIZE(B) #endif -.L31: # nr=4,mr=1,kr=4 +.L31: # nr=4,mr=1,kr=4 LD a1, 1*SIZE(A) # load a1 MADD t11,t11,a0,b0 - + LD b4,4*SIZE(B) LD b5,5*SIZE(B) MADD t12,t12,a0,b1 - + LD b6,6*SIZE(B) LD b7,7*SIZE(B) MADD t13,t13,a0,b2 @@ -1113,11 +1113,11 @@ LD a2, 2*SIZE(A) # a2 MADD t11,t11,a1,b4 - + LD b0,8*SIZE(B) LD b1,9*SIZE(B) MADD t12,t12,a1,b5 - + LD b2,10*SIZE(B) LD b3,11*SIZE(B) MADD t13,t13,a1,b6 @@ -1126,12 +1126,12 @@ LD a3, 3*SIZE(A) # a3 MADD t11,t11,a2,b0 daddiu K,K,-1 - + LD b4,12*SIZE(B) LD b5,13*SIZE(B) MADD t12,t12,a2,b1 daddu A,A,4*SIZE # 1mr*4kr - + LD b6,14*SIZE(B) LD b7,15*SIZE(B) MADD t13,t13,a2,b2 @@ -1140,7 +1140,7 @@ LD a0, 0*SIZE(A) # a0 daddu B,B,16*SIZE # 4nr*4kr MADD t11,t11,a3,b4 - + LD b0,0*SIZE(B) MADD t12,t12,a3,b5 LD b1,1*SIZE(B) @@ -1154,14 +1154,14 @@ .L35: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L38 nop -.L36: +.L36: LD a1,1*SIZE(A) # load a1 MADD t11,t11,a0,b0 @@ -1169,10 +1169,10 @@ LD b5,5*SIZE(B) MADD t12,t12,a0,b1 daddu A,A,2*SIZE # mr*2kr - + LD b6,6*SIZE(B) MADD t13,t13,a0,b2 - + LD b7,7*SIZE(B) MADD t14,t14,a0,b3 daddu B,B,8*SIZE # 4nr*2kr @@ -1181,41 +1181,41 @@ .L37: LD a0,0(A) MADD t11,t11,a1,b4 - + LD b0,0*SIZE(B) LD b1,1*SIZE(B) MADD t12,t12,a1,b5 - + LD b2,2*SIZE(B) LD b3,3*SIZE(B) MADD t13,t13,a1,b6 MADD t14,t14,a1,b7 - - + + .L38: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L39 + beqz K,.L39 LD ALPHA,152($sp) # Get ALPHA - + MADD t11,t11,a0,b0 MADD t12,t12,a0,b1 - daddu A,A,1*SIZE + daddu A,A,1*SIZE daddu B,B,4*SIZE - + MADD t13,t13,a0,b2 MADD t14,t14,a0,b3 .L39: # Write Back #ifndef TRMMKERNEL - LD c11,0(CO1) + LD c11,0(CO1) LD c12,0(CO2) LD c13,0(CO3) LD c14,0(CO4) - + MADD t11,c11,t11,ALPHA MADD t12,c12,t12,ALPHA MADD t13,c13,t13,ALPHA @@ -1261,22 +1261,22 @@ .L0_N4_Loop: # mc finished daddiu N,N,-1 # N-- #if defined(TRMMKERNEL) && !defined(LEFT) - daddiu KK, KK,4 + daddiu KK, KK,4 #endif - bnez N,.L0_N4_Lb + bnez N,.L0_N4_Lb move BO,B # Set BO point to next panel Bj - .align 5 + .align 5 .L0_N2: andi N,NCO,2 # nr = 2 - beqz N,.L0_N1 + beqz N,.L0_N1 nop .L0_N2_Lb: - move CO1,C + move CO1,C daddu CO2,C,LDC - dsra M,MCO,2 + dsra M,MCO,2 move A,AO # Reset A daddu PREA,AO,SPANA @@ -1288,13 +1288,13 @@ beqz M,.L12_M2 nop -.L40: +.L40: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO # Reset B #else dsll K,KK, 2 + BASE_SHIFT - dsll TEMP, KK,1 + BASE_SHIFT + dsll TEMP, KK,1 + BASE_SHIFT daddu A,A,K daddu B,BO,TEMP @@ -1311,10 +1311,10 @@ MOV t41,t11 LD a2,2*SIZE(A) LD a3,3*SIZE(A) - + MOV t12,t11 MOV t22,t11 - + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP,KCO,KK #elif defined(LEFT) @@ -1322,7 +1322,7 @@ #else daddiu TEMP, KK, 2 #endif - dsra K,TEMP,2 + dsra K,TEMP,2 MOV t32,t11 beqz K,.L45 MOV t42,t11 @@ -1342,10 +1342,10 @@ LD a2,2*SIZE(A) dsra K,KCO,2 # K=KCO/2 LD a3,3*SIZE(A) - + MOV t12,t11 MOV t22,t11 - + MOV t32,t11 beqz K,.L45 MOV t42,t11 @@ -1411,9 +1411,9 @@ FETCH $0,8*SIZE(PREA) MADD t32,t32,a2,b3 MADD t42,t42,a3,b3 - + daddu A,A,16*SIZE # 4mr*4kr - daddu B,B,8*SIZE # 2nr*4kr + daddu B,B,8*SIZE # 2nr*4kr .L44: MADD t11,t11,a4,b6 @@ -1443,14 +1443,14 @@ .L45: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L48 nop -.L46: +.L46: MADD t11,t11,a0,b0 LD a4,4*SIZE(A) MADD t21,t21,a1,b0 @@ -1469,7 +1469,7 @@ FETCH $0,0(PREA) MADD t32,t32,a2,b1 daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32 - + MADD t42,t42,a3,b1 daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE @@ -1495,16 +1495,16 @@ daddu PREA,PREA,8*SIZE - + .L48: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L49 + beqz K,.L49 LD ALPHA,152($sp) # Get ALPHA - + FETCH $0,0(PREA) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 @@ -1524,7 +1524,7 @@ .L49: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # gemm write back part Fetch 16 C - LD c21,1*SIZE(CO1) + LD c21,1*SIZE(CO1) LD c31,2*SIZE(CO1) LD c41,3*SIZE(CO1) @@ -1545,7 +1545,7 @@ MADD t32,c32,t32,ALPHA ST t41,3*SIZE(CO1) MADD t42,c42,t42,ALPHA - daddiu M,M,-1 + daddiu M,M,-1 ST t12,0(CO2) ST t22,1*SIZE(CO2) @@ -1557,8 +1557,8 @@ FETCH $0,8*SIZE(CO1) FETCH $0,8*SIZE(CO2) - daddu CO1,CO1,4*SIZE - bnez M,.L40 + daddu CO1,CO1,4*SIZE + bnez M,.L40 daddu CO2,CO2,4*SIZE #else @@ -1566,7 +1566,7 @@ MUL t21, ALPHA, t21 MUL t31, ALPHA, t31 MUL t41, ALPHA, t41 - + MUL t12, ALPHA, t12 ST t11, 0 * SIZE(CO1) MUL t22, ALPHA, t22 @@ -1575,13 +1575,13 @@ ST t31, 2 * SIZE(CO1) MUL t42, ALPHA, t42 ST t41, 3 * SIZE(CO1) - + ST t12, 0 * SIZE(CO2) daddiu M,M,-1 ST t22, 1 * SIZE(CO2) ST t32, 2 * SIZE(CO2) ST t42, 3 * SIZE(CO2) - + daddiu CO1,CO1, 4*SIZE daddiu CO2,CO2, 4*SIZE @@ -1615,7 +1615,7 @@ .align 3 .L12_M2: andi M,MCO,2 # mr = 2 - beqz M,.L12_M1 + beqz M,.L12_M1 nop .L50: @@ -1636,7 +1636,7 @@ LD b0,0*SIZE(B) MOV t21,t11 LD b1,1*SIZE(B) - + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) @@ -1644,7 +1644,7 @@ #else daddiu TEMP, KK, 2 #endif - dsra K,TEMP,2 + dsra K,TEMP,2 MOV t12,t11 beqz K,.L55 MOV t22,t11 @@ -1659,7 +1659,7 @@ LD b0,0*SIZE(B) MOV t21,t11 LD b1,1*SIZE(B) - + MOV t12,t11 beqz K,.L55 MOV t22,t11 @@ -1715,14 +1715,14 @@ .L55: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L58 nop -.L56: +.L56: MADD t11,t11,a0,b0 LD a4,2*SIZE(A) MADD t21,t21,a1,b0 @@ -1752,9 +1752,9 @@ #else andi K,TEMP, 1 #endif - beqz K,.L59 + beqz K,.L59 LD ALPHA,152($sp) # Get ALPHA - + MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 @@ -1767,10 +1767,10 @@ .L59: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # write gemm part back Fetch 16 C - LD c21,1*SIZE(CO1) + LD c21,1*SIZE(CO1) LD c12,0(CO2) LD c22,1*SIZE(CO2) - + MADD t11,c11,t11,ALPHA MADD t21,c21,t21,ALPHA MADD t12,c12,t12,ALPHA @@ -1781,7 +1781,7 @@ ST t12,0(CO2) ST t22,1*SIZE(CO2) - daddu CO1,CO1,2*SIZE + daddu CO1,CO1,2*SIZE daddu CO2,CO2,2*SIZE FETCH $0,0(CO1) @@ -1827,7 +1827,7 @@ .align 3 .L12_M1: andi M,MCO,1 # mr = 1 - beqz M,.L0_N2_Loop + beqz M,.L0_N2_Loop nop .L60: @@ -1842,7 +1842,7 @@ daddu B, BO, TEMP #endif LD a0,0*SIZE(A) - + MTC $0,t11 MOV t21,t11 LD b0,0*SIZE(B) @@ -1857,16 +1857,16 @@ #else daddiu TEMP, KK, 2 #endif - dsra K,TEMP,2 + dsra K,TEMP,2 MOV t22,t11 beqz K,.L65 nop #else - dsra K,KCO,2 + dsra K,KCO,2 move B,BO # Reset B LD a0,0*SIZE(A) - + MTC $0,t11 MOV t21,t11 LD b0,0*SIZE(B) @@ -1878,18 +1878,18 @@ #endif -.L61: # nr=2,mr=1,kr=4 +.L61: # nr=2,mr=1,kr=4 LD a4, 1*SIZE(A) # a2 LD b4, 2*SIZE(B) MADD t11,t11,a0,b0 - + LD b5,3*SIZE(B) MADD t12,t12,a0,b1 LD a2, 2*SIZE(A) # a3 LD b2,4*SIZE(B) MADD t11,t11,a4,b4 - + LD b3,5*SIZE(B) MADD t12,t12,a4,b5 @@ -1897,17 +1897,17 @@ daddiu K,K,-1 LD b6,6*SIZE(B) MADD t11,t11,a2,b2 - + LD b7,7*SIZE(B) MADD t12,t12,a2,b3 daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 LD a0, 0*SIZE(A) daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE - - LD b0,0*SIZE(B) + + LD b0,0*SIZE(B) MADD t11,t11,a6,b6 - + LD b1,1*SIZE(B) bnez K,.L61 MADD t12,t12,a6,b7 @@ -1916,19 +1916,19 @@ .L65: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L68 nop -.L66: +.L66: LD a4, 1*SIZE(A) # a1 MADD t11,t11,a0,b0 LD b4,2*SIZE(B) daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16 - + LD b5,3*SIZE(B) MADD t12,t12,a0,b1 daddu B,B,4*SIZE @@ -1937,7 +1937,7 @@ LD a0,0(A) # a0 LD b0,0*SIZE(B) MADD t11,t11,a4,b4 - + LD b1,1*SIZE(B) MADD t12,t12,a4,b5 @@ -1948,9 +1948,9 @@ #else andi K,TEMP,1 #endif - beqz K,.L69 + beqz K,.L69 LD ALPHA,152($sp) # Get ALPHA - + MADD t11,t11,a0,b0 MADD t12,t12,a0,b1 daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=16 @@ -1961,14 +1961,14 @@ #ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C LD c12,0(CO2) - + MADD t11,c11,t11,ALPHA MADD t12,c12,t12,ALPHA ST t11,0(CO1) ST t12,0(CO2) - daddu CO1,CO1,1*SIZE + daddu CO1,CO1,1*SIZE daddu CO2,CO2,1*SIZE #else @@ -1978,7 +1978,7 @@ ST t11, 0 * SIZE(CO1) ST t12, 0 * SIZE(CO2) - daddu CO1,CO1,1*SIZE + daddu CO1,CO1,1*SIZE daddu CO2,CO2,1*SIZE #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -2008,15 +2008,15 @@ move BO, B - .align 5 + .align 5 .L0_N1: andi N,NCO,1 # nr = 1 - beqz N,.L999 + beqz N,.L999 nop - move CO1,C - dsra M,MCO,2 - + move CO1,C + dsra M,MCO,2 + move A,AO # Reset A daddu PREA,AO,SPANA #if defined(TRMMKERNEL) && defined(LEFT) @@ -2026,7 +2026,7 @@ beqz M,.L11_M2 daddu C,CO1,LDC -.L70: +.L70: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B, BO # Reset B @@ -2038,12 +2038,12 @@ daddu B, BO, TEMP #endif LD b0, 0*SIZE(B) - + MTC $0,t11 LD a0,0*SIZE(A) MOV t21,t11 LD a1,1*SIZE(A) - + MOV t31,t11 LD a2,2*SIZE(A) MOV t41,t11 @@ -2057,19 +2057,19 @@ #else daddiu TEMP, KK, 1 #endif - dsra K,TEMP,2 + dsra K,TEMP,2 beqz K,.L75 nop #else move B, BO # Reset B - dsra K,KCO,2 + dsra K,KCO,2 LD b0, 0*SIZE(B) - + MTC $0,t11 LD a0,0*SIZE(A) MOV t21,t11 LD a1,1*SIZE(A) - + MOV t31,t11 LD a2,2*SIZE(A) MOV t41,t11 @@ -2081,7 +2081,7 @@ .L71: # nr=1,mr=kr=4 LD b4, 1*SIZE(B) # b1 MADD t11,t11,a0,b0 - + LD a4, 4*SIZE(A) MADD t21,t21,a1,b0 @@ -2097,7 +2097,7 @@ .L72: LD b2, 2*SIZE(B) # b2 MADD t11,t11,a4,b4 - + LD a0,8*SIZE(A) MADD t21,t21,a5,b4 @@ -2106,17 +2106,17 @@ LD a2,10*SIZE(A) MADD t31,t31,a6,b4 - + LD a3,11*SIZE(A) MADD t41,t41,a7,b4 .L73: LD b6, 3*SIZE(B) MADD t11,t11,a0,b2 - + LD a4,12*SIZE(A) daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 - + LD a5,13*SIZE(A) MADD t21,t21,a1,b2 @@ -2131,7 +2131,7 @@ .L74: LD b0, 0*SIZE(B) MADD t11,t11,a4,b6 - + LD a0,0*SIZE(A) daddu PREA,PREA,16*SIZE @@ -2150,20 +2150,20 @@ .L75: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L78 nop -.L76: +.L76: LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 - + LD a4,4*SIZE(A) daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=32 - + LD a5,5*SIZE(A) MADD t21,t21,a1,b0 FETCH $0,0(PREA) @@ -2193,16 +2193,16 @@ daddu PREA,PREA,8*SIZE - + .L78: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L79 + beqz K,.L79 LD ALPHA,152($sp) # Get ALPHA - + FETCH $0,0(PREA) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 @@ -2217,7 +2217,7 @@ .L79: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C - LD c21,1*SIZE(CO1) + LD c21,1*SIZE(CO1) LD c31,2*SIZE(CO1) LD c41,3*SIZE(CO1) @@ -2252,7 +2252,7 @@ FETCH $0,4*SIZE(CO1) FETCH $0,8*SIZE(CO1) - daddu CO1,CO1,4*SIZE + daddu CO1,CO1,4*SIZE #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT @@ -2271,7 +2271,7 @@ #ifdef LEFT daddiu KK, KK, 4 #endif - bnez M,.L70 + bnez M,.L70 nop #endif @@ -2279,10 +2279,10 @@ .align 3 .L11_M2: andi M,MCO,2 # mr = 2 - beqz M,.L11_M1 + beqz M,.L11_M1 nop -.L80: +.L80: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B, BO @@ -2312,13 +2312,13 @@ nop #else move B, BO - dsra K,KCO,2 + dsra K,KCO,2 LD b0, 0*SIZE(B) MTC $0,t11 MOV t21,t11 LD a0,0*SIZE(A) - + beqz K,.L85 LD a1,1*SIZE(A) @@ -2336,7 +2336,7 @@ MADD t11,t11,a4,b4 LD a3,5*SIZE(A) MADD t21,t21,a5,b4 - + LD b6, 3*SIZE(B) LD a6,6*SIZE(A) MADD t11,t11,a2,b2 @@ -2358,23 +2358,23 @@ .L85: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L88 nop -.L86: +.L86: LD b4, 1*SIZE(B) LD a4,2*SIZE(A) MADD t11,t11,a0,b0 LD a5,3*SIZE(A) MADD t21,t21,a1,b0 - + daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 - + LD b0,0(B) LD a0,0*SIZE(A) MADD t11,t11,a4,b4 @@ -2382,16 +2382,16 @@ MADD t21,t21,a5,b4 - + .L88: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L89 + beqz K,.L89 LD ALPHA,152($sp) # Get ALPHA - + MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 @@ -2401,7 +2401,7 @@ .L89: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C - LD c21,1*SIZE(CO1) + LD c21,1*SIZE(CO1) MADD t11,c11,t11,ALPHA MADD t21,c21,t21,ALPHA @@ -2410,7 +2410,7 @@ ST t21,1*SIZE(CO1) FETCH $0,2*SIZE(CO1) - + daddu CO1,CO1,2*SIZE # COx += 2*8Byte #else @@ -2445,10 +2445,10 @@ .align 3 .L11_M1: andi M,MCO,1 # mr = 1 - beqz M,.L999 + beqz M,.L999 nop -.L90: +.L90: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B, BO @@ -2478,7 +2478,7 @@ move B, BO LD a0, 0*SIZE(A) LD b0, 0*SIZE(B) - dsra K,KCO,2 + dsra K,KCO,2 beqz K,.L95 MTC $0,t11 #endif @@ -2487,7 +2487,7 @@ LD a4, 1*SIZE(A) LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 - + LD a2, 2*SIZE(A) LD b2, 2*SIZE(B) MADD t11,t11,a4,b4 @@ -2495,28 +2495,28 @@ LD a6, 3*SIZE(A) LD b6, 3*SIZE(B) MADD t11,t11,a2,b2 - + daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 LD a0, 0*SIZE(A) LD b0, 0*SIZE(B) MADD t11,t11,a6,b6 - + daddiu K,K,-1 bnez K,.L91 nop .L95: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L98 nop -.L96: +.L96: LD a4, 1*SIZE(A) LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 @@ -2526,14 +2526,14 @@ LD b0,0(B) LD a0,0(A) MADD t11,t11,a4,b4 - + .L98: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L99 + beqz K,.L99 LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 diff --git a/kernel/mips64/sgemm_kernel_loongson3b_4x4.S b/kernel/mips64/sgemm_kernel_loongson3b_4x4.S index 4a8c9b0e4..10c5f47de 100644 --- a/kernel/mips64/sgemm_kernel_loongson3b_4x4.S +++ b/kernel/mips64/sgemm_kernel_loongson3b_4x4.S @@ -110,7 +110,7 @@ #define F27 27 #define F26 26 #define F25 25 -#define F24 24 +#define F24 24 #define F23 23 #define F22 22 #define F21 21 @@ -118,7 +118,7 @@ #define F19 19 #define F18 18 #define F17 17 -#define F16 16 +#define F16 16 #define F15 15 #define F14 14 #define F13 13 @@ -130,14 +130,14 @@ #define F7 7 #define F6 6 #define F5 5 -#define F4 4 -#define F3 3 -#define F2 2 -#define F1 1 +#define F4 4 +#define F3 3 +#define F2 2 +#define F1 1 #define F0 0 PROLOGUE - + daddiu $sp, $sp, -160 sd $16, 0($sp) sd $17, 8($sp) @@ -160,7 +160,7 @@ ST $f23,144($sp) - .align 5 + .align 5 .L0_N4: # Loop N ST ALPHA,152($sp) # Backup ALPHA move MCO,M # Backup M @@ -170,26 +170,26 @@ move AO,A # Backup A_addr dsra N,NCO,2 # N=NCO/2 - + dsll LDC,LDC,BASE_SHIFT # LDC*8Byte dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*4nr*8Byte=KC*2^5 - + #if defined(TRMMKERNEL) - LDARG OFFSET,160($sp) # OFFSET is relate to the data part + LDARG OFFSET,160($sp) # OFFSET is relate to the data part #endif #if defined(TRMMKERNEL) && !defined(LEFT) - neg KK,OFFSET + neg KK,OFFSET #endif - + move BO,B # Backup B_addr beq N,$0,.L0_N2 # N=0,NCO<4 dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*2mr*8Byte .L0_N4_Lb: # mr=4,nr=4 - move CO1,C + move CO1,C dsra M,MCO,2 # M=MCO/2 - + move A,AO # Reset A daddu CO2,C,LDC @@ -200,7 +200,7 @@ daddu CO4,CO3,LDC #if defined(TRMMKERNEL) && defined(LEFT) - move KK,OFFSET + move KK,OFFSET #endif beqz M,.L14_M2 daddu C,CO4,LDC # move C to next panel Cj @@ -227,18 +227,18 @@ MOV t41,t11 MOV t12,t11 LD b0,0(B) - + MOV t22,t11 MOV t32,t11 LD b1,1*SIZE(B) MOV t42,t11 LD a2,2*SIZE(A) - + MOV t13,t11 MOV t23,t11 LD b2,2*SIZE(B) - + MOV t33,t11 MOV t43,t11 LD a3,3*SIZE(A) @@ -250,7 +250,7 @@ #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP,KCO,KK # temp is the length of the data part #elif defined(LEFT) - daddiu TEMP, KK, 4 # S=L,U=L + daddiu TEMP, KK, 4 # S=L,U=L #else daddiu TEMP, KK, 4 # S=R,U=U,for this two situation KK is the length of the data part #endif @@ -259,7 +259,7 @@ beqz K,.L15 MOV t44,t11 -#else +#else move B,BO # Reset B MTC $0,t11 # GEMM part NR=4,MR=4 LD a0,0(A) @@ -271,7 +271,7 @@ MOV t41,t11 MOV t12,t11 LD b0,0(B) - + MOV t22,t11 MOV t32,t11 LD b1,1*SIZE(B) @@ -279,11 +279,11 @@ MOV t42,t11 dsra K,KCO,2 # K=KCO/2 LD a2,2*SIZE(A) - + MOV t13,t11 MOV t23,t11 LD b2,2*SIZE(B) - + MOV t33,t11 MOV t43,t11 LD a3,3*SIZE(A) @@ -296,7 +296,7 @@ beqz K,.L15 MOV t44,t11 # clear 16 results registers #endif - + .align 5 .L11: # kr=4 MADD t11,t11,a0,b0 @@ -306,29 +306,29 @@ MADD t12,t12,a0,b1 MADD t22,t22,a1,b1 LD a5,5*SIZE(A) - + MADD t31,t31,a2,b0 MADD t41,t41,a3,b0 LD b4,4*SIZE(B) - + MADD t32,t32,a2,b1 MADD t42,t42,a3,b1 LD b5,5*SIZE(B) FETCH $0,(PREB) - + MADD t13,t13,a0,b2 MADD t23,t23,a1,b2 LD a6,6*SIZE(A) - + MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 LD b6,6*SIZE(B) FETCH $0,(PREA) - + MADD t33,t33,a2,b2 MADD t43,t43,a3,b2 LD a7,7*SIZE(A) - + MADD t34,t34,a2,b3 MADD t44,t44,a3,b3 LD b7,7*SIZE(B) @@ -447,14 +447,14 @@ .L15: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP, 2 #endif beqz K,.L18 nop -.L16: +.L16: MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 LD a4,4*SIZE(A) @@ -528,16 +528,16 @@ daddu PREB,PREB,8*SIZE LD b3,3*SIZE(B) - + .L18: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L19 + beqz K,.L19 LD ALPHA,152($sp) # Get ALPHA - + FETCH $0,0(PREB) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 @@ -569,8 +569,8 @@ MADD t44,t44,a3,b3 .L19: # Write Back to C -#ifndef TRMMKERNEL - LD c11,0(CO1) # GEMM write part +#ifndef TRMMKERNEL + LD c11,0(CO1) # GEMM write part LD c21,1*SIZE(CO1) # get 16 C LD c31,2*SIZE(CO1) LD c41,3*SIZE(CO1) @@ -640,11 +640,11 @@ daddu CO3,CO3,4*SIZE ST t44,3*SIZE(CO4) daddu PREB,BO,SPANB - - bnez M,.L10 + + bnez M,.L10 daddu CO4,CO4,4*SIZE -#else +#else MUL t11, ALPHA, t11 # TRMM write back part MUL t21, ALPHA, t21 MUL t31, ALPHA, t31 @@ -685,7 +685,7 @@ daddiu CO1,CO1, 4 * SIZE daddiu CO2,CO2, 4 * SIZE daddiu CO3,CO3, 4 * SIZE - daddiu CO4,CO4, 4 * SIZE + daddiu CO4,CO4, 4 * SIZE FETCH $0,4*SIZE(CO1) FETCH $0,4*SIZE(CO2) @@ -698,7 +698,7 @@ FETCH $0,0(CO4) #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - dsubu TEMP,KCO,KK + dsubu TEMP,KCO,KK #ifdef LEFT daddiu TEMP,TEMP, -4 #else @@ -710,10 +710,10 @@ daddu B,B,TEMP # mov B to the end of panel Bj #endif -#ifdef LEFT +#ifdef LEFT daddiu KK, KK,4 #endif - bnez M,.L10 + bnez M,.L10 nop #endif @@ -721,7 +721,7 @@ .align 3 .L14_M2: andi M, MCO, 2 # nr=4,mr=2 - beqz M,.L14_M1 + beqz M,.L14_M1 nop .L20: @@ -729,7 +729,7 @@ #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO # Reset B #else - dsll K,KK,1 + BASE_SHIFT # mr=2 + dsll K,KK,1 + BASE_SHIFT # mr=2 dsll TEMP,KK,2 + BASE_SHIFT # nr=4 daddu A,A,K daddu B,BO,TEMP @@ -738,7 +738,7 @@ LD a0,0*SIZE(A) MTC $0,t11 LD a1,1*SIZE(A) - + MOV t21,t11 LD b0,0*SIZE(B) MOV t12,t11 @@ -764,18 +764,18 @@ MOV t24,t11 # clear 2*4=8 results registers #else - move B,BO # Reset B + move B,BO # Reset B LD a0,0*SIZE(A) MTC $0,t11 LD a1,1*SIZE(A) - + MOV t21,t11 LD b0,0*SIZE(B) MOV t12,t11 LD b1,1*SIZE(B) MOV t22,t11 - dsra K,KCO,2 + dsra K,KCO,2 LD b2,2*SIZE(B) MOV t13,t11 @@ -806,7 +806,7 @@ MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 - + MADD t11,t11,a4,b4 LD a2,4*SIZE(A) MADD t21,t21,a5,b4 @@ -866,7 +866,7 @@ MADD t24,t24,a7,b7 -.L25: +.L25: #ifndef TRMMKERNEL andi K,KCO,2 # kr=2 #else @@ -875,7 +875,7 @@ beqz K,.L28 nop -.L26: +.L26: MADD t11,t11,a0,b0 LD a4,2*SIZE(A) MADD t21,t21,a1,b0 @@ -890,7 +890,7 @@ LD b6,6*SIZE(B) MADD t23,t23,a1,b2 LD b7,7*SIZE(B) - + MADD t14,t14,a0,b3 MADD t24,t24,a1,b3 daddu A,A,4*SIZE # 2mr*2kr @@ -915,16 +915,16 @@ MADD t14,t14,a4,b7 MADD t24,t24,a5,b7 - -.L28: # kr=1 + +.L28: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L29 + beqz K,.L29 LD ALPHA,152($sp) # Get ALPHA - + MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,2*SIZE # 2mr*kr @@ -942,11 +942,11 @@ .L29: # Write Back to C #ifndef TRMMKERNEL LD c11,0(CO1) # GEMM write back part - LD c21,1*SIZE(CO1) + LD c21,1*SIZE(CO1) LD c12,0(CO2) LD c22,1*SIZE(CO2) - + LD c13,0(CO3) MADD t11,c11,t11,ALPHA LD c23,1*SIZE(CO3) @@ -985,25 +985,25 @@ #else MUL t11, ALPHA, t11 # TRMM write back part MUL t21, ALPHA, t21 - + ST t11, 0 * SIZE(CO1) MUL t12, ALPHA, t12 ST t21, 1 * SIZE(CO1) MUL t22, ALPHA, t22 - + ST t12, 0 * SIZE(CO2) MUL t13, ALPHA, t13 ST t22, 1 * SIZE(CO2) MUL t23, ALPHA, t23 - + ST t13, 0 * SIZE(CO3) MUL t14, ALPHA, t14 ST t23, 1 * SIZE(CO3) MUL t24, ALPHA, t24 - + ST t14, 0 * SIZE(CO4) ST t24, 1 * SIZE(CO4) - + daddiu CO1,CO1, 2 * SIZE daddiu CO2,CO2, 2 * SIZE daddiu CO3,CO3, 2 * SIZE @@ -1036,7 +1036,7 @@ .align 3 .L14_M1: - andi M,MCO,1 # mr=1 + andi M,MCO,1 # mr=1 beqz M,.L0_N4_Loop # M = 0, finishing one panel Bj nop @@ -1056,13 +1056,13 @@ MTC $0,t11 LD b0,0*SIZE(B) - + MOV t12,t11 LD b1,1*SIZE(B) MOV t13,t11 LD b2,2*SIZE(B) - + MOV t14,t11 LD b3,3*SIZE(B) @@ -1077,35 +1077,35 @@ nop beqz K,.L35 nop - -#else + +#else move B,BO # Reset B, GEMM part dsra K,KCO,2 # K=KCO/2 LD a0, 0 * SIZE(A) # a0 MTC $0,t11 LD b0,0*SIZE(B) - + MOV t12,t11 LD b1,1*SIZE(B) MOV t13,t11 LD b2,2*SIZE(B) - + MOV t14,t11 beqz K,.L35 LD b3,3*SIZE(B) #endif -.L31: # nr=4,mr=1,kr=4 +.L31: # nr=4,mr=1,kr=4 LD a1, 1*SIZE(A) # load a1 MADD t11,t11,a0,b0 - + LD b4,4*SIZE(B) LD b5,5*SIZE(B) MADD t12,t12,a0,b1 - + LD b6,6*SIZE(B) LD b7,7*SIZE(B) MADD t13,t13,a0,b2 @@ -1113,11 +1113,11 @@ LD a2, 2*SIZE(A) # a2 MADD t11,t11,a1,b4 - + LD b0,8*SIZE(B) LD b1,9*SIZE(B) MADD t12,t12,a1,b5 - + LD b2,10*SIZE(B) LD b3,11*SIZE(B) MADD t13,t13,a1,b6 @@ -1126,12 +1126,12 @@ LD a3, 3*SIZE(A) # a3 MADD t11,t11,a2,b0 daddiu K,K,-1 - + LD b4,12*SIZE(B) LD b5,13*SIZE(B) MADD t12,t12,a2,b1 daddu A,A,4*SIZE # 1mr*4kr - + LD b6,14*SIZE(B) LD b7,15*SIZE(B) MADD t13,t13,a2,b2 @@ -1140,7 +1140,7 @@ LD a0, 0*SIZE(A) # a0 daddu B,B,16*SIZE # 4nr*4kr MADD t11,t11,a3,b4 - + LD b0,0*SIZE(B) MADD t12,t12,a3,b5 LD b1,1*SIZE(B) @@ -1154,14 +1154,14 @@ .L35: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L38 nop -.L36: +.L36: LD a1,1*SIZE(A) # load a1 MADD t11,t11,a0,b0 @@ -1169,10 +1169,10 @@ LD b5,5*SIZE(B) MADD t12,t12,a0,b1 daddu A,A,2*SIZE # mr*2kr - + LD b6,6*SIZE(B) MADD t13,t13,a0,b2 - + LD b7,7*SIZE(B) MADD t14,t14,a0,b3 daddu B,B,8*SIZE # 4nr*2kr @@ -1181,41 +1181,41 @@ .L37: LD a0,0(A) MADD t11,t11,a1,b4 - + LD b0,0*SIZE(B) LD b1,1*SIZE(B) MADD t12,t12,a1,b5 - + LD b2,2*SIZE(B) LD b3,3*SIZE(B) MADD t13,t13,a1,b6 MADD t14,t14,a1,b7 - - + + .L38: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L39 + beqz K,.L39 LD ALPHA,152($sp) # Get ALPHA - + MADD t11,t11,a0,b0 MADD t12,t12,a0,b1 - daddu A,A,1*SIZE + daddu A,A,1*SIZE daddu B,B,4*SIZE - + MADD t13,t13,a0,b2 MADD t14,t14,a0,b3 .L39: # Write Back #ifndef TRMMKERNEL - LD c11,0(CO1) + LD c11,0(CO1) LD c12,0(CO2) LD c13,0(CO3) LD c14,0(CO4) - + MADD t11,c11,t11,ALPHA MADD t12,c12,t12,ALPHA MADD t13,c13,t13,ALPHA @@ -1261,22 +1261,22 @@ .L0_N4_Loop: # mc finished daddiu N,N,-1 # N-- #if defined(TRMMKERNEL) && !defined(LEFT) - daddiu KK, KK,4 + daddiu KK, KK,4 #endif - bnez N,.L0_N4_Lb + bnez N,.L0_N4_Lb move BO,B # Set BO point to next panel Bj - .align 5 + .align 5 .L0_N2: andi N,NCO,2 # nr = 2 - beqz N,.L0_N1 + beqz N,.L0_N1 nop .L0_N2_Lb: - move CO1,C + move CO1,C daddu CO2,C,LDC - dsra M,MCO,2 + dsra M,MCO,2 move A,AO # Reset A daddu PREA,AO,SPANA @@ -1288,13 +1288,13 @@ beqz M,.L12_M2 nop -.L40: +.L40: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B,BO # Reset B #else dsll K,KK, 2 + BASE_SHIFT - dsll TEMP, KK,1 + BASE_SHIFT + dsll TEMP, KK,1 + BASE_SHIFT daddu A,A,K daddu B,BO,TEMP @@ -1311,10 +1311,10 @@ MOV t41,t11 LD a2,2*SIZE(A) LD a3,3*SIZE(A) - + MOV t12,t11 MOV t22,t11 - + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP,KCO,KK #elif defined(LEFT) @@ -1322,7 +1322,7 @@ #else daddiu TEMP, KK, 2 #endif - dsra K,TEMP,2 + dsra K,TEMP,2 MOV t32,t11 beqz K,.L45 MOV t42,t11 @@ -1342,10 +1342,10 @@ LD a2,2*SIZE(A) dsra K,KCO,2 # K=KCO/2 LD a3,3*SIZE(A) - + MOV t12,t11 MOV t22,t11 - + MOV t32,t11 beqz K,.L45 MOV t42,t11 @@ -1411,9 +1411,9 @@ FETCH $0,8*SIZE(PREA) MADD t32,t32,a2,b3 MADD t42,t42,a3,b3 - + daddu A,A,16*SIZE # 4mr*4kr - daddu B,B,8*SIZE # 2nr*4kr + daddu B,B,8*SIZE # 2nr*4kr .L44: MADD t11,t11,a4,b6 @@ -1443,14 +1443,14 @@ .L45: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L48 nop -.L46: +.L46: MADD t11,t11,a0,b0 LD a4,4*SIZE(A) MADD t21,t21,a1,b0 @@ -1469,7 +1469,7 @@ FETCH $0,0(PREA) MADD t32,t32,a2,b1 daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32 - + MADD t42,t42,a3,b1 daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE @@ -1495,16 +1495,16 @@ daddu PREA,PREA,8*SIZE - + .L48: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L49 + beqz K,.L49 LD ALPHA,152($sp) # Get ALPHA - + FETCH $0,0(PREA) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 @@ -1524,7 +1524,7 @@ .L49: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # gemm write back part Fetch 16 C - LD c21,1*SIZE(CO1) + LD c21,1*SIZE(CO1) LD c31,2*SIZE(CO1) LD c41,3*SIZE(CO1) @@ -1545,7 +1545,7 @@ MADD t32,c32,t32,ALPHA ST t41,3*SIZE(CO1) MADD t42,c42,t42,ALPHA - daddiu M,M,-1 + daddiu M,M,-1 ST t12,0(CO2) ST t22,1*SIZE(CO2) @@ -1557,8 +1557,8 @@ FETCH $0,8*SIZE(CO1) FETCH $0,8*SIZE(CO2) - daddu CO1,CO1,4*SIZE - bnez M,.L40 + daddu CO1,CO1,4*SIZE + bnez M,.L40 daddu CO2,CO2,4*SIZE #else @@ -1566,7 +1566,7 @@ MUL t21, ALPHA, t21 MUL t31, ALPHA, t31 MUL t41, ALPHA, t41 - + MUL t12, ALPHA, t12 ST t11, 0 * SIZE(CO1) MUL t22, ALPHA, t22 @@ -1575,13 +1575,13 @@ ST t31, 2 * SIZE(CO1) MUL t42, ALPHA, t42 ST t41, 3 * SIZE(CO1) - + ST t12, 0 * SIZE(CO2) daddiu M,M,-1 ST t22, 1 * SIZE(CO2) ST t32, 2 * SIZE(CO2) ST t42, 3 * SIZE(CO2) - + daddiu CO1,CO1, 4*SIZE daddiu CO2,CO2, 4*SIZE @@ -1615,7 +1615,7 @@ .align 3 .L12_M2: andi M,MCO,2 # mr = 2 - beqz M,.L12_M1 + beqz M,.L12_M1 nop .L50: @@ -1636,7 +1636,7 @@ LD b0,0*SIZE(B) MOV t21,t11 LD b1,1*SIZE(B) - + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) dsubu TEMP, KCO, KK #elif defined(LEFT) @@ -1644,7 +1644,7 @@ #else daddiu TEMP, KK, 2 #endif - dsra K,TEMP,2 + dsra K,TEMP,2 MOV t12,t11 beqz K,.L55 MOV t22,t11 @@ -1659,7 +1659,7 @@ LD b0,0*SIZE(B) MOV t21,t11 LD b1,1*SIZE(B) - + MOV t12,t11 beqz K,.L55 MOV t22,t11 @@ -1715,14 +1715,14 @@ .L55: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L58 nop -.L56: +.L56: MADD t11,t11,a0,b0 LD a4,2*SIZE(A) MADD t21,t21,a1,b0 @@ -1752,9 +1752,9 @@ #else andi K,TEMP, 1 #endif - beqz K,.L59 + beqz K,.L59 LD ALPHA,152($sp) # Get ALPHA - + MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 @@ -1767,10 +1767,10 @@ .L59: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # write gemm part back Fetch 16 C - LD c21,1*SIZE(CO1) + LD c21,1*SIZE(CO1) LD c12,0(CO2) LD c22,1*SIZE(CO2) - + MADD t11,c11,t11,ALPHA MADD t21,c21,t21,ALPHA MADD t12,c12,t12,ALPHA @@ -1781,7 +1781,7 @@ ST t12,0(CO2) ST t22,1*SIZE(CO2) - daddu CO1,CO1,2*SIZE + daddu CO1,CO1,2*SIZE daddu CO2,CO2,2*SIZE FETCH $0,0(CO1) @@ -1827,7 +1827,7 @@ .align 3 .L12_M1: andi M,MCO,1 # mr = 1 - beqz M,.L0_N2_Loop + beqz M,.L0_N2_Loop nop .L60: @@ -1842,7 +1842,7 @@ daddu B, BO, TEMP #endif LD a0,0*SIZE(A) - + MTC $0,t11 MOV t21,t11 LD b0,0*SIZE(B) @@ -1857,16 +1857,16 @@ #else daddiu TEMP, KK, 2 #endif - dsra K,TEMP,2 + dsra K,TEMP,2 MOV t22,t11 beqz K,.L65 nop #else - dsra K,KCO,2 + dsra K,KCO,2 move B,BO # Reset B LD a0,0*SIZE(A) - + MTC $0,t11 MOV t21,t11 LD b0,0*SIZE(B) @@ -1878,18 +1878,18 @@ #endif -.L61: # nr=2,mr=1,kr=4 +.L61: # nr=2,mr=1,kr=4 LD a4, 1*SIZE(A) # a2 LD b4, 2*SIZE(B) MADD t11,t11,a0,b0 - + LD b5,3*SIZE(B) MADD t12,t12,a0,b1 LD a2, 2*SIZE(A) # a3 LD b2,4*SIZE(B) MADD t11,t11,a4,b4 - + LD b3,5*SIZE(B) MADD t12,t12,a4,b5 @@ -1897,17 +1897,17 @@ daddiu K,K,-1 LD b6,6*SIZE(B) MADD t11,t11,a2,b2 - + LD b7,7*SIZE(B) MADD t12,t12,a2,b3 daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 LD a0, 0*SIZE(A) daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE - - LD b0,0*SIZE(B) + + LD b0,0*SIZE(B) MADD t11,t11,a6,b6 - + LD b1,1*SIZE(B) bnez K,.L61 MADD t12,t12,a6,b7 @@ -1916,19 +1916,19 @@ .L65: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L68 nop -.L66: +.L66: LD a4, 1*SIZE(A) # a1 MADD t11,t11,a0,b0 LD b4,2*SIZE(B) daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16 - + LD b5,3*SIZE(B) MADD t12,t12,a0,b1 daddu B,B,4*SIZE @@ -1937,7 +1937,7 @@ LD a0,0(A) # a0 LD b0,0*SIZE(B) MADD t11,t11,a4,b4 - + LD b1,1*SIZE(B) MADD t12,t12,a4,b5 @@ -1948,9 +1948,9 @@ #else andi K,TEMP,1 #endif - beqz K,.L69 + beqz K,.L69 LD ALPHA,152($sp) # Get ALPHA - + MADD t11,t11,a0,b0 MADD t12,t12,a0,b1 daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=16 @@ -1961,14 +1961,14 @@ #ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C LD c12,0(CO2) - + MADD t11,c11,t11,ALPHA MADD t12,c12,t12,ALPHA ST t11,0(CO1) ST t12,0(CO2) - daddu CO1,CO1,1*SIZE + daddu CO1,CO1,1*SIZE daddu CO2,CO2,1*SIZE #else @@ -1978,7 +1978,7 @@ ST t11, 0 * SIZE(CO1) ST t12, 0 * SIZE(CO2) - daddu CO1,CO1,1*SIZE + daddu CO1,CO1,1*SIZE daddu CO2,CO2,1*SIZE #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -2008,15 +2008,15 @@ move BO, B - .align 5 + .align 5 .L0_N1: andi N,NCO,1 # nr = 1 - beqz N,.L999 + beqz N,.L999 nop - move CO1,C - dsra M,MCO,2 - + move CO1,C + dsra M,MCO,2 + move A,AO # Reset A daddu PREA,AO,SPANA #if defined(TRMMKERNEL) && defined(LEFT) @@ -2026,7 +2026,7 @@ beqz M,.L11_M2 daddu C,CO1,LDC -.L70: +.L70: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B, BO # Reset B @@ -2038,12 +2038,12 @@ daddu B, BO, TEMP #endif LD b0, 0*SIZE(B) - + MTC $0,t11 LD a0,0*SIZE(A) MOV t21,t11 LD a1,1*SIZE(A) - + MOV t31,t11 LD a2,2*SIZE(A) MOV t41,t11 @@ -2057,19 +2057,19 @@ #else daddiu TEMP, KK, 1 #endif - dsra K,TEMP,2 + dsra K,TEMP,2 beqz K,.L75 nop #else move B, BO # Reset B - dsra K,KCO,2 + dsra K,KCO,2 LD b0, 0*SIZE(B) - + MTC $0,t11 LD a0,0*SIZE(A) MOV t21,t11 LD a1,1*SIZE(A) - + MOV t31,t11 LD a2,2*SIZE(A) MOV t41,t11 @@ -2081,7 +2081,7 @@ .L71: # nr=1,mr=kr=4 LD b4, 1*SIZE(B) # b1 MADD t11,t11,a0,b0 - + LD a4, 4*SIZE(A) MADD t21,t21,a1,b0 @@ -2097,7 +2097,7 @@ .L72: LD b2, 2*SIZE(B) # b2 MADD t11,t11,a4,b4 - + LD a0,8*SIZE(A) MADD t21,t21,a5,b4 @@ -2106,17 +2106,17 @@ LD a2,10*SIZE(A) MADD t31,t31,a6,b4 - + LD a3,11*SIZE(A) MADD t41,t41,a7,b4 .L73: LD b6, 3*SIZE(B) MADD t11,t11,a0,b2 - + LD a4,12*SIZE(A) daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 - + LD a5,13*SIZE(A) MADD t21,t21,a1,b2 @@ -2131,7 +2131,7 @@ .L74: LD b0, 0*SIZE(B) MADD t11,t11,a4,b6 - + LD a0,0*SIZE(A) daddu PREA,PREA,16*SIZE @@ -2150,20 +2150,20 @@ .L75: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L78 nop -.L76: +.L76: LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 - + LD a4,4*SIZE(A) daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=32 - + LD a5,5*SIZE(A) MADD t21,t21,a1,b0 FETCH $0,0(PREA) @@ -2193,16 +2193,16 @@ daddu PREA,PREA,8*SIZE - + .L78: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L79 + beqz K,.L79 LD ALPHA,152($sp) # Get ALPHA - + FETCH $0,0(PREA) MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 @@ -2217,7 +2217,7 @@ .L79: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C - LD c21,1*SIZE(CO1) + LD c21,1*SIZE(CO1) LD c31,2*SIZE(CO1) LD c41,3*SIZE(CO1) @@ -2252,7 +2252,7 @@ FETCH $0,4*SIZE(CO1) FETCH $0,8*SIZE(CO1) - daddu CO1,CO1,4*SIZE + daddu CO1,CO1,4*SIZE #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) dsubu TEMP, KCO, KK #ifdef LEFT @@ -2271,7 +2271,7 @@ #ifdef LEFT daddiu KK, KK, 4 #endif - bnez M,.L70 + bnez M,.L70 nop #endif @@ -2279,10 +2279,10 @@ .align 3 .L11_M2: andi M,MCO,2 # mr = 2 - beqz M,.L11_M1 + beqz M,.L11_M1 nop -.L80: +.L80: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B, BO @@ -2312,13 +2312,13 @@ nop #else move B, BO - dsra K,KCO,2 + dsra K,KCO,2 LD b0, 0*SIZE(B) MTC $0,t11 MOV t21,t11 LD a0,0*SIZE(A) - + beqz K,.L85 LD a1,1*SIZE(A) @@ -2336,7 +2336,7 @@ MADD t11,t11,a4,b4 LD a3,5*SIZE(A) MADD t21,t21,a5,b4 - + LD b6, 3*SIZE(B) LD a6,6*SIZE(A) MADD t11,t11,a2,b2 @@ -2358,23 +2358,23 @@ .L85: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L88 nop -.L86: +.L86: LD b4, 1*SIZE(B) LD a4,2*SIZE(A) MADD t11,t11,a0,b0 LD a5,3*SIZE(A) MADD t21,t21,a1,b0 - + daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 - + LD b0,0(B) LD a0,0*SIZE(A) MADD t11,t11,a4,b4 @@ -2382,16 +2382,16 @@ MADD t21,t21,a5,b4 - + .L88: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L89 + beqz K,.L89 LD ALPHA,152($sp) # Get ALPHA - + MADD t11,t11,a0,b0 MADD t21,t21,a1,b0 daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16 @@ -2401,7 +2401,7 @@ .L89: # Write Back #ifndef TRMMKERNEL LD c11,0(CO1) # Fetch 16 C - LD c21,1*SIZE(CO1) + LD c21,1*SIZE(CO1) MADD t11,c11,t11,ALPHA MADD t21,c21,t21,ALPHA @@ -2410,7 +2410,7 @@ ST t21,1*SIZE(CO1) FETCH $0,2*SIZE(CO1) - + daddu CO1,CO1,2*SIZE # COx += 2*8Byte #else @@ -2445,10 +2445,10 @@ .align 3 .L11_M1: andi M,MCO,1 # mr = 1 - beqz M,.L999 + beqz M,.L999 nop -.L90: +.L90: #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B, BO @@ -2478,7 +2478,7 @@ move B, BO LD a0, 0*SIZE(A) LD b0, 0*SIZE(B) - dsra K,KCO,2 + dsra K,KCO,2 beqz K,.L95 MTC $0,t11 #endif @@ -2487,7 +2487,7 @@ LD a4, 1*SIZE(A) LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 - + LD a2, 2*SIZE(A) LD b2, 2*SIZE(B) MADD t11,t11,a4,b4 @@ -2495,28 +2495,28 @@ LD a6, 3*SIZE(A) LD b6, 3*SIZE(B) MADD t11,t11,a2,b2 - + daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 LD a0, 0*SIZE(A) LD b0, 0*SIZE(B) MADD t11,t11,a6,b6 - + daddiu K,K,-1 bnez K,.L91 nop .L95: # kr=2 #ifndef TRMMKERNEL - andi K,KCO,2 + andi K,KCO,2 #else andi K,TEMP,2 #endif beqz K,.L98 nop -.L96: +.L96: LD a4, 1*SIZE(A) LD b4, 1*SIZE(B) MADD t11,t11,a0,b0 @@ -2526,14 +2526,14 @@ LD b0,0(B) LD a0,0(A) MADD t11,t11,a4,b4 - + .L98: # kr=1 #ifndef TRMMKERNEL andi K,KCO,1 #else andi K,TEMP,1 #endif - beqz K,.L99 + beqz K,.L99 LD ALPHA,152($sp) # Get ALPHA MADD t11,t11,a0,b0 diff --git a/kernel/mips64/snrm2.S b/kernel/mips64/snrm2.S index 04a48bdae..1ba061a7d 100644 --- a/kernel/mips64/snrm2.S +++ b/kernel/mips64/snrm2.S @@ -42,7 +42,7 @@ #define N $4 #define X $5 #define INCX $6 - + #define I $2 #define TEMP $3 @@ -65,7 +65,7 @@ PROLOGUE - + #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) @@ -188,7 +188,7 @@ daddiu I, I, -1 cvt.d.s t1, a1 - + madd.d s1, s1, t1, t1 bgtz I, .L16 @@ -319,7 +319,7 @@ daddiu I, I, -1 cvt.d.s t1, a1 - + daddu X, X, INCX bgtz I, .L26 @@ -333,5 +333,5 @@ j $31 cvt.s.d s1, s1 - + EPILOGUE diff --git a/kernel/mips64/swap.S b/kernel/mips64/swap.S index d54abd7df..aa786edcf 100644 --- a/kernel/mips64/swap.S +++ b/kernel/mips64/swap.S @@ -70,7 +70,7 @@ #define b8 $f15 PROLOGUE - + li TEMP, SIZE NOP diff --git a/kernel/mips64/symv_L.S b/kernel/mips64/symv_L.S index 9a54eb789..f67d70ca7 100644 --- a/kernel/mips64/symv_L.S +++ b/kernel/mips64/symv_L.S @@ -91,7 +91,7 @@ PROLOGUE - + LDARG BUFFER, 0($sp) daddiu $sp, $sp, -32 diff --git a/kernel/mips64/symv_U.S b/kernel/mips64/symv_U.S index 285e591ef..5f2087653 100644 --- a/kernel/mips64/symv_U.S +++ b/kernel/mips64/symv_U.S @@ -89,7 +89,7 @@ PROLOGUE - + LDARG BUFFER, 0($sp) daddiu $sp, $sp, -32 diff --git a/kernel/mips64/trsm_kernel_LN.S b/kernel/mips64/trsm_kernel_LN.S index 28e1794b5..eb07aefdd 100644 --- a/kernel/mips64/trsm_kernel_LN.S +++ b/kernel/mips64/trsm_kernel_LN.S @@ -104,7 +104,7 @@ #define ALPHA $f15 PROLOGUE - + daddiu $sp, $sp, -144 SDARG $16, 0($sp) @@ -1695,7 +1695,7 @@ bgtz J, .L10 NOP .align 3 - + .L30: andi J, N, 4 blez J, .L50 diff --git a/kernel/mips64/trsm_kernel_LN_loongson3a.S b/kernel/mips64/trsm_kernel_LN_loongson3a.S index aba86fbce..4df2e4305 100644 --- a/kernel/mips64/trsm_kernel_LN_loongson3a.S +++ b/kernel/mips64/trsm_kernel_LN_loongson3a.S @@ -70,7 +70,7 @@ #define ALPHA $f15 PROLOGUE - + daddiu $sp, $sp, -144 SDARG $16, 0($sp) @@ -97,13 +97,13 @@ sdc1 $f23,136($sp) #endif # LN compute from bottom to top - LDARG OFFSET, 144($sp) + LDARG OFFSET, 144($sp) dsll LDC, LDC, BASE_SHIFT # ldc mult M, K mflo TEMP # TEMP=MC*KC - dsll TEMP, TEMP, BASE_SHIFT + dsll TEMP, TEMP, BASE_SHIFT daddu A, A, TEMP # A move to the end of sa dsll TEMP, M, BASE_SHIFT @@ -129,19 +129,19 @@ MOV t32, t11 MOV t42, t11 - daddu KK, M, OFFSET # kc - kk is the length of the rectangular data part of panel Ai + daddu KK, M, OFFSET # kc - kk is the length of the rectangular data part of panel Ai move AORIG, A # reset A daddu C, CO4, LDC # fixed pointer C, the write back address - - andi I, M, 1 # mr=2,nr=4 + + andi I, M, 1 # mr=2,nr=4 blez I, .L50 nop dsll TEMP, K, BASE_SHIFT # mr=1 dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of Ai - dsll L, KK, BASE_SHIFT # mr=1 + dsll L, KK, BASE_SHIFT # mr=1 dsll TEMP, KK, 2 + BASE_SHIFT # nr=4 daddu AO, AORIG, L # AO point to the rectangular data part @@ -163,7 +163,7 @@ LD b1, 0 * SIZE(BO) # get 4b LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) - LD b4, 3 * SIZE(BO) + LD b4, 3 * SIZE(BO) dsra L, TEMP, 2 blez L, .L55 @@ -172,7 +172,7 @@ .align 3 .L52: - LD a5, 1 * SIZE(AO) + LD a5, 1 * SIZE(AO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) @@ -206,10 +206,10 @@ MADD t13, t13, a3, b3 MADD t14, t14, a3, b4 - daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr + daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) @@ -230,7 +230,7 @@ andi L, TEMP, 3 blez L, .L58 nop - + .align 3 .L56: MADD t11, t11, a1, b1 # 3rd compute @@ -238,10 +238,10 @@ MADD t13, t13, a1, b3 MADD t14, t14, a1, b4 - daddiu AO, AO, 1 * SIZE # AO += 1mr + daddiu AO, AO, 1 * SIZE # AO += 1mr daddiu BO, BO, 4 * SIZE # BO += 4nr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) @@ -303,16 +303,16 @@ MOV t42, t11 - -.L50: - andi I, M, 2 # mr=2,nr=4 + +.L50: + andi I, M, 2 # mr=2,nr=4 blez I, .L20 nop dsll TEMP, K, 1 + BASE_SHIFT dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of Ai - dsll L, KK, 1 + BASE_SHIFT + dsll L, KK, 1 + BASE_SHIFT dsll TEMP, KK, 2 + BASE_SHIFT daddu AO, AORIG, L # AO point to the rectangular data part @@ -335,7 +335,7 @@ LD b1, 0 * SIZE(BO) # get 4b LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) - LD b4, 3 * SIZE(BO) + LD b4, 3 * SIZE(BO) dsra L, TEMP, 2 blez L, .L25 @@ -344,7 +344,7 @@ .align 3 .L22: - LD a5, 2 * SIZE(AO) + LD a5, 2 * SIZE(AO) LD a6, 3 * SIZE(AO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) @@ -392,10 +392,10 @@ MADD t14, t14, a3, b4 MADD t24, t24, a4, b4 - daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr + daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) @@ -421,7 +421,7 @@ andi L, TEMP, 3 blez L, .L28 nop - + .align 3 .L26: MADD t11, t11, a1, b1 # 3rd compute @@ -433,10 +433,10 @@ MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 - daddiu AO, AO, 2 * SIZE # AO += 2mr + daddiu AO, AO, 2 * SIZE # AO += 2mr daddiu BO, BO, 4 * SIZE # BO += 4nr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) @@ -474,7 +474,7 @@ SUB t24, b8, t24 - LD b1, 3 * SIZE(AO) # computes the triangular_part + LD b1, 3 * SIZE(AO) # computes the triangular_part LD b2, 2 * SIZE(AO) MUL t21, b1, t21 MUL t22, b1, t22 @@ -484,7 +484,7 @@ NMSUB t12, t12, b2, t22 NMSUB t13, t13, b2, t23 NMSUB t14, t14, b2, t24 - + LD b3, 0 * SIZE(AO) MUL t11, b3, t11 MUL t12, b3, t12 @@ -535,13 +535,13 @@ .L11: # mr=4 dsll TEMP, K, 2 + BASE_SHIFT # TEMP=KC*MR*data_Byte dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of panel Ai - dsll L, KK, 2 + BASE_SHIFT # KC-KK is the length of the rectangular data part of Ai + dsll L, KK, 2 + BASE_SHIFT # KC-KK is the length of the rectangular data part of Ai dsll TEMP, KK, 2 + BASE_SHIFT # KK*NR*data_Byte daddu AO, AORIG, L # AO point to the rectangular data part daddu BO, B, TEMP - dsubu TEMP, K, KK + dsubu TEMP, K, KK LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai LD a2, 1 * SIZE(AO) # mr*KK with nr*KK @@ -551,7 +551,7 @@ LD b1, 0 * SIZE(BO) # get 4b LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) - LD b4, 3 * SIZE(BO) + LD b4, 3 * SIZE(BO) MOV t13, t11 # clear result registers MOV t23, t11 @@ -568,7 +568,7 @@ .align 3 .L12: - LD a5, 4 * SIZE(AO) + LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) @@ -596,7 +596,7 @@ MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 MADD t34, t34, a3, b4 - MADD t44, t44, a4, b4 + MADD t44, t44, a4, b4 LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) @@ -626,7 +626,7 @@ MADD t14, t14, a5, b8 MADD t24, t24, a6, b8 MADD t34, t34, a7, b8 - MADD t44, t44, a8, b8 + MADD t44, t44, a8, b8 LD a5, 12 * SIZE(AO) LD a6, 13 * SIZE(AO) @@ -656,12 +656,12 @@ MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 MADD t34, t34, a3, b4 - MADD t44, t44, a4, b4 + MADD t44, t44, a4, b4 - daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) @@ -689,12 +689,12 @@ MADD t14, t14, a5, b8 MADD t24, t24, a6, b8 MADD t34, t34, a7, b8 - MADD t44, t44, a8, b8 + MADD t44, t44, a8, b8 daddiu L, L, -1 bgtz L, .L12 nop - + .align 3 .L15: @@ -704,7 +704,7 @@ .align 3 .L16: - MADD t11, t11, a1, b1 + MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 @@ -722,12 +722,12 @@ MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 MADD t34, t34, a3, b4 - MADD t44, t44, a4, b4 + MADD t44, t44, a4, b4 - daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu AO, AO, 4 * SIZE # AO += 4mr daddiu BO, BO, 4 * SIZE # BO += 4nr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) @@ -743,13 +743,13 @@ .L18: # deal with the triangular data part of panel Ai - daddiu TEMP, KK, -4 # + daddiu TEMP, KK, -4 # dsll L, TEMP, 2 + BASE_SHIFT dsll TEMP, TEMP, 2 + BASE_SHIFT daddu AO, AORIG, L # AO point to the triangular data part daddu BO, B, TEMP - + LD b1, 0 * SIZE(BO) # triangular_part*X + rectangular_part = B LD b2, 1 * SIZE(BO) # triangular_part*X = B - rectangular_part LD b3, 2 * SIZE(BO) @@ -764,7 +764,7 @@ LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) - + SUB t21, b5, t21 SUB t22, b6, t22 SUB t23, b7, t23 @@ -774,12 +774,12 @@ LD b2, 9 * SIZE(BO) LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) - + SUB t31, b1, t31 SUB t32, b2, t32 SUB t33, b3, t33 SUB t34, b4, t34 - + LD b5, 12 * SIZE(BO) LD b6, 13 * SIZE(BO) LD b7, 14 * SIZE(BO) @@ -792,10 +792,10 @@ LD b1, 15 * SIZE(AO) - LD b2, 14 * SIZE(AO) + LD b2, 14 * SIZE(AO) LD b4, 13 * SIZE(AO) LD b7, 12 * SIZE(AO) - + MUL t41, b1, t41 MUL t42, b1, t42 MUL t43, b1, t43 @@ -815,7 +815,7 @@ - LD b3, 10 * SIZE(AO) + LD b3, 10 * SIZE(AO) LD b5, 9 * SIZE(AO) LD b8, 8 * SIZE(AO) MUL t31, b3, t31 @@ -852,7 +852,7 @@ MUL t13, b2, t13 MUL t14, b2, t14 - daddiu CO1, CO1, -4 * SIZE # modify + daddiu CO1, CO1, -4 * SIZE # modify daddiu CO2, CO2, -4 * SIZE daddiu CO3, CO3, -4 * SIZE daddiu CO4, CO4, -4 * SIZE @@ -875,7 +875,7 @@ ST t43, 14 * SIZE(BO) ST t44, 15 * SIZE(BO) - ST t11, 0 * SIZE(CO1) # write back + ST t11, 0 * SIZE(CO1) # write back ST t21, 1 * SIZE(CO1) ST t31, 2 * SIZE(CO1) ST t41, 3 * SIZE(CO1) @@ -916,8 +916,8 @@ bgtz J, .L10 nop - - + + .align 3 .L30: andi J, N, 2 # nr=2 @@ -934,8 +934,8 @@ daddu KK, M, OFFSET move AORIG, A # reset A - - daddu C, CO2, LDC # fixed + + daddu C, CO2, LDC # fixed andi I, M, 1 # mr=1 blez I, .L60 @@ -968,7 +968,7 @@ .align 3 .L62: - LD a5, 1 * SIZE(AO) + LD a5, 1 * SIZE(AO) LD b5, 2 * SIZE(BO) LD b6, 3 * SIZE(BO) @@ -989,10 +989,10 @@ MADD t11, t11, a3, b3 # 3rd compute MADD t12, t12, a3, b4 - daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr + daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) @@ -1002,14 +1002,14 @@ daddiu L, L, -1 bgtz L, .L62 nop - + .align 3 .L65: andi L, TEMP, 3 blez L, .L68 nop - + .align 3 .L66: MADD t11, t11, a1, b1 # 3rd compute @@ -1017,10 +1017,10 @@ MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 - daddiu AO, AO, 1 * SIZE # AO += mr + daddiu AO, AO, 1 * SIZE # AO += mr daddiu BO, BO, 2 * SIZE # BO += 2nr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) @@ -1035,14 +1035,14 @@ dsll TEMP, TEMP, 1 + BASE_SHIFT daddu AO, AORIG, L # Ao point to the triangular data part daddu BO, B, TEMP - + LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) SUB t11, b1, t11 SUB t12, b2, t12 - + LD b3, 0 * SIZE(AO) MUL t11, b3, t11 MUL t12, b3, t12 @@ -1101,7 +1101,7 @@ .align 3 .L42: - LD a5, 2 * SIZE(AO) + LD a5, 2 * SIZE(AO) LD a6, 3 * SIZE(AO) LD b5, 2 * SIZE(BO) LD b6, 3 * SIZE(BO) @@ -1131,10 +1131,10 @@ MADD t12, t12, a3, b4 MADD t22, t22, a4, b4 - daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr + daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) @@ -1147,14 +1147,14 @@ daddiu L, L, -1 bgtz L, .L42 nop - + .align 3 .L45: andi L, TEMP, 3 blez L, .L48 nop - + .align 3 .L46: MADD t11, t11, a1, b1 # 3rd compute @@ -1162,10 +1162,10 @@ MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 - daddiu AO, AO, 2 * SIZE # AO += 2mr + daddiu AO, AO, 2 * SIZE # AO += 2mr daddiu BO, BO, 2 * SIZE # BO += 2nr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) @@ -1181,7 +1181,7 @@ dsll TEMP, TEMP, 1 + BASE_SHIFT daddu AO, AORIG, L # Ao point to the triangular data part daddu BO, B, TEMP - + LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) @@ -1192,13 +1192,13 @@ SUB t21, b3, t21 SUB t22, b4, t22 - LD b1, 3 * SIZE(AO) # computes the triangular_part + LD b1, 3 * SIZE(AO) # computes the triangular_part LD b2, 2 * SIZE(AO) MUL t21, b1, t21 MUL t22, b1, t22 NMSUB t11, t11, b2, t21 NMSUB t12, t12, b2, t22 - + LD b3, 0 * SIZE(AO) MUL t11, b3, t11 MUL t12, b3, t12 @@ -1260,7 +1260,7 @@ .align 3 .L32: - LD a5, 4 * SIZE(AO) + LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) @@ -1308,10 +1308,10 @@ MADD t32, t32, a3, b4 MADD t42, t42, a4, b4 - daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) @@ -1331,30 +1331,30 @@ bgtz L, .L32 nop - + .align 3 .L35: andi L, TEMP, 3 blez L, .L38 nop - + .align 3 .L36: MADD t11, t11, a1, b1 # 3rd compute MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 - + MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 MADD t32, t32, a3, b2 MADD t42, t42, a4, b2 - daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu AO, AO, 4 * SIZE # AO += 4mr daddiu BO, BO, 2 * SIZE # BO += 2nr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) @@ -1373,7 +1373,7 @@ daddu AO, AORIG, L # AO point to the triangular data part daddu BO, B, TEMP - + LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) @@ -1394,10 +1394,10 @@ LD b1, 15 * SIZE(AO) - LD b2, 14 * SIZE(AO) + LD b2, 14 * SIZE(AO) LD b4, 13 * SIZE(AO) LD b7, 12 * SIZE(AO) - + MUL t41, b1, t41 MUL t42, b1, t42 NMSUB t31, t31, b2, t41 @@ -1409,7 +1409,7 @@ - LD b3, 10 * SIZE(AO) + LD b3, 10 * SIZE(AO) LD b5, 9 * SIZE(AO) LD b8, 8 * SIZE(AO) MUL t31, b3, t31 @@ -1493,11 +1493,11 @@ dsll TEMP, K, BASE_SHIFT # mr=1 dsubu AORIG, AORIG, TEMP - + dsll L, KK, BASE_SHIFT daddu AO, AORIG, L # AO point to the rectangular data part - daddu BO, B, L + daddu BO, B, L dsubu TEMP, K, KK @@ -1508,10 +1508,10 @@ dsra L, TEMP, 2 blez L, .L95 nop - + .align 3 .L92: - LD a5, 1 * SIZE(AO) + LD a5, 1 * SIZE(AO) LD b5, 1 * SIZE(BO) MADD t11, t11, a1, b1 # 1st compute @@ -1526,10 +1526,10 @@ MADD t11, t11, a3, b3 # 3rd compute - daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr + daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD b1, 0 * SIZE(BO) MADD t11, t11, a7, b7 # 4th compute @@ -1537,7 +1537,7 @@ daddiu L, L, -1 bgtz L, .L92 nop - + .align 3 .L95: @@ -1549,10 +1549,10 @@ .L96: MADD t11, t11, a1, b1 # 3rd compute - daddiu AO, AO, 1 * SIZE # AO += 1mr + daddiu AO, AO, 1 * SIZE # AO += 1mr daddiu BO, BO, 1 * SIZE # BO += 1nr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD b1, 0 * SIZE(BO) daddiu L, L, -1 @@ -1584,7 +1584,7 @@ daddiu KK, KK, -1 -.L90: +.L90: andi I, M, 2 blez I, .L80 NOP @@ -1594,7 +1594,7 @@ dsll TEMP, K, 1+BASE_SHIFT # mr=2 dsubu AORIG, AORIG, TEMP - + dsll L, KK, 1 + BASE_SHIFT dsll TEMP, KK, 0 + BASE_SHIFT @@ -1611,10 +1611,10 @@ dsra L, TEMP, 2 blez L, .L85 nop - + .align 3 .L82: - LD a5, 2 * SIZE(AO) + LD a5, 2 * SIZE(AO) LD a6, 3 * SIZE(AO) LD b5, 1 * SIZE(BO) @@ -1638,10 +1638,10 @@ MADD t11, t11, a3, b3 # 3rd compute MADD t21, t21, a4, b3 - daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr + daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) @@ -1652,7 +1652,7 @@ daddiu L, L, -1 bgtz L, .L82 nop - + .align 3 .L85: @@ -1665,10 +1665,10 @@ MADD t11, t11, a1, b1 # 3rd compute MADD t21, t21, a2, b1 - daddiu AO, AO, 2 * SIZE # AO += 2mr + daddiu AO, AO, 2 * SIZE # AO += 2mr daddiu BO, BO, 1 * SIZE # BO += 1nr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) @@ -1692,11 +1692,11 @@ SUB t11, b1, t11 SUB t21, b2, t21 - LD b1, 3 * SIZE(AO) # computes the triangular_part + LD b1, 3 * SIZE(AO) # computes the triangular_part LD b2, 2 * SIZE(AO) MUL t21, b1, t21 NMSUB t11, t11, b2, t21 - + LD b3, 0 * SIZE(AO) MUL t11, b3, t11 @@ -1709,8 +1709,8 @@ ST t21, 1 * SIZE(CO1) daddiu KK, KK, -2 - - + + .align 3 .L80: dsra I, M, 2 @@ -1748,7 +1748,7 @@ .align 3 .L72: - LD a5, 4 * SIZE(AO) + LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) @@ -1784,10 +1784,10 @@ MADD t31, t31, a3, b3 MADD t41, t41, a4, b3 - daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) @@ -1802,7 +1802,7 @@ daddiu L, L, -1 bgtz L, .L72 nop - + .align 3 .L75: @@ -1817,10 +1817,10 @@ MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 - daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu AO, AO, 4 * SIZE # AO += 4mr daddiu BO, BO, 1 * SIZE # BO += 1nr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) @@ -1850,7 +1850,7 @@ SUB t41, b4, t41 LD b1, 15 * SIZE(AO) - LD b2, 14 * SIZE(AO) + LD b2, 14 * SIZE(AO) LD b4, 13 * SIZE(AO) LD b7, 12 * SIZE(AO) MUL t41, b1, t41 @@ -1860,7 +1860,7 @@ - LD b3, 10 * SIZE(AO) + LD b3, 10 * SIZE(AO) LD b5, 9 * SIZE(AO) LD b8, 8 * SIZE(AO) MUL t31, b3, t31 diff --git a/kernel/mips64/trsm_kernel_LT.S b/kernel/mips64/trsm_kernel_LT.S index 824e0457b..57f48c5c6 100644 --- a/kernel/mips64/trsm_kernel_LT.S +++ b/kernel/mips64/trsm_kernel_LT.S @@ -104,7 +104,7 @@ #define ALPHA $f15 PROLOGUE - + daddiu $sp, $sp, -144 SDARG $16, 0($sp) @@ -1686,7 +1686,7 @@ bgtz J, .L10 NOP .align 3 - + .L30: andi J, N, 4 blez J, .L50 diff --git a/kernel/mips64/trsm_kernel_LT_loongson3a.S b/kernel/mips64/trsm_kernel_LT_loongson3a.S index 4114d94ef..b06269ce2 100644 --- a/kernel/mips64/trsm_kernel_LT_loongson3a.S +++ b/kernel/mips64/trsm_kernel_LT_loongson3a.S @@ -70,7 +70,7 @@ #define ALPHA $f15 PROLOGUE - + daddiu $sp, $sp, -144 SDARG $16, 0($sp) @@ -97,7 +97,7 @@ sdc1 $f23,136($sp) #endif # LT compute from left to right, top to bottom - LDARG OFFSET, 144($sp) + LDARG OFFSET, 144($sp) dsll LDC, LDC, BASE_SHIFT # ldc dsra J, N, 2 # j = nc/4 @@ -122,7 +122,7 @@ dsra I, M, 2 # i = mc/4 move KK, OFFSET # kk is the length of the rectangular data part of panel Ai - move AO, A # reset A + move AO, A # reset A daddu C, CO4, LDC # fixed pointer C, the write back address blez I, .L20 nop @@ -137,7 +137,7 @@ LD b1, 0 * SIZE(B) # get 4b LD b2, 1 * SIZE(B) LD b3, 2 * SIZE(B) - LD b4, 3 * SIZE(B) + LD b4, 3 * SIZE(B) MOV t13, t11 # clear result registers MOV t23, t11 @@ -155,7 +155,7 @@ .align 3 .L12: - LD a5, 4 * SIZE(AO) + LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) @@ -183,7 +183,7 @@ MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 MADD t34, t34, a3, b4 - MADD t44, t44, a4, b4 + MADD t44, t44, a4, b4 LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) @@ -213,7 +213,7 @@ MADD t14, t14, a5, b8 MADD t24, t24, a6, b8 MADD t34, t34, a7, b8 - MADD t44, t44, a8, b8 + MADD t44, t44, a8, b8 LD a5, 12 * SIZE(AO) LD a6, 13 * SIZE(AO) @@ -243,12 +243,12 @@ MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 MADD t34, t34, a3, b4 - MADD t44, t44, a4, b4 + MADD t44, t44, a4, b4 - daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) @@ -276,12 +276,12 @@ MADD t14, t14, a5, b8 MADD t24, t24, a6, b8 MADD t34, t34, a7, b8 - MADD t44, t44, a8, b8 + MADD t44, t44, a8, b8 daddiu L, L, -1 bgtz L, .L12 nop - + .align 3 .L15: @@ -291,7 +291,7 @@ .align 3 .L16: - MADD t11, t11, a1, b1 + MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 @@ -309,12 +309,12 @@ MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 MADD t34, t34, a3, b4 - MADD t44, t44, a4, b4 + MADD t44, t44, a4, b4 - daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu AO, AO, 4 * SIZE # AO += 4mr daddiu BO, BO, 4 * SIZE # BO += 4nr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) @@ -344,7 +344,7 @@ LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) - + SUB t21, b5, t21 SUB t22, b6, t22 SUB t23, b7, t23 @@ -354,12 +354,12 @@ LD b2, 9 * SIZE(BO) LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) - + SUB t31, b1, t31 SUB t32, b2, t32 SUB t33, b3, t33 SUB t34, b4, t34 - + LD b5, 12 * SIZE(BO) LD b6, 13 * SIZE(BO) LD b7, 14 * SIZE(BO) @@ -371,7 +371,7 @@ SUB t44, b8, t44 - LD a1, 0 * SIZE(AO) # sa stores in col major + LD a1, 0 * SIZE(AO) # sa stores in col major LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) @@ -392,7 +392,7 @@ NMSUB t43, t43, a4, t13 NMSUB t44, t44, a4, t14 - + LD a5, 5 * SIZE(AO) LD a6, 6 * SIZE(AO) LD a7, 7 * SIZE(AO) @@ -445,7 +445,7 @@ ST t43, 14 * SIZE(BO) ST t44, 15 * SIZE(BO) - ST t11, 0 * SIZE(CO1) # write back + ST t11, 0 * SIZE(CO1) # write back ST t21, 1 * SIZE(CO1) ST t31, 2 * SIZE(CO1) ST t41, 3 * SIZE(CO1) @@ -491,7 +491,7 @@ .align 3 .L20: - andi I, M, 2 # mr=2,nr=4 + andi I, M, 2 # mr=2,nr=4 blez I, .L50 nop @@ -510,7 +510,7 @@ LD b1, 0 * SIZE(B) # get 4b LD b2, 1 * SIZE(B) LD b3, 2 * SIZE(B) - LD b4, 3 * SIZE(B) + LD b4, 3 * SIZE(B) dsra L, KK, 2 blez L, .L25 @@ -519,7 +519,7 @@ .align 3 .L22: - LD a5, 2 * SIZE(AO) + LD a5, 2 * SIZE(AO) LD a6, 3 * SIZE(AO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) @@ -567,10 +567,10 @@ MADD t14, t14, a3, b4 MADD t24, t24, a4, b4 - daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr + daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) @@ -596,7 +596,7 @@ andi L, KK, 3 blez L, .L28 nop - + .align 3 .L26: MADD t11, t11, a1, b1 # 3rd compute @@ -608,10 +608,10 @@ MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 - daddiu AO, AO, 2 * SIZE # AO += 2mr + daddiu AO, AO, 2 * SIZE # AO += 2mr daddiu BO, BO, 4 * SIZE # BO += 4nr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) @@ -643,7 +643,7 @@ SUB t24, b8, t24 - LD b1, 0 * SIZE(AO) # computes the triangular_part + LD b1, 0 * SIZE(AO) # computes the triangular_part LD b2, 1 * SIZE(AO) MUL t11, b1, t11 MUL t12, b1, t12 @@ -653,7 +653,7 @@ NMSUB t22, t22, b2, t12 NMSUB t23, t23, b2, t13 NMSUB t24, t24, b2, t14 - + LD b3, 3 * SIZE(AO) MUL t21, b3, t21 MUL t22, b3, t22 @@ -705,7 +705,7 @@ .align 3 .L50: - andi I, M, 1 # mr=1,nr=4 + andi I, M, 1 # mr=1,nr=4 blez I, .L29 nop @@ -723,7 +723,7 @@ LD b1, 0 * SIZE(B) # get 4b LD b2, 1 * SIZE(B) LD b3, 2 * SIZE(B) - LD b4, 3 * SIZE(B) + LD b4, 3 * SIZE(B) dsra L, KK, 2 blez L, .L55 @@ -732,7 +732,7 @@ .align 3 .L52: - LD a5, 1 * SIZE(AO) + LD a5, 1 * SIZE(AO) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) LD b7, 6 * SIZE(BO) @@ -765,10 +765,10 @@ MADD t13, t13, a3, b3 MADD t14, t14, a3, b4 - daddiu AO, AO, 4 * SIZE # AO += mr*4kr + daddiu AO, AO, 4 * SIZE # AO += mr*4kr daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) @@ -789,7 +789,7 @@ andi L, KK, 3 blez L, .L58 nop - + .align 3 .L56: MADD t11, t11, a1, b1 # 3rd compute @@ -797,10 +797,10 @@ MADD t13, t13, a1, b3 MADD t14, t14, a1, b4 - daddiu AO, AO, 1 * SIZE # AO += 2mr + daddiu AO, AO, 1 * SIZE # AO += 2mr daddiu BO, BO, 4 * SIZE # BO += 4nr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) LD b3, 2 * SIZE(BO) @@ -823,7 +823,7 @@ SUB t14, b4, t14 - LD b1, 0 * SIZE(AO) # computes the triangular_part + LD b1, 0 * SIZE(AO) # computes the triangular_part MUL t11, b1, t11 MUL t12, b1, t12 MUL t13, b1, t13 @@ -858,8 +858,8 @@ move B, BO # fixed panel Bj bgtz J, .L10 nop - - + + .align 3 .L30: andi J, N, 2 # nr=2 @@ -874,9 +874,9 @@ MOV t31, t11 MOV t41, t11 - move KK, OFFSET + move KK, OFFSET move AO, A # reset A - daddu C, CO2, LDC # fixed + daddu C, CO2, LDC # fixed dsra I, M, 2 # I = mc/4 blez I, .L40 @@ -902,7 +902,7 @@ .align 3 .L32: - LD a5, 4 * SIZE(AO) + LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) @@ -950,10 +950,10 @@ MADD t32, t32, a3, b4 MADD t42, t42, a4, b4 - daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) @@ -973,30 +973,30 @@ bgtz L, .L32 nop - + .align 3 .L35: andi L, KK, 3 blez L, .L38 nop - + .align 3 .L36: MADD t11, t11, a1, b1 # 3rd compute MADD t21, t21, a2, b1 MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 - + MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 MADD t32, t32, a3, b2 MADD t42, t42, a4, b2 - daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu AO, AO, 4 * SIZE # AO += 4mr daddiu BO, BO, 2 * SIZE # BO += 2nr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) @@ -1027,7 +1027,7 @@ SUB t41, b7, t41 SUB t42, b8, t42 - LD a1, 0 * SIZE(AO) # sa stores in col major + LD a1, 0 * SIZE(AO) # sa stores in col major LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) @@ -1040,7 +1040,7 @@ NMSUB t41, t41, a4, t11 NMSUB t42, t42, a4, t12 - + LD a5, 5 * SIZE(AO) LD a6, 6 * SIZE(AO) LD a7, 7 * SIZE(AO) @@ -1091,7 +1091,7 @@ daddu AO, AO, L # move AO to the end of Ai daddu BO, BO, TEMP - daddiu KK, KK, 4 # + daddiu KK, KK, 4 # MTC $0, a1 MOV t11, a1 @@ -1122,12 +1122,12 @@ dsra L, KK, 2 blez L, .L45 - move BO, B # reset B + move BO, B # reset B .align 3 .L42: - LD a5, 2 * SIZE(AO) + LD a5, 2 * SIZE(AO) LD a6, 3 * SIZE(AO) LD b5, 2 * SIZE(BO) LD b6, 3 * SIZE(BO) @@ -1157,10 +1157,10 @@ MADD t12, t12, a3, b4 MADD t22, t22, a4, b4 - daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr + daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) @@ -1173,14 +1173,14 @@ daddiu L, L, -1 bgtz L, .L42 nop - + .align 3 .L45: andi L, KK, 3 blez L, .L48 nop - + .align 3 .L46: MADD t11, t11, a1, b1 # 3rd compute @@ -1188,10 +1188,10 @@ MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 - daddiu AO, AO, 2 * SIZE # AO += 2mr + daddiu AO, AO, 2 * SIZE # AO += 2mr daddiu BO, BO, 2 * SIZE # BO += 2nr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) @@ -1211,13 +1211,13 @@ SUB t21, b3, t21 SUB t22, b4, t22 - LD b1, 0 * SIZE(AO) # computes the triangular_part + LD b1, 0 * SIZE(AO) # computes the triangular_part LD b2, 1 * SIZE(AO) MUL t11, b1, t11 MUL t12, b1, t12 NMSUB t21, t21, b2, t11 NMSUB t22, t22, b2, t12 - + LD b3, 3 * SIZE(AO) MUL t21, b3, t21 MUL t22, b3, t22 @@ -1266,12 +1266,12 @@ dsra L, KK, 2 blez L, .L65 - move BO, B # reset B + move BO, B # reset B .align 3 .L62: - LD a5, 1 * SIZE(AO) + LD a5, 1 * SIZE(AO) LD b5, 2 * SIZE(BO) LD b6, 3 * SIZE(BO) @@ -1292,10 +1292,10 @@ MADD t11, t11, a3, b3 # 3rd compute MADD t12, t12, a3, b4 - daddiu AO, AO, 4 * SIZE # AO += mr*4kr + daddiu AO, AO, 4 * SIZE # AO += mr*4kr daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) @@ -1305,23 +1305,23 @@ daddiu L, L, -1 bgtz L, .L62 nop - + .align 3 .L65: andi L, KK, 3 blez L, .L68 nop - + .align 3 .L66: MADD t11, t11, a1, b1 # 3rd compute MADD t12, t12, a1, b2 - daddiu AO, AO, 1 * SIZE # AO += 1mr + daddiu AO, AO, 1 * SIZE # AO += 1mr daddiu BO, BO, 2 * SIZE # BO += 2nr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) @@ -1336,7 +1336,7 @@ SUB t11, b1, t11 SUB t12, b2, t12 - LD b1, 0 * SIZE(AO) # computes the triangular_part + LD b1, 0 * SIZE(AO) # computes the triangular_part MUL t11, b1, t11 MUL t12, b1, t12 @@ -1360,7 +1360,7 @@ .align 3 .L49: move B, BO - + .align 3 .L70: @@ -1396,7 +1396,7 @@ .align 3 .L72: - LD a5, 4 * SIZE(AO) + LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) @@ -1432,10 +1432,10 @@ MADD t31, t31, a3, b3 MADD t41, t41, a4, b3 - daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) @@ -1450,7 +1450,7 @@ daddiu L, L, -1 bgtz L, .L72 nop - + .align 3 .L75: @@ -1465,10 +1465,10 @@ MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 - daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu AO, AO, 4 * SIZE # AO += 4mr daddiu BO, BO, 1 * SIZE # BO += 1nr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) @@ -1490,7 +1490,7 @@ SUB t31, b3, t31 SUB t41, b4, t41 - LD a1, 0 * SIZE(AO) # sa stores in col major + LD a1, 0 * SIZE(AO) # sa stores in col major LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) @@ -1556,10 +1556,10 @@ dsra L, KK, 2 blez L, .L85 move BO, B - + .align 3 .L82: - LD a5, 2 * SIZE(AO) + LD a5, 2 * SIZE(AO) LD a6, 3 * SIZE(AO) LD b5, 1 * SIZE(BO) @@ -1583,10 +1583,10 @@ MADD t11, t11, a3, b3 # 3rd compute MADD t21, t21, a4, b3 - daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr + daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) @@ -1597,7 +1597,7 @@ daddiu L, L, -1 bgtz L, .L82 nop - + .align 3 .L85: @@ -1610,10 +1610,10 @@ MADD t11, t11, a1, b1 # 3rd compute MADD t21, t21, a2, b1 - daddiu AO, AO, 2 * SIZE # AO += 2mr + daddiu AO, AO, 2 * SIZE # AO += 2mr daddiu BO, BO, 1 * SIZE # BO += 1nr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) @@ -1630,11 +1630,11 @@ SUB t11, b1, t11 SUB t21, b2, t21 - LD b1, 0 * SIZE(AO) # computes the triangular_part + LD b1, 0 * SIZE(AO) # computes the triangular_part LD b2, 1 * SIZE(AO) MUL t11, b1, t11 NMSUB t21, t21, b2, t11 - + LD b3, 3 * SIZE(AO) MUL t21, b3, t21 @@ -1646,7 +1646,7 @@ daddiu CO1, CO1, 2 * SIZE - + dsubu TEMP, K, KK dsll L, TEMP, 1 + BASE_SHIFT dsll TEMP, TEMP, 0 + BASE_SHIFT @@ -1655,7 +1655,7 @@ daddiu KK, KK, 2 - + .align 3 .L90: andi I, M, 1 # mr=1 @@ -1670,10 +1670,10 @@ dsra L, KK, 2 blez L, .L95 move BO, B - + .align 3 .L92: - LD a5, 1 * SIZE(AO) + LD a5, 1 * SIZE(AO) LD b5, 1 * SIZE(BO) MADD t11, t11, a1, b1 # 1st compute @@ -1688,10 +1688,10 @@ MADD t11, t11, a3, b3 # 3rd compute - daddiu AO, AO, 4 * SIZE # AO += 2mr*4kr + daddiu AO, AO, 4 * SIZE # AO += 2mr*4kr daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD b1, 0 * SIZE(BO) MADD t11, t11, a7, b7 # 4th compute @@ -1699,7 +1699,7 @@ daddiu L, L, -1 bgtz L, .L92 nop - + .align 3 .L95: andi L, KK, 3 @@ -1710,10 +1710,10 @@ .L96: MADD t11, t11, a1, b1 # 3rd compute - daddiu AO, AO, 1 * SIZE # AO += 2mr + daddiu AO, AO, 1 * SIZE # AO += 2mr daddiu BO, BO, 1 * SIZE # BO += 1nr - - LD a1, 0 * SIZE(AO) # next + + LD a1, 0 * SIZE(AO) # next LD b1, 0 * SIZE(BO) daddiu L, L, -1 @@ -1726,16 +1726,16 @@ SUB t11, b1, t11 - LD b1, 0 * SIZE(AO) # computes the triangular_part + LD b1, 0 * SIZE(AO) # computes the triangular_part MUL t11, b1, t11 - + ST t11, 0 * SIZE(BO) ST t11, 0 * SIZE(CO1) daddiu CO1, CO1, 1 * SIZE - + dsubu TEMP, K, KK dsll L, TEMP, BASE_SHIFT dsll TEMP, TEMP, BASE_SHIFT diff --git a/kernel/mips64/trsm_kernel_RN_loongson3a.S b/kernel/mips64/trsm_kernel_RN_loongson3a.S index 790d7c981..0827bf7bf 100644 --- a/kernel/mips64/trsm_kernel_RN_loongson3a.S +++ b/kernel/mips64/trsm_kernel_RN_loongson3a.S @@ -70,7 +70,7 @@ #define t44 $f25 PROLOGUE - + daddiu $sp, $sp, -144 SDARG $16, 0($sp) @@ -104,19 +104,19 @@ neg KK, OFFSET # for RN OFFSET always 0 - dsra J, N, 2 # J = NC/4 + dsra J, N, 2 # J = NC/4 blez J, .L30 NOP .L10: daddiu J, J, -1 - + move CO1, C daddu CO2, C, LDC daddu CO3, CO2, LDC daddu CO4, CO3, LDC - - move AO, A # A is the retangular matrix and B is the trigular matrix + + move AO, A # A is the retangular matrix and B is the trigular matrix daddu C, CO4, LDC # Fixed pointer C dsra I, M, 2 # I=MC/4 @@ -139,14 +139,14 @@ MOV t23, t11 MOV t33, t11 MOV t43, t11 - + MOV t14, t11 MOV t24, t11 MOV t34, t11 MOV t44, t11 - + LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa - LD a2, 1 * SIZE(AO) # get 4 a + LD a2, 1 * SIZE(AO) # get 4 a LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) @@ -188,7 +188,7 @@ MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 MADD t34, t34, a3, b4 - MADD t44, t44, a4, b4 # fisrt + MADD t44, t44, a4, b4 # fisrt LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) @@ -250,9 +250,9 @@ MADD t34, t34, a3, b4 MADD t44, t44, a4, b4 # third - daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr daddiu BO, BO, 16 * SIZE # BP += 4nr*4kr - + LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) @@ -313,11 +313,11 @@ MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 MADD t34, t34, a3, b4 - MADD t44, t44, a4, b4 + MADD t44, t44, a4, b4 - daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu AO, AO, 4 * SIZE # AO += 4mr daddiu BO, BO, 4 * SIZE # BP += 4nr - + LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) @@ -337,7 +337,7 @@ .L18: # .L18 always deal with the trigular data part LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix LD b2, 1 * SIZE(AO) # Fixed results - LD b3, 2 * SIZE(AO) + LD b3, 2 * SIZE(AO) LD b4, 3 * SIZE(AO) # sa stored as col major SUB t11, b1, t11 @@ -398,7 +398,7 @@ NMSUB t34, t34, b4, t31 NMSUB t44, t44, b4, t41 - + LD b5, 5 * SIZE(BO) LD b6, 6 * SIZE(BO) LD b7, 7 * SIZE(BO) @@ -463,17 +463,17 @@ ST t21, 1 * SIZE(CO1) ST t31, 2 * SIZE(CO1) ST t41, 3 * SIZE(CO1) - + ST t12, 0 * SIZE(CO2) ST t22, 1 * SIZE(CO2) ST t32, 2 * SIZE(CO2) ST t42, 3 * SIZE(CO2) - + ST t13, 0 * SIZE(CO3) ST t23, 1 * SIZE(CO3) ST t33, 2 * SIZE(CO3) ST t43, 3 * SIZE(CO3) - + ST t14, 0 * SIZE(CO4) ST t24, 1 * SIZE(CO4) ST t34, 2 * SIZE(CO4) @@ -496,7 +496,7 @@ NOP .align 3 -.L20: +.L20: andi I, M, 2 # mr=2 blez I, .L50 nop @@ -515,14 +515,14 @@ MOV t23, t11 MOV t33, t11 MOV t43, t11 - + MOV t14, t11 MOV t24, t11 MOV t34, t11 MOV t44, t11 - + LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa - LD a2, 1 * SIZE(AO) # get 4 a + LD a2, 1 * SIZE(AO) # get 4 a LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj LD b2, 1 * SIZE(B) # get 4 b @@ -594,9 +594,9 @@ MADD t14, t14, a3, b4 MADD t24, t24, a4, b4 - daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr + daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr daddiu BO, BO, 16 * SIZE # BP += 4nr*4kr - + LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) @@ -641,9 +641,9 @@ MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 - daddiu AO, AO, 2 * SIZE # AO += 2mr + daddiu AO, AO, 2 * SIZE # AO += 2mr daddiu BO, BO, 4 * SIZE # BP += 4nr - + LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) @@ -698,7 +698,7 @@ NMSUB t14, t14, b4, t11 NMSUB t24, t24, b4, t21 - + LD b5, 5 * SIZE(BO) LD b6, 6 * SIZE(BO) LD b7, 7 * SIZE(BO) @@ -741,13 +741,13 @@ ST t11, 0 * SIZE(CO1) # write back results ST t21, 1 * SIZE(CO1) - + ST t12, 0 * SIZE(CO2) ST t22, 1 * SIZE(CO2) - + ST t13, 0 * SIZE(CO3) ST t23, 1 * SIZE(CO3) - + ST t14, 0 * SIZE(CO4) ST t24, 1 * SIZE(CO4) @@ -764,7 +764,7 @@ daddu BO, BO, TEMP # move BO to the end of this panel .align 3 -.L50: +.L50: andi I, M, 1 # mr=1 blez I, .L29 nop @@ -783,12 +783,12 @@ MOV t23, t11 MOV t33, t11 MOV t43, t11 - + MOV t14, t11 MOV t24, t11 MOV t34, t11 MOV t44, t11 - + LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj @@ -837,9 +837,9 @@ MADD t13, t13, a3, b3 MADD t14, t14, a3, b4 - daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr + daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr daddiu BO, BO, 16 * SIZE # BP += 4nr*4kr - + LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) @@ -869,9 +869,9 @@ MADD t13, t13, a1, b3 MADD t14, t14, a1, b4 - daddiu AO, AO, 1 * SIZE # AO += 1mr + daddiu AO, AO, 1 * SIZE # AO += 1mr daddiu BO, BO, 4 * SIZE # BP += 4nr - + LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) @@ -907,7 +907,7 @@ NMSUB t13, t13, b3, t11 NMSUB t14, t14, b4, t11 - + LD b5, 5 * SIZE(BO) LD b6, 6 * SIZE(BO) LD b7, 7 * SIZE(BO) @@ -954,13 +954,13 @@ .align 3 .L29: move B, BO # change to next panel of Bj - daddiu KK, KK, 4 # rectangular data length increase by 4 + daddiu KK, KK, 4 # rectangular data length increase by 4 bgtz J, .L10 NOP .align 3 - + .L30: andi J, N, 2 blez J, .L70 @@ -968,8 +968,8 @@ move CO1, C daddu CO2, C, LDC - - move AO, A # A is the retangular matrix and B is the trigular matrix + + move AO, A # A is the retangular matrix and B is the trigular matrix daddu C, CO2, LDC # Fixed pointer C dsra I, M, 2 # I=MC/4 @@ -989,7 +989,7 @@ MOV t42, t11 LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa - LD a2, 1 * SIZE(AO) # get 4 a + LD a2, 1 * SIZE(AO) # get 4 a LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) @@ -1055,9 +1055,9 @@ MADD t32, t32, a3, b4 MADD t42, t42, a4, b4 - daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr daddiu BO, BO, 8 * SIZE # BP += 2nr*4kr - + LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) @@ -1098,9 +1098,9 @@ MADD t32, t32, a3, b2 MADD t42, t42, a4, b2 - daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu AO, AO, 4 * SIZE # AO += 4mr daddiu BO, BO, 2 * SIZE # BP += 2nr - + LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) @@ -1118,7 +1118,7 @@ .L38: # .L38 always deal with the trigular data part LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix LD b2, 1 * SIZE(AO) # Fixed results - LD b3, 2 * SIZE(AO) + LD b3, 2 * SIZE(AO) LD b4, 3 * SIZE(AO) # sa stored as col major SUB t11, b1, t11 @@ -1147,7 +1147,7 @@ NMSUB t22, t22, b2, t21 NMSUB t32, t32, b2, t31 NMSUB t42, t42, b2, t41 - + LD b5, 3 * SIZE(BO) MUL t12, b5, t12 MUL t22, b5, t22 @@ -1169,7 +1169,7 @@ ST t21, 1 * SIZE(CO1) ST t31, 2 * SIZE(CO1) ST t41, 3 * SIZE(CO1) - + ST t12, 0 * SIZE(CO2) ST t22, 1 * SIZE(CO2) ST t32, 2 * SIZE(CO2) @@ -1201,7 +1201,7 @@ MOV t22, t11 LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa - LD a2, 1 * SIZE(AO) # get 4 a + LD a2, 1 * SIZE(AO) # get 4 a LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj LD b2, 1 * SIZE(B) # get 4 b @@ -1241,9 +1241,9 @@ MADD t12, t12, a3, b4 MADD t22, t22, a4, b4 - daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr + daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr daddiu BO, BO, 8 * SIZE # BP += 2nr*4kr - + LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) @@ -1271,9 +1271,9 @@ MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 - daddiu AO, AO, 2 * SIZE # AO += 2mr + daddiu AO, AO, 2 * SIZE # AO += 2mr daddiu BO, BO, 2 * SIZE # BP += 2nr - + LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD b1, 0 * SIZE(BO) @@ -1305,7 +1305,7 @@ MUL t21, b1, t21 NMSUB t12, t12, b2, t11 NMSUB t22, t22, b2, t21 - + LD b5, 3 * SIZE(BO) MUL t12, b5, t12 MUL t22, b5, t22 @@ -1371,9 +1371,9 @@ MADD t11, t11, a3, b3 MADD t12, t12, a3, b4 - daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr + daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr daddiu BO, BO, 8 * SIZE # BP += 2nr*4kr - + LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) @@ -1396,9 +1396,9 @@ MADD t11, t11, a1, b1 MADD t12, t12, a1, b2 - daddiu AO, AO, 1 * SIZE # AO += mr + daddiu AO, AO, 1 * SIZE # AO += mr daddiu BO, BO, 2 * SIZE # BP += 2nr - + LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) @@ -1421,7 +1421,7 @@ LD b2, 1 * SIZE(BO) MUL t11, b1, t11 NMSUB t12, t12, b2, t11 - + LD b5, 3 * SIZE(BO) MUL t12, b5, t12 @@ -1445,7 +1445,7 @@ .align 3 .L39: move B, BO # change to next panel of Bj - daddiu KK, KK, 2 # rectangular data length increase by 4 + daddiu KK, KK, 2 # rectangular data length increase by 4 @@ -1473,7 +1473,7 @@ MOV t41, t11 LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa - LD a2, 1 * SIZE(AO) # get 4 a + LD a2, 1 * SIZE(AO) # get 4 a LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) @@ -1520,9 +1520,9 @@ MADD t31, t31, a3, b3 MADD t41, t41, a4, b3 - daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr daddiu BO, BO, 4 * SIZE # BP += 1nr*4kr - + LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) @@ -1552,9 +1552,9 @@ MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 - daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu AO, AO, 4 * SIZE # AO += 4mr daddiu BO, BO, 1 * SIZE # BP += 1nr - + LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) @@ -1571,7 +1571,7 @@ .L78: # .L78 always deal with the trigular data part LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix LD b2, 1 * SIZE(AO) # Fixed results - LD b3, 2 * SIZE(AO) + LD b3, 2 * SIZE(AO) LD b4, 3 * SIZE(AO) # sa stored as col major SUB t11, b1, t11 @@ -1596,7 +1596,7 @@ ST t21, 1 * SIZE(CO1) ST t31, 2 * SIZE(CO1) ST t41, 3 * SIZE(CO1) - + daddiu CO1, CO1, 4 * SIZE # fixed address @@ -1621,7 +1621,7 @@ MOV t21, t11 LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa - LD a2, 1 * SIZE(AO) # get 4 a + LD a2, 1 * SIZE(AO) # get 4 a LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj @@ -1654,9 +1654,9 @@ MADD t11, t11, a3, b3 MADD t21, t21, a4, b3 - daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr + daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr daddiu BO, BO, 4 * SIZE # BP += 1nr*4kr - + LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) @@ -1680,9 +1680,9 @@ MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 - daddiu AO, AO, 2 * SIZE # AO += 2mr + daddiu AO, AO, 2 * SIZE # AO += 2mr daddiu BO, BO, 1 * SIZE # BP += 1nr - + LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) @@ -1712,7 +1712,7 @@ ST t11, 0 * SIZE(CO1) # write back results ST t21, 1 * SIZE(CO1) - + daddiu CO1, CO1, 2 * SIZE # fixed address @@ -1754,9 +1754,9 @@ MADD t11, t11, a3, b3 - daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr + daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr daddiu BO, BO, 4 * SIZE # BP += 1nr*4kr - + LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) @@ -1776,9 +1776,9 @@ .L96: MADD t11, t11, a1, b1 - daddiu AO, AO, 1 * SIZE # AO += 2mr + daddiu AO, AO, 1 * SIZE # AO += 2mr daddiu BO, BO, 1 * SIZE # BP += 1nr - + LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) @@ -1801,7 +1801,7 @@ ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute ST t11, 0 * SIZE(CO1) # write back results - + daddiu CO1, CO1, 1 * SIZE # fixed address diff --git a/kernel/mips64/trsm_kernel_RT.S b/kernel/mips64/trsm_kernel_RT.S index 81bbfec0f..adfe081a7 100644 --- a/kernel/mips64/trsm_kernel_RT.S +++ b/kernel/mips64/trsm_kernel_RT.S @@ -104,7 +104,7 @@ #define ALPHA $f15 PROLOGUE - + daddiu $sp, $sp, -144 SDARG $16, 0($sp) @@ -3495,7 +3495,7 @@ bgtz J, .L10 NOP .align 3 - + .L999: diff --git a/kernel/mips64/trsm_kernel_RT_loongson3a.S b/kernel/mips64/trsm_kernel_RT_loongson3a.S index cf20cf9e0..f37611db9 100644 --- a/kernel/mips64/trsm_kernel_RT_loongson3a.S +++ b/kernel/mips64/trsm_kernel_RT_loongson3a.S @@ -70,7 +70,7 @@ #define t44 $f25 PROLOGUE - + daddiu $sp, $sp, -144 SDARG $16, 0($sp) @@ -110,7 +110,7 @@ # Be carefull B has no effeck of mc!! mult N, LDC mflo TEMP - daddu C, C, TEMP # C point to the last colum of blockB + daddu C, C, TEMP # C point to the last colum of blockB dsubu KK, K, OFFSET # KC-KK is the length of rectangular data part of Bj @@ -126,17 +126,17 @@ move CO1, C move AORIG, A - + dsra I, M, 2 blez I, .L80 NOP .L31: # mr=4,nr=1 - dsll L, KK, 2 + BASE_SHIFT # mr=4 + dsll L, KK, 2 + BASE_SHIFT # mr=4 dsll TEMP, KK, BASE_SHIFT # nr=1 - daddu AO, AORIG, L + daddu AO, AORIG, L daddu BO, B, TEMP # BO point to the retangular data part,also reset BO - dsubu TEMP, K, KK # temp = the length of rectangular data part + dsubu TEMP, K, KK # temp = the length of rectangular data part MTC $0, t11 # clear 4 results registers MOV t21, t11 @@ -153,7 +153,7 @@ dsra L, TEMP, 2 # L=(KC-offset)/4 blez L, .L35 NOP - + .align 3 .L32: @@ -193,9 +193,9 @@ MADD t31, t31, a3, b3 MADD t41, t41, a4, b3 - daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr - + LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) @@ -227,9 +227,9 @@ MADD t31, t31, a3, b1 MADD t41, t41, a4, b1 - daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu AO, AO, 4 * SIZE # AO += 4mr daddiu BO, BO, 1 * SIZE # BO += 2nr - + LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) @@ -277,11 +277,11 @@ ST t21, 1 * SIZE(CO1) ST t31, 2 * SIZE(CO1) ST t41, 3 * SIZE(CO1) - + daddiu CO1, CO1, 4 * SIZE # fixed pointer - dsll TEMP, K, 2 + BASE_SHIFT + dsll TEMP, K, 2 + BASE_SHIFT daddu AORIG, AORIG, TEMP # move to next panel Ai daddiu I, I, -1 @@ -295,11 +295,11 @@ blez I, .L90 nop - dsll L, KK, 1 + BASE_SHIFT # mr=2 + dsll L, KK, 1 + BASE_SHIFT # mr=2 dsll TEMP, KK, BASE_SHIFT # nr=1 - daddu AO, AORIG, L + daddu AO, AORIG, L daddu BO, B, TEMP # BO point to the retangular data part,also reset BO - dsubu TEMP, K, KK # temp = the length of rectangular data part + dsubu TEMP, K, KK # temp = the length of rectangular data part MTC $0, t11 # clear 4 results registers MOV t21, t11 @@ -312,7 +312,7 @@ dsra L, TEMP, 2 # L=(KC-offset)/4 blez L, .L85 NOP - + .align 3 .L82: @@ -340,9 +340,9 @@ MADD t11, t11, a3, b3 MADD t21, t21, a4, b3 - daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr - + LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) @@ -368,9 +368,9 @@ MADD t11, t11, a1, b1 MADD t21, t21, a2, b1 - daddiu AO, AO, 2 * SIZE # AO += 2mr + daddiu AO, AO, 2 * SIZE # AO += 2mr daddiu BO, BO, 1 * SIZE # BO += 1nr - + LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) @@ -406,11 +406,11 @@ ST t11, 0 * SIZE(CO1) # write back ST t21, 1 * SIZE(CO1) - + daddiu CO1, CO1, 2 * SIZE # fixed pointer - dsll TEMP, K, 1 + BASE_SHIFT + dsll TEMP, K, 1 + BASE_SHIFT daddu AORIG, AORIG, TEMP # move to next panel Ai @@ -420,11 +420,11 @@ blez I, .L39 nop - dsll L, KK, BASE_SHIFT # mr=1 + dsll L, KK, BASE_SHIFT # mr=1 dsll TEMP, KK, BASE_SHIFT # nr=1 - daddu AO, AORIG, L + daddu AO, AORIG, L daddu BO, B, TEMP # BO point to the retangular data part,also reset BO - dsubu TEMP, K, KK # temp = the length of rectangular data part + dsubu TEMP, K, KK # temp = the length of rectangular data part MTC $0, t11 # clear 4 results registers @@ -434,7 +434,7 @@ dsra L, TEMP, 2 # L=(KC-offset)/4 blez L, .L95 NOP - + .align 3 .L92: @@ -453,9 +453,9 @@ MADD t11, t11, a3, b3 - daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr + daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr - + LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) @@ -477,9 +477,9 @@ .L96: MADD t11, t11, a1, b1 - daddiu AO, AO, 1 * SIZE # AO += 2mr + daddiu AO, AO, 1 * SIZE # AO += 2mr daddiu BO, BO, 1 * SIZE # BO += 1nr - + LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) @@ -508,11 +508,11 @@ ST t11, 0 * SIZE(AO) # updata packed A ST t11, 0 * SIZE(CO1) # write back - + daddiu CO1, CO1, 1 * SIZE # fixed pointer - dsll TEMP, K, BASE_SHIFT + dsll TEMP, K, BASE_SHIFT daddu AORIG, AORIG, TEMP # move to next panel Ai @@ -529,7 +529,7 @@ dsll TEMP, K, 1 + BASE_SHIFT # Kc*2nr move B to the beginning address of Bj dsubu B, B, TEMP - dsll TEMP, LDC, 1 # C + dsll TEMP, LDC, 1 # C dsubu C, C, TEMP move CO1, C @@ -542,11 +542,11 @@ NOP .L51: # mr=4,nr=2 - dsll L, KK, 2 + BASE_SHIFT # mr=4 + dsll L, KK, 2 + BASE_SHIFT # mr=4 dsll TEMP, KK, 1 + BASE_SHIFT # nr=2 - daddu AO, AORIG, L + daddu AO, AORIG, L daddu BO, B, TEMP # BO point to the retangular data part,also reset BO - dsubu TEMP, K, KK # temp = the length of rectangular data part + dsubu TEMP, K, KK # temp = the length of rectangular data part MTC $0, t11 # clear 8 results registers MOV t21, t11 @@ -568,7 +568,7 @@ dsra L, TEMP, 2 # L=(KC-offset)/4 blez L, .L55 NOP - + .align 3 .L52: @@ -626,9 +626,9 @@ MADD t32, t32, a3, b4 MADD t42, t42, a4, b4 - daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr - + LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) @@ -671,9 +671,9 @@ MADD t32, t32, a3, b2 MADD t42, t42, a4, b2 - daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu AO, AO, 4 * SIZE # AO += 4mr daddiu BO, BO, 2 * SIZE # BO += 2nr - + LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) @@ -749,16 +749,16 @@ ST t21, 1 * SIZE(CO1) ST t31, 2 * SIZE(CO1) ST t41, 3 * SIZE(CO1) - + ST t12, 0 * SIZE(CO2) ST t22, 1 * SIZE(CO2) ST t32, 2 * SIZE(CO2) ST t42, 3 * SIZE(CO2) - + daddiu CO1, CO1, 4 * SIZE # fixed pointer daddiu CO2, CO2, 4 * SIZE - dsll TEMP, K, 2 + BASE_SHIFT + dsll TEMP, K, 2 + BASE_SHIFT daddu AORIG, AORIG, TEMP # move to next panel Ai daddiu I, I, -1 @@ -773,11 +773,11 @@ blez I, .L70 nop - dsll L, KK, 1 + BASE_SHIFT # mr=2 + dsll L, KK, 1 + BASE_SHIFT # mr=2 dsll TEMP, KK, 1 + BASE_SHIFT # nr=2 - daddu AO, AORIG, L + daddu AO, AORIG, L daddu BO, B, TEMP # BO point to the retangular data part,also reset BO - dsubu TEMP, K, KK # temp = the length of rectangular data part + dsubu TEMP, K, KK # temp = the length of rectangular data part MTC $0, t11 # clear 8 results registers MOV t21, t11 @@ -793,7 +793,7 @@ dsra L, TEMP, 2 # L=(KC-offset)/4 blez L, .L65 NOP - + .align 3 .L62: @@ -833,9 +833,9 @@ MADD t12, t12, a3, b4 MADD t22, t22, a4, b4 - daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr + daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr - + LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) @@ -868,9 +868,9 @@ MADD t12, t12, a1, b2 MADD t22, t22, a2, b2 - daddiu AO, AO, 2 * SIZE # AO += 2mr + daddiu AO, AO, 2 * SIZE # AO += 2mr daddiu BO, BO, 2 * SIZE # BO += 2nr - + LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) @@ -921,10 +921,10 @@ ST t11, 0 * SIZE(CO1) # write back ST t21, 1 * SIZE(CO1) - + ST t12, 0 * SIZE(CO2) ST t22, 1 * SIZE(CO2) - + daddiu CO1, CO1, 2 * SIZE # fixed pointer daddiu CO2, CO2, 2 * SIZE @@ -939,11 +939,11 @@ blez I, .L59 nop - dsll L, KK, BASE_SHIFT # mr=1 + dsll L, KK, BASE_SHIFT # mr=1 dsll TEMP, KK, 1 + BASE_SHIFT # nr=2 - daddu AO, AORIG, L + daddu AO, AORIG, L daddu BO, B, TEMP # BO point to the retangular data part,also reset BO - dsubu TEMP, K, KK # temp = the length of rectangular data part + dsubu TEMP, K, KK # temp = the length of rectangular data part MTC $0, t11 # clear 8 results registers MOV t12, t11 @@ -956,7 +956,7 @@ dsra L, TEMP, 2 # L=(KC-offset)/4 blez L, .L75 NOP - + .align 3 .L72: @@ -984,9 +984,9 @@ MADD t11, t11, a3, b3 MADD t12, t12, a3, b4 - daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr + daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr - + LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) @@ -1012,9 +1012,9 @@ MADD t11, t11, a1, b1 MADD t12, t12, a1, b2 - daddiu AO, AO, 1 * SIZE # AO += 1mr + daddiu AO, AO, 1 * SIZE # AO += 1mr daddiu BO, BO, 2 * SIZE # BO += 2nr - + LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) @@ -1055,7 +1055,7 @@ ST t11, 0 * SIZE(CO1) # write back ST t12, 0 * SIZE(CO2) - + daddiu CO1, CO1, 1 * SIZE # fixed pointer daddiu CO2, CO2, 1 * SIZE @@ -1070,13 +1070,13 @@ .align 3 .L50: - dsra J, N, 2 # J = NC/4 + dsra J, N, 2 # J = NC/4 blez J, .L999 NOP .L10: dsll TEMP, K, 2 + BASE_SHIFT - dsubu B, B, TEMP # move B to the beginning address of Bj + dsubu B, B, TEMP # move B to the beginning address of Bj dsll TEMP, LDC, 2 dsubu C, C, TEMP # move C to the beginning address of Cj @@ -1087,7 +1087,7 @@ daddu CO2, C, LDC daddu CO3, CO2, LDC daddu CO4, CO3, LDC - + move AORIG, A # reset A dsra I, M, 2 # I=MC/4 @@ -1096,11 +1096,11 @@ .align 3 .L11: - dsll L, KK, 2 + BASE_SHIFT # mr=4 + dsll L, KK, 2 + BASE_SHIFT # mr=4 dsll TEMP, KK, 2 + BASE_SHIFT # nr=4 - daddu AO, AORIG, L + daddu AO, AORIG, L daddu BO, B, TEMP # BO point to the retangular data part,also reset BO - dsubu TEMP, K, KK # temp = the length of rectangular data part + dsubu TEMP, K, KK # temp = the length of rectangular data part MTC $0, t11 # clear 16 results registers MOV t21, t11 @@ -1132,7 +1132,7 @@ dsra L, TEMP, 2 # L=(KC-offset)/4 blez L, .L15 NOP - + .align 3 .L12: @@ -1164,7 +1164,7 @@ MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 MADD t34, t34, a3, b4 - MADD t44, t44, a4, b4 # fisrt + MADD t44, t44, a4, b4 # fisrt LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) @@ -1226,9 +1226,9 @@ MADD t34, t34, a3, b4 MADD t44, t44, a4, b4 # third - daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr + daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr - + LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) @@ -1293,9 +1293,9 @@ MADD t34, t34, a3, b4 MADD t44, t44, a4, b4 # third - daddiu AO, AO, 4 * SIZE # AO += 4mr + daddiu AO, AO, 4 * SIZE # AO += 4mr daddiu BO, BO, 4 * SIZE # BO += 4nr - + LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) LD a3, 2 * SIZE(AO) @@ -1381,7 +1381,7 @@ NMSUB t31, t31, b4, t34 NMSUB t41, t41, b4, t44 - + LD b5, 10 * SIZE(BO) LD b6, 9 * SIZE(BO) LD b7, 8 * SIZE(BO) @@ -1442,17 +1442,17 @@ ST t21, 1 * SIZE(CO1) ST t31, 2 * SIZE(CO1) ST t41, 3 * SIZE(CO1) - + ST t12, 0 * SIZE(CO2) ST t22, 1 * SIZE(CO2) ST t32, 2 * SIZE(CO2) ST t42, 3 * SIZE(CO2) - + ST t13, 0 * SIZE(CO3) ST t23, 1 * SIZE(CO3) ST t33, 2 * SIZE(CO3) ST t43, 3 * SIZE(CO3) - + ST t14, 0 * SIZE(CO4) ST t24, 1 * SIZE(CO4) ST t34, 2 * SIZE(CO4) @@ -1463,7 +1463,7 @@ daddiu CO3, CO3, 4 * SIZE daddiu CO4, CO4, 4 * SIZE - dsll TEMP, K, 2 + BASE_SHIFT + dsll TEMP, K, 2 + BASE_SHIFT daddu AORIG, AORIG, TEMP # move to next panel Ai daddiu I, I, -1 @@ -1476,11 +1476,11 @@ blez I, .L40 NOP - dsll L, KK, 1 + BASE_SHIFT # mr=2 + dsll L, KK, 1 + BASE_SHIFT # mr=2 dsll TEMP, KK, 2 + BASE_SHIFT # nr=4 - daddu AO, AORIG, L + daddu AO, AORIG, L daddu BO, B, TEMP # BO point to the retangular data part,also reset BO - dsubu TEMP, K, KK # temp = the length of rectangular data part + dsubu TEMP, K, KK # temp = the length of rectangular data part MTC $0, t11 # clear 8 results registers MOV t21, t11 @@ -1502,7 +1502,7 @@ dsra L, TEMP, 2 # L=(KC-offset)/4 blez L, .L25 NOP - + .align 3 .L22: @@ -1566,9 +1566,9 @@ MADD t14, t14, a3, b4 MADD t24, t24, a4, b4 - daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr + daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr - + LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) @@ -1615,9 +1615,9 @@ MADD t14, t14, a1, b4 MADD t24, t24, a2, b4 - daddiu AO, AO, 2 * SIZE # AO += 2mr + daddiu AO, AO, 2 * SIZE # AO += 2mr daddiu BO, BO, 4 * SIZE # BO += 4nr - + LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) @@ -1677,7 +1677,7 @@ NMSUB t11, t11, b4, t14 NMSUB t21, t21, b4, t24 - + LD b5, 10 * SIZE(BO) LD b6, 9 * SIZE(BO) LD b7, 8 * SIZE(BO) @@ -1716,13 +1716,13 @@ ST t11, 0 * SIZE(CO1) # write back ST t21, 1 * SIZE(CO1) - + ST t12, 0 * SIZE(CO2) ST t22, 1 * SIZE(CO2) - + ST t13, 0 * SIZE(CO3) ST t23, 1 * SIZE(CO3) - + ST t14, 0 * SIZE(CO4) ST t24, 1 * SIZE(CO4) @@ -1741,11 +1741,11 @@ blez I, .L29 NOP - dsll L, KK, BASE_SHIFT # mr=1 + dsll L, KK, BASE_SHIFT # mr=1 dsll TEMP, KK, 2 + BASE_SHIFT # nr=4 - daddu AO, AORIG, L + daddu AO, AORIG, L daddu BO, B, TEMP # BO point to the retangular data part,also reset BO - dsubu TEMP, K, KK # temp = the length of rectangular data part + dsubu TEMP, K, KK # temp = the length of rectangular data part MTC $0, t11 # clear 4 results registers MOV t12, t11 @@ -1762,7 +1762,7 @@ dsra L, TEMP, 2 # L=(KC-offset)/4 blez L, .L45 NOP - + .align 3 .L42: @@ -1802,9 +1802,9 @@ MADD t13, t13, a3, b3 MADD t14, t14, a3, b4 - daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr + daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr - + LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) @@ -1836,9 +1836,9 @@ MADD t13, t13, a1, b3 MADD t14, t14, a1, b4 - daddiu AO, AO, 1 * SIZE # AO += 2mr + daddiu AO, AO, 1 * SIZE # AO += 2mr daddiu BO, BO, 4 * SIZE # BO += 4nr - + LD a1, 0 * SIZE(AO) LD b1, 0 * SIZE(BO) @@ -1879,7 +1879,7 @@ NMSUB t12, t12, b3, t14 NMSUB t11, t11, b4, t14 - + LD b5, 10 * SIZE(BO) LD b6, 9 * SIZE(BO) LD b7, 8 * SIZE(BO) diff --git a/kernel/mips64/zamax.S b/kernel/mips64/zamax.S index e993867ef..4a836292b 100644 --- a/kernel/mips64/zamax.S +++ b/kernel/mips64/zamax.S @@ -42,7 +42,7 @@ #define N $4 #define X $5 #define INCX $6 - + #define I $2 #define TEMP $3 @@ -70,7 +70,7 @@ #define s4 $f3 PROLOGUE - + #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) diff --git a/kernel/mips64/zamin.S b/kernel/mips64/zamin.S index bd1d509f1..0e3323bc3 100644 --- a/kernel/mips64/zamin.S +++ b/kernel/mips64/zamin.S @@ -42,7 +42,7 @@ #define N $4 #define X $5 #define INCX $6 - + #define I $2 #define TEMP $3 @@ -70,7 +70,7 @@ #define s4 $f3 PROLOGUE - + #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) diff --git a/kernel/mips64/zasum.S b/kernel/mips64/zasum.S index d6dc20584..cd22f9841 100644 --- a/kernel/mips64/zasum.S +++ b/kernel/mips64/zasum.S @@ -42,7 +42,7 @@ #define N $4 #define X $5 #define INCX $6 - + #define I $2 #define TEMP $3 @@ -64,7 +64,7 @@ #define s2 $f1 PROLOGUE - + #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) diff --git a/kernel/mips64/zaxpy.S b/kernel/mips64/zaxpy.S index 8a7b29a76..e7fa22e0e 100644 --- a/kernel/mips64/zaxpy.S +++ b/kernel/mips64/zaxpy.S @@ -86,7 +86,7 @@ #endif PROLOGUE - + LDARG INCY, 0($sp) li TEMP, 2 * SIZE @@ -95,7 +95,7 @@ sdc1 $f20, 0($sp) sdc1 $f21, 8($sp) #endif - + blez N, .L999 dsll INCX, INCX, ZBASE_SHIFT diff --git a/kernel/mips64/zcopy.S b/kernel/mips64/zcopy.S index 5a4ce9c98..8b3177243 100644 --- a/kernel/mips64/zcopy.S +++ b/kernel/mips64/zcopy.S @@ -44,7 +44,7 @@ #define INCX $6 #define Y $7 #define INCY $8 - + #define I $2 #define TEMP $3 @@ -58,7 +58,7 @@ #define a8 $f7 PROLOGUE - + #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) diff --git a/kernel/mips64/zdot.S b/kernel/mips64/zdot.S index c50fe318e..c24dd1234 100644 --- a/kernel/mips64/zdot.S +++ b/kernel/mips64/zdot.S @@ -44,7 +44,7 @@ #define INCX $6 #define Y $7 #define INCY $8 - + #define I $2 #define TEMP $3 @@ -62,9 +62,9 @@ #define s3 $f2 #define s4 $f3 - + PROLOGUE - + #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) diff --git a/kernel/mips64/zgemm3m_kernel.S b/kernel/mips64/zgemm3m_kernel.S index 14bb7469c..d3fbcf1fd 100644 --- a/kernel/mips64/zgemm3m_kernel.S +++ b/kernel/mips64/zgemm3m_kernel.S @@ -106,7 +106,7 @@ #define ALPHA_I $f16 PROLOGUE - + daddiu $sp, $sp, -128 SDARG $16, 0($sp) @@ -859,7 +859,7 @@ bgtz J, .L10 move B, BO .align 3 - + .L30: andi J, N, 4 blez J, .L50 @@ -1540,7 +1540,7 @@ LD $f1, 1 * SIZE(CO1) LD $f2, 2 * SIZE(CO1) LD $f3, 3 * SIZE(CO1) - + ADD c11, c11, c21 daddiu I, I, -1 ADD c12, c12, c22 diff --git a/kernel/mips64/zgemm_kernel.S b/kernel/mips64/zgemm_kernel.S index c48519c33..b9ac3b544 100644 --- a/kernel/mips64/zgemm_kernel.S +++ b/kernel/mips64/zgemm_kernel.S @@ -130,7 +130,7 @@ #endif PROLOGUE - + LDARG LDC, 0($sp) daddiu $sp, $sp, -128 @@ -759,7 +759,7 @@ bgtz J, .L10 move B, BO .align 3 - + .L20: andi J, N, 2 MTC $0, c11 diff --git a/kernel/mips64/zgemm_kernel_loongson3a_2x2.S b/kernel/mips64/zgemm_kernel_loongson3a_2x2.S index a8faad2f6..ab673650f 100644 --- a/kernel/mips64/zgemm_kernel_loongson3a_2x2.S +++ b/kernel/mips64/zgemm_kernel_loongson3a_2x2.S @@ -143,7 +143,7 @@ #endif PROLOGUE - + LDARG LDC, 0($sp) daddiu $sp, $sp, -STACKSIZE @@ -188,7 +188,7 @@ move KK, OFFSET #endif - daddiu J, J, -1 + daddiu J, J, -1 dsra I, M, 1 # I=M/2 dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 @@ -223,7 +223,7 @@ MOV c21, c11 MOV c22, c11 gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 - + MOV c23, c11 MOV c24, c11 gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 @@ -235,7 +235,7 @@ FETCH $0, 0 * SIZE(CO1) MOV c33, c11 MOV c34, c11 - + FETCH $0, 4 * SIZE(CO2) MOV c41, c11 MOV c42, c11 @@ -258,7 +258,7 @@ #else - dsra L, K, 2 # Unroll K 4 times + dsra L, K, 2 # Unroll K 4 times move BO, B MTC $0, c11 # Clear results regs @@ -272,7 +272,7 @@ MOV c21, c11 MOV c22, c11 gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 - + MOV c23, c11 MOV c24, c11 gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 @@ -284,7 +284,7 @@ FETCH $0, 0 * SIZE(CO1) MOV c33, c11 MOV c34, c11 - + FETCH $0, 4 * SIZE(CO2) MOV c41, c11 MOV c42, c11 @@ -300,13 +300,13 @@ .align 5 .L12: - gsLQC1(R12, F9, F8, 2) # Unroll K=1 - gsLQC1(R13, F13, F12, 2) - MADD1 c11, c11, a1, b1 # axc A1xB1 + gsLQC1(R12, F9, F8, 2) # Unroll K=1 + gsLQC1(R13, F13, F12, 2) + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd - gsLQC1(R12, F11, F10, 3) - gsLQC1(R13, F16, F15, 3) + gsLQC1(R12, F11, F10, 3) + gsLQC1(R13, F16, F15, 3) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd @@ -329,16 +329,16 @@ MADD2 c42, c42, a4, b3 MADD4 c44, c44, a4, b4 - gsLQC1(R12, F1, F0, 4) # unroll k=2 - gsLQC1(R13, F5, F4, 4) - MADD1 c11, c11, a5, b5 # axc A1xB1 + gsLQC1(R12, F1, F0, 4) # unroll k=2 + gsLQC1(R13, F5, F4, 4) + MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd MADD2 c12, c12, a6, b5 # bxc MADD4 c14, c14, a6, b6 # bxd - gsLQC1(R12, F3, F2, 5) - gsLQC1(R13, F7, F6, 5) + gsLQC1(R12, F3, F2, 5) + gsLQC1(R13, F7, F6, 5) MADD1 c21, c21, a7, b5 # A2xB1 MADD3 c23, c23, a7, b6 @@ -359,12 +359,12 @@ MADD4 c44, c44, a8, b8 gsLQC1(R12, F9, F8, 6) # Unroll K=3 - gsLQC1(R13, F13, F12, 6) - MADD1 c11, c11, a1, b1 # axc A1xB1 + gsLQC1(R13, F13, F12, 6) + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd - gsLQC1(R13, F16, F15, 7) - gsLQC1(R12, F11, F10, 7) + gsLQC1(R13, F16, F15, 7) + gsLQC1(R12, F11, F10, 7) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd @@ -395,7 +395,7 @@ gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 - MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 @@ -442,17 +442,17 @@ .L16: daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd - daddiu PREA, PREA, 4 * SIZE - daddiu PREB, PREB, 4 * SIZE + daddiu PREA, PREA, 4 * SIZE + daddiu PREB, PREB, 4 * SIZE MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 - + MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 @@ -593,9 +593,9 @@ #endif dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 - daddiu CO1,CO1, 4 * SIZE + daddiu CO1,CO1, 4 * SIZE bgtz I, .L11 - daddiu CO2,CO2, 4 * SIZE + daddiu CO2,CO2, 4 * SIZE .align 5 .L30: @@ -620,7 +620,7 @@ gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 MTC $0, c11 # Clear results regs MOV c12, c11 - + gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MOV c13, c11 MOV c14, c11 @@ -642,7 +642,7 @@ dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 1 # MR=1 -#else +#else daddiu TEMP, KK, 2 # NR=2 #endif dsra L, TEMP, 2 @@ -652,13 +652,13 @@ #else gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 - dsra L, K, 2 # Unroll K 4 times + dsra L, K, 2 # Unroll K 4 times move BO, B gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MTC $0, c11 # Clear results regs MOV c12, c11 - + gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 MOV c13, c11 MOV c14, c11 @@ -681,29 +681,29 @@ .L32: gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 - gsLQC1(R13, F13, F12, 2) - MADD1 c11, c11, a1, b1 # axc A1xB1 + gsLQC1(R13, F13, F12, 2) + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd - gsLQC1(R13, F16, F15, 3) + gsLQC1(R13, F16, F15, 3) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd NOP MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 - + FETCH $0, 4 * SIZE(PREB) MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 NOP - gsLQC1(R12, F9, F8, 2) # Unroll K=1 - gsLQC1(R13, F5, F4, 4) - MADD1 c11, c11, a3, b5 # axc A1xB1 + gsLQC1(R12, F9, F8, 2) # Unroll K=1 + gsLQC1(R13, F5, F4, 4) + MADD1 c11, c11, a3, b5 # axc A1xB1 MADD3 c13, c13, a3, b6 # axd - gsLQC1(R13, F7, F6, 5) + gsLQC1(R13, F7, F6, 5) MADD2 c12, c12, a4, b5 # bxc MADD4 c14, c14, a4, b6 # bxd NOP @@ -716,12 +716,12 @@ MADD4 c34, c34, a4, b8 daddiu L, L, -1 - gsLQC1(R12, F11, F10, 3) - gsLQC1(R13, F13, F12, 6) - MADD1 c11, c11, a5, b1 # axc A1xB1 + gsLQC1(R12, F11, F10, 3) + gsLQC1(R13, F13, F12, 6) + MADD1 c11, c11, a5, b1 # axc A1xB1 MADD3 c13, c13, a5, b2 # axd - gsLQC1(R13, F16, F15, 7) + gsLQC1(R13, F16, F15, 7) MADD2 c12, c12, a6, b1 # bxc MADD4 c14, c14, a6, b2 # bxd daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx @@ -736,7 +736,7 @@ gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 - MADD1 c11, c11, a7, b5 # axc A1xB1 + MADD1 c11, c11, a7, b5 # axc A1xB1 MADD3 c13, c13, a7, b6 # axd gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 @@ -767,7 +767,7 @@ .L36: daddiu L, L, -1 - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx @@ -777,8 +777,8 @@ daddiu AO, AO, 2 * SIZE # 2mr*1kr*cmpx MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 - - daddiu PREB, PREB, 4 * SIZE + + daddiu PREB, PREB, 4 * SIZE MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 @@ -819,8 +819,8 @@ ST a3, 0 * SIZE(CO2) ST a4, 1 * SIZE(CO2) - daddiu CO1,CO1, 2 * SIZE - daddiu CO2,CO2, 2 * SIZE + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE #else ADD c11, c14, c11 ADD c12, c13, c12 @@ -845,8 +845,8 @@ ST a3, 0 * SIZE(CO2) ST a4, 1 * SIZE(CO2) - daddiu CO1,CO1, 2 * SIZE - daddiu CO2,CO2, 2 * SIZE + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) @@ -879,7 +879,7 @@ move B, BO .align 5 - + .L20: andi J, N, 1 blez J, .L999 @@ -938,7 +938,7 @@ NOP #else - dsra L, K, 2 # Unroll K 4 times + dsra L, K, 2 # Unroll K 4 times move BO, B gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 @@ -967,31 +967,31 @@ .align 5 .L22: - gsLQC1(R12, F9, F8, 2) # Unroll K=1 - MADD1 c11, c11, a1, b1 # axc A1xB1 + gsLQC1(R12, F9, F8, 2) # Unroll K=1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd - gsLQC1(R12, F11, F10, 3) + gsLQC1(R12, F11, F10, 3) MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 - + FETCH $0, 4 * SIZE(PREA) MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 - gsLQC1(R12, F1, F0, 4) # Unroll K=2 - MADD1 c11, c11, a5, b3 # axc A1xB1 + gsLQC1(R12, F1, F0, 4) # Unroll K=2 + MADD1 c11, c11, a5, b3 # axc A1xB1 MADD3 c13, c13, a5, b4 # axd - gsLQC1(R13, F13, F12, 2) + gsLQC1(R13, F13, F12, 2) MADD2 c12, c12, a6, b3 # bxc MADD4 c14, c14, a6, b4 # bxd - gsLQC1(R12, F3, F2, 5) + gsLQC1(R12, F3, F2, 5) MADD1 c21, c21, a7, b3 # A2xB1 MADD3 c23, c23, a7, b4 @@ -1001,14 +1001,14 @@ daddiu L, L, -1 gsLQC1(R12, F9, F8, 6) # Unroll K=3 - MADD1 c11, c11, a1, b5 # axc A1xB1 + MADD1 c11, c11, a1, b5 # axc A1xB1 MADD3 c13, c13, a1, b6 # axd - gsLQC1(R13, F16, F15, 3) + gsLQC1(R13, F16, F15, 3) MADD2 c12, c12, a2, b5 # bxc MADD4 c14, c14, a2, b6 # bxd - gsLQC1(R12, F11, F10, 7) + gsLQC1(R12, F11, F10, 7) MADD1 c21, c21, a3, b5 # A2xB1 MADD3 c23, c23, a3, b6 daddiu BO, BO, 8 * SIZE # 1nr*4kr*cmpx @@ -1019,9 +1019,9 @@ daddiu AO, AO, 16 * SIZE # 2mr*4kr*cmpx gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 - MADD1 c11, c11, a5, b7 # axc A1xB1 + MADD1 c11, c11, a5, b7 # axc A1xB1 MADD3 c13, c13, a5, b8 # axd - daddiu PREA, PREA, 16 * SIZE + daddiu PREA, PREA, 16 * SIZE gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 MADD2 c12, c12, a6, b7 # bxc @@ -1051,7 +1051,7 @@ .L26: daddiu L, L, -1 - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd daddiu BO, BO, 2 * SIZE # 2nr*1kr*cmpx @@ -1142,7 +1142,7 @@ daddiu KK, KK, 2 #endif #endif - daddiu CO1,CO1, 4 * SIZE + daddiu CO1,CO1, 4 * SIZE bgtz I, .L21 NOP @@ -1184,7 +1184,7 @@ NOP #else - dsra L, K, 2 # Unroll K 4 times + dsra L, K, 2 # Unroll K 4 times move BO, B gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 @@ -1205,34 +1205,34 @@ .L42: gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd - gsLQC1(R12, F9, F8, 2) # Unroll K=1 - MADD1 c11, c11, a3, b3 # axc A1xB1 + gsLQC1(R12, F9, F8, 2) # Unroll K=1 + MADD1 c11, c11, a3, b3 # axc A1xB1 MADD3 c13, c13, a3, b4 # axd - gsLQC1(R13, F13, F12, 2) + gsLQC1(R13, F13, F12, 2) MADD2 c12, c12, a4, b3 # bxc MADD4 c14, c14, a4, b4 # bxd daddiu L, L, -1 - gsLQC1(R12, F11, F10, 3) - MADD1 c11, c11, a5, b5 # axc A1xB1 + gsLQC1(R12, F11, F10, 3) + MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd daddiu AO, AO, 8 * SIZE # 2mr*4kr*cmpx - gsLQC1(R13, F16, F15, 3) + gsLQC1(R13, F16, F15, 3) MADD2 c12, c12, a6, b5 # bxc MADD4 c14, c14, a6, b6 # bxd daddiu BO, BO, 8 * SIZE # 2nr*4kr*cmpx gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 - MADD1 c11, c11, a7, b7 # axc A1xB1 + MADD1 c11, c11, a7, b7 # axc A1xB1 MADD3 c13, c13, a7, b8 # axd gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 @@ -1259,7 +1259,7 @@ daddiu BO, BO, 1 * SIZE * COMPSIZE # 2nr*1kr*cmpx daddiu AO, AO, 1 * SIZE * COMPSIZE # 2mr*1kr*cmpx - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd @@ -1319,7 +1319,7 @@ daddiu KK, KK, 1 #endif - daddiu CO1,CO1, 2 * SIZE + daddiu CO1,CO1, 2 * SIZE #endif diff --git a/kernel/mips64/zgemm_kernel_loongson3b_2x2.S b/kernel/mips64/zgemm_kernel_loongson3b_2x2.S index 5ded7aed0..675cad054 100644 --- a/kernel/mips64/zgemm_kernel_loongson3b_2x2.S +++ b/kernel/mips64/zgemm_kernel_loongson3b_2x2.S @@ -144,7 +144,7 @@ #endif PROLOGUE - + LDARG LDC, 0($sp) daddiu $sp, $sp, -STACKSIZE @@ -190,7 +190,7 @@ move KK, OFFSET #endif - daddiu J, J, -1 + daddiu J, J, -1 dsra I, M, 1 # I=M/2 dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 @@ -228,7 +228,7 @@ LD a3, 2 * SIZE(AO) MOV c22, c11 LD a4, 3 * SIZE(AO) - + MOV c23, c11 LD b3, 2 * SIZE(BO) MOV c24, c11 @@ -241,7 +241,7 @@ FETCH $0, 0 * SIZE(CO1) MOV c33, c11 MOV c34, c11 - + FETCH $0, 4 * SIZE(CO2) MOV c41, c11 MOV c42, c11 @@ -264,7 +264,7 @@ #else - dsra L, K, 2 # Unroll K 4 times + dsra L, K, 2 # Unroll K 4 times move BO, B MTC $0, c11 # Clear results regs @@ -281,7 +281,7 @@ LD a3, 2 * SIZE(AO) MOV c22, c11 LD a4, 3 * SIZE(AO) - + MOV c23, c11 LD b3, 2 * SIZE(BO) MOV c24, c11 @@ -294,7 +294,7 @@ MOV c33, c11 MOV c34, c11 FETCH $0, 0 * SIZE(CO1) - + MOV c41, c11 MOV c42, c11 FETCH $0, 4 * SIZE(CO2) @@ -313,7 +313,7 @@ .L12: LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd LD b5, 4 * SIZE(BO) @@ -346,7 +346,7 @@ LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) - MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd LD b1, 8 * SIZE(BO) @@ -355,7 +355,7 @@ MADD4 c14, c14, a6, b6 # bxd LD a3, 10 * SIZE(AO) - LD a4, 11 * SIZE(AO) + LD a4, 11 * SIZE(AO) MADD1 c21, c21, a7, b5 # A2xB1 MADD3 c23, c23, a7, b6 @@ -379,7 +379,7 @@ LD a5, 12 * SIZE(AO) LD a6, 13 * SIZE(AO) - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd LD b5, 12 * SIZE(BO) @@ -418,7 +418,7 @@ LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) - MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd LD b1, 0 * SIZE(BO) @@ -469,17 +469,17 @@ .L16: daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx daddiu AO, AO, 4 * SIZE # 2mr*1kr*cmpx - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd - daddiu PREA, PREA, 4 * SIZE - daddiu PREB, PREB, 4 * SIZE + daddiu PREA, PREA, 4 * SIZE + daddiu PREB, PREB, 4 * SIZE MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 - + MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 @@ -624,9 +624,9 @@ #endif dsll PREB, K, 1 + ZBASE_SHIFT # PREA=K*2*2^4 - daddiu CO1,CO1, 4 * SIZE + daddiu CO1,CO1, 4 * SIZE bgtz I, .L11 - daddiu CO2,CO2, 4 * SIZE + daddiu CO2,CO2, 4 * SIZE .align 5 .L30: @@ -652,7 +652,7 @@ LD a2, 1 * SIZE(AO) MTC $0, c11 # Clear results regs MOV c12, c11 - + LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MOV c13, c11 @@ -676,7 +676,7 @@ dsubu TEMP, K, KK #elif defined(LEFT) daddiu TEMP, KK, 1 # MR=1 -#else +#else daddiu TEMP, KK, 2 # NR=2 #endif dsra L, TEMP, 2 @@ -687,14 +687,14 @@ LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) - dsra L, K, 2 # Unroll K 4 times + dsra L, K, 2 # Unroll K 4 times move BO, B LD b1, 0 * SIZE(BO) LD b2, 1 * SIZE(BO) MTC $0, c11 # Clear results regs MOV c12, c11 - + LD b3, 2 * SIZE(BO) LD b4, 3 * SIZE(BO) MOV c13, c11 @@ -719,19 +719,19 @@ .L32: LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd - + LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 - + FETCH $0, 4 * SIZE(PREB) MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 @@ -739,14 +739,14 @@ LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) - MADD1 c11, c11, a3, b5 # axc A1xB1 + MADD1 c11, c11, a3, b5 # axc A1xB1 MADD3 c13, c13, a3, b6 # axd LD b1, 8 * SIZE(BO) LD b2, 9 * SIZE(BO) MADD2 c12, c12, a4, b5 # bxc MADD4 c14, c14, a4, b6 # bxd - + LD b3, 10 * SIZE(BO) LD b4, 11 * SIZE(BO) MADD1 c31, c31, a3, b7 # A1xB2 @@ -759,7 +759,7 @@ LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) - MADD1 c11, c11, a5, b1 # axc A1xB1 + MADD1 c11, c11, a5, b1 # axc A1xB1 MADD3 c13, c13, a5, b2 # axd LD b5, 12 * SIZE(BO) @@ -782,7 +782,7 @@ LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) - MADD1 c11, c11, a7, b5 # axc A1xB1 + MADD1 c11, c11, a7, b5 # axc A1xB1 MADD3 c13, c13, a7, b6 # axd LD b1, 0 * SIZE(BO) @@ -818,7 +818,7 @@ .L36: daddiu L, L, -1 - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd daddiu BO, BO, 4 * SIZE # 2nr*1kr*cmpx @@ -828,8 +828,8 @@ daddiu AO, AO, 2 * SIZE # 2mr*1kr*cmpx MADD1 c31, c31, a1, b3 # A1xB2 MADD3 c33, c33, a1, b4 - - daddiu PREB, PREB, 4 * SIZE + + daddiu PREB, PREB, 4 * SIZE MADD2 c32, c32, a2, b3 MADD4 c34, c34, a2, b4 @@ -873,8 +873,8 @@ ST a3, 0 * SIZE(CO2) ST a4, 1 * SIZE(CO2) - daddiu CO1,CO1, 2 * SIZE - daddiu CO2,CO2, 2 * SIZE + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE #else ADD c11, c14, c11 @@ -901,8 +901,8 @@ ST a3, 0 * SIZE(CO2) ST a4, 1 * SIZE(CO2) - daddiu CO1,CO1, 2 * SIZE - daddiu CO2,CO2, 2 * SIZE + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) @@ -935,7 +935,7 @@ move B, BO .align 5 - + .L20: andi J, N, 1 blez J, .L999 @@ -998,7 +998,7 @@ NOP #else - dsra L, K, 2 # Unroll K 4 times + dsra L, K, 2 # Unroll K 4 times move BO, B LD a1, 0 * SIZE(AO) @@ -1032,7 +1032,7 @@ .L22: LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd LD b3, 2 * SIZE(BO) @@ -1044,14 +1044,14 @@ LD a8, 7 * SIZE(AO) MADD1 c21, c21, a3, b1 # A2xB1 MADD3 c23, c23, a3, b2 - + FETCH $0, 4 * SIZE(PREA) MADD2 c22, c22, a4, b1 MADD4 c24, c24, a4, b2 LD a1, 8 * SIZE(AO) LD a2, 9 * SIZE(AO) - MADD1 c11, c11, a5, b3 # axc A1xB1 + MADD1 c11, c11, a5, b3 # axc A1xB1 MADD3 c13, c13, a5, b4 # axd LD b5, 4 * SIZE(BO) @@ -1071,7 +1071,7 @@ LD a5, 12 * SIZE(AO) LD a6, 13 * SIZE(AO) - MADD1 c11, c11, a1, b5 # axc A1xB1 + MADD1 c11, c11, a1, b5 # axc A1xB1 MADD3 c13, c13, a1, b6 # axd LD b7, 6 * SIZE(BO) @@ -1090,11 +1090,11 @@ FETCH $0, 12 * SIZE(PREA) MADD2 c22, c22, a4, b5 MADD4 c24, c24, a4, b6 - daddiu PREA, PREA, 16 * SIZE + daddiu PREA, PREA, 16 * SIZE LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) - MADD1 c11, c11, a5, b7 # axc A1xB1 + MADD1 c11, c11, a5, b7 # axc A1xB1 MADD3 c13, c13, a5, b8 # axd LD b1, 0 * SIZE(BO) @@ -1127,7 +1127,7 @@ .L26: daddiu L, L, -1 - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd daddiu BO, BO, 2 * SIZE # 2nr*1kr*cmpx @@ -1224,7 +1224,7 @@ daddiu KK, KK, 2 #endif #endif - daddiu CO1,CO1, 4 * SIZE + daddiu CO1,CO1, 4 * SIZE bgtz I, .L21 NOP @@ -1270,7 +1270,7 @@ NOP #else - dsra L, K, 2 # Unroll K 4 times + dsra L, K, 2 # Unroll K 4 times move BO, B # gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 @@ -1297,7 +1297,7 @@ # gsLQC1(R12, F3, F2, 1) # R:a3 I:a4 LD a3, 2 * SIZE(AO) LD a4, 3 * SIZE(AO) - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd # gsLQC1(R13, F7, F6, 1) # R:b2 I:b3 @@ -1306,27 +1306,27 @@ MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd -# gsLQC1(R12, F9, F8, 2) # Unroll K=1 +# gsLQC1(R12, F9, F8, 2) # Unroll K=1 LD a5, 4 * SIZE(AO) LD a6, 5 * SIZE(AO) - MADD1 c11, c11, a3, b3 # axc A1xB1 + MADD1 c11, c11, a3, b3 # axc A1xB1 MADD3 c13, c13, a3, b4 # axd -# gsLQC1(R13, F13, F12, 2) +# gsLQC1(R13, F13, F12, 2) LD b5, 4 * SIZE(BO) LD b6, 5 * SIZE(BO) MADD2 c12, c12, a4, b3 # bxc MADD4 c14, c14, a4, b4 # bxd -# gsLQC1(R12, F11, F10, 3) +# gsLQC1(R12, F11, F10, 3) LD a7, 6 * SIZE(AO) LD a8, 7 * SIZE(AO) - MADD1 c11, c11, a5, b5 # axc A1xB1 + MADD1 c11, c11, a5, b5 # axc A1xB1 MADD3 c13, c13, a5, b6 # axd daddiu L, L, -1 -# gsLQC1(R13, F16, F15, 3) +# gsLQC1(R13, F16, F15, 3) LD b7, 6 * SIZE(BO) LD b8, 7 * SIZE(BO) MADD2 c12, c12, a6, b5 # bxc @@ -1338,7 +1338,7 @@ # gsLQC1(R12, F1, F0, 0) # R:a1 I:a2 Unroll K=4 LD a1, 0 * SIZE(AO) LD a2, 1 * SIZE(AO) - MADD1 c11, c11, a7, b7 # axc A1xB1 + MADD1 c11, c11, a7, b7 # axc A1xB1 MADD3 c13, c13, a7, b8 # axd # gsLQC1(R13, F5, F4, 0) # R:b1 I:b2 @@ -1369,7 +1369,7 @@ daddiu BO, BO, 1 * SIZE * COMPSIZE # 2nr*1kr*cmpx daddiu AO, AO, 1 * SIZE * COMPSIZE # 2mr*1kr*cmpx - MADD1 c11, c11, a1, b1 # axc A1xB1 + MADD1 c11, c11, a1, b1 # axc A1xB1 MADD3 c13, c13, a1, b2 # axd MADD2 c12, c12, a2, b1 # bxc MADD4 c14, c14, a2, b2 # bxd @@ -1432,7 +1432,7 @@ daddiu KK, KK, 1 #endif - daddiu CO1,CO1, 2 * SIZE + daddiu CO1,CO1, 2 * SIZE #endif diff --git a/kernel/mips64/zgemv_n.S b/kernel/mips64/zgemv_n.S index c6cc89615..5709102f6 100644 --- a/kernel/mips64/zgemv_n.S +++ b/kernel/mips64/zgemv_n.S @@ -119,7 +119,7 @@ #endif PROLOGUE - + LDARG INCX, 0($sp) LDARG Y, 8($sp) LDARG INCY, 16($sp) diff --git a/kernel/mips64/zgemv_n_loongson3a.c b/kernel/mips64/zgemv_n_loongson3a.c index 3b1b6f73b..f66818caf 100644 --- a/kernel/mips64/zgemv_n_loongson3a.c +++ b/kernel/mips64/zgemv_n_loongson3a.c @@ -1,4 +1,4 @@ -#include "common.h" +#include "common.h" //typedef int BLASLONG; //typedef double FLOAT; diff --git a/kernel/mips64/zgemv_t.S b/kernel/mips64/zgemv_t.S index f7f7fdf99..da702a57d 100644 --- a/kernel/mips64/zgemv_t.S +++ b/kernel/mips64/zgemv_t.S @@ -114,7 +114,7 @@ #endif PROLOGUE - + LDARG INCX, 0($sp) LDARG Y, 8($sp) LDARG INCY, 16($sp) @@ -143,7 +143,7 @@ dsll INCY, INCY, ZBASE_SHIFT li XORIG, 2 * SIZE - + beq INCX, XORIG, .L10 move XORIG, X @@ -449,10 +449,10 @@ .L19: LD a1, 0 * SIZE(Y) LD a2, 1 * SIZE(Y) - daddu Y, Y, INCY + daddu Y, Y, INCY LD a3, 0 * SIZE(Y) LD a4, 1 * SIZE(Y) - daddu Y, Y, INCY + daddu Y, Y, INCY MADD a1, a1, ALPHA_R, y1 MADD a2, a2, ALPHA_I, y1 @@ -468,12 +468,12 @@ ST a1, 0 * SIZE(YY) ST a2, 1 * SIZE(YY) - daddu YY, YY, INCY + daddu YY, YY, INCY ST a3, 0 * SIZE(YY) ST a4, 1 * SIZE(YY) bgtz J, .L11 - daddu YY, YY, INCY + daddu YY, YY, INCY .align 3 .L20: diff --git a/kernel/mips64/zgemv_t_loongson3a.c b/kernel/mips64/zgemv_t_loongson3a.c index 3af44caf2..2fa71cc19 100644 --- a/kernel/mips64/zgemv_t_loongson3a.c +++ b/kernel/mips64/zgemv_t_loongson3a.c @@ -1,4 +1,4 @@ -#include "common.h" +#include "common.h" #define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x)) #define likely(x) __builtin_expect(!!(x), 1) diff --git a/kernel/mips64/znrm2.S b/kernel/mips64/znrm2.S index 1f4a90eac..1c247bca9 100644 --- a/kernel/mips64/znrm2.S +++ b/kernel/mips64/znrm2.S @@ -43,7 +43,7 @@ #define X $5 #define INCX $6 #define XX $7 - + #define I $2 #define TEMP $3 @@ -71,7 +71,7 @@ PROLOGUE - + #ifdef F_INTERFACE LDINT N, 0(N) LDINT INCX, 0(INCX) diff --git a/kernel/mips64/zrot.S b/kernel/mips64/zrot.S index 0a205691c..35f90350f 100644 --- a/kernel/mips64/zrot.S +++ b/kernel/mips64/zrot.S @@ -44,7 +44,7 @@ #define INCX $6 #define Y $7 #define INCY $8 - + #define XX $9 #define YY $10 @@ -70,7 +70,7 @@ #define t4 $f3 PROLOGUE - + dsll INCX, INCX, ZBASE_SHIFT li TEMP, 2 * SIZE diff --git a/kernel/mips64/zscal.S b/kernel/mips64/zscal.S index 3feaf5a05..f11b1c8d0 100644 --- a/kernel/mips64/zscal.S +++ b/kernel/mips64/zscal.S @@ -67,7 +67,7 @@ #define t4 $f11 PROLOGUE - + li TEMP, 2 * SIZE MTC $0, a1 @@ -168,7 +168,7 @@ NOP .align 3 -.L50: +.L50: bne INCX, TEMP, .L60 dsra I, N, 2 diff --git a/kernel/mips64/zswap.S b/kernel/mips64/zswap.S index 663da23ff..84e1b97a8 100644 --- a/kernel/mips64/zswap.S +++ b/kernel/mips64/zswap.S @@ -70,7 +70,7 @@ #define b8 $f15 PROLOGUE - + LDARG INCY, 0($sp) li TEMP, 2 * SIZE diff --git a/kernel/mips64/zsymv_L.S b/kernel/mips64/zsymv_L.S index 65d5ce31b..1c19bc7c6 100644 --- a/kernel/mips64/zsymv_L.S +++ b/kernel/mips64/zsymv_L.S @@ -103,7 +103,7 @@ #endif PROLOGUE - + LDARG INCY, 0($sp) LDARG BUFFER, 8($sp) #ifdef __64BIT__ diff --git a/kernel/mips64/zsymv_U.S b/kernel/mips64/zsymv_U.S index 938d9118c..e972826ab 100644 --- a/kernel/mips64/zsymv_U.S +++ b/kernel/mips64/zsymv_U.S @@ -99,9 +99,9 @@ #define ADD1 MADD #define ADD2 NMSUB #endif - + PROLOGUE - + LDARG INCY, 0($sp) LDARG BUFFER, 8($sp) #ifdef __64BIT__ diff --git a/kernel/mips64/ztrsm_kernel_LT.S b/kernel/mips64/ztrsm_kernel_LT.S index 0e7011815..00a48a6fb 100644 --- a/kernel/mips64/ztrsm_kernel_LT.S +++ b/kernel/mips64/ztrsm_kernel_LT.S @@ -125,7 +125,7 @@ #endif PROLOGUE - + daddiu $sp, $sp, -128 SDARG $16, 0($sp) @@ -988,7 +988,7 @@ bgtz J, .L10 NOP .align 3 - + .L20: andi J, N, 2 blez J, .L30 diff --git a/kernel/mips64/ztrsm_kernel_RT.S b/kernel/mips64/ztrsm_kernel_RT.S index 1fc268466..89bc5467f 100644 --- a/kernel/mips64/ztrsm_kernel_RT.S +++ b/kernel/mips64/ztrsm_kernel_RT.S @@ -125,7 +125,7 @@ #endif PROLOGUE - + daddiu $sp, $sp, -128 SDARG $16, 0($sp) diff --git a/kernel/power/KERNEL.CELL b/kernel/power/KERNEL.CELL index 745e16e89..b17746448 100644 --- a/kernel/power/KERNEL.CELL +++ b/kernel/power/KERNEL.CELL @@ -23,8 +23,8 @@ SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_cell.S diff --git a/kernel/power/KERNEL.POWER5 b/kernel/power/KERNEL.POWER5 index af0960d1f..fbef79e59 100644 --- a/kernel/power/KERNEL.POWER5 +++ b/kernel/power/KERNEL.POWER5 @@ -1,10 +1,10 @@ SGEMMKERNEL = gemm_kernel.S -SGEMMINCOPY = -SGEMMITCOPY = +SGEMMINCOPY = +SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = -SGEMMITCOPYOBJ = +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel.S @@ -17,8 +17,8 @@ DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel.S -CGEMMINCOPY = -CGEMMITCOPY = +CGEMMINCOPY = +CGEMMITCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = diff --git a/kernel/power/KERNEL.POWER6 b/kernel/power/KERNEL.POWER6 index ef5f74472..344b205fe 100644 --- a/kernel/power/KERNEL.POWER6 +++ b/kernel/power/KERNEL.POWER6 @@ -1,6 +1,6 @@ SGEMMKERNEL = gemm_kernel_power6.S -SGEMMINCOPY = -SGEMMITCOPY = +SGEMMINCOPY = +SGEMMITCOPY = SGEMMONCOPY = gemm_ncopy_4.S SGEMMOTCOPY = gemm_tcopy_4.S SGEMMINCOPYOBJ = diff --git a/kernel/power/KERNEL.PPC440 b/kernel/power/KERNEL.PPC440 index 5e2a7f9e4..988a4b701 100644 --- a/kernel/power/KERNEL.PPC440 +++ b/kernel/power/KERNEL.PPC440 @@ -61,12 +61,12 @@ CSCALKERNEL = zscal_ppc440.S ZSCALKERNEL = zscal_ppc440.S SGEMMKERNEL = gemm_kernel_ppc440.S -SGEMMINCOPY = -SGEMMITCOPY = +SGEMMINCOPY = +SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = -SGEMMITCOPYOBJ = +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_ppc440.S @@ -79,8 +79,8 @@ DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_ppc440.S -CGEMMINCOPY = -CGEMMITCOPY = +CGEMMINCOPY = +CGEMMITCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = diff --git a/kernel/power/KERNEL.PPC970 b/kernel/power/KERNEL.PPC970 index bfa43b7e8..7431a7788 100644 --- a/kernel/power/KERNEL.PPC970 +++ b/kernel/power/KERNEL.PPC970 @@ -3,8 +3,8 @@ SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel.S diff --git a/kernel/power/KERNEL.PPCG4 b/kernel/power/KERNEL.PPCG4 index c41df975a..f615754bb 100644 --- a/kernel/power/KERNEL.PPCG4 +++ b/kernel/power/KERNEL.PPCG4 @@ -65,8 +65,8 @@ SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c SGEMMONCOPY = gemm_ncopy_4.S SGEMMOTCOPY = gemm_tcopy_4.S -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_g4.S diff --git a/kernel/power/amax.S b/kernel/power/amax.S index 7fbe39e7f..caa789d4d 100644 --- a/kernel/power/amax.S +++ b/kernel/power/amax.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define PREA r8 diff --git a/kernel/power/amax_cell.S b/kernel/power/amax_cell.S index 3f25e75c7..d2d983667 100644 --- a/kernel/power/amax_cell.S +++ b/kernel/power/amax_cell.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define PREA r8 @@ -640,7 +640,7 @@ LL(28): fsub f16, f0, f8 fsel f0, f16, f0, f8 .align 4 - + LL(999): fsub f8, f0, f1 fsub f9, f2, f3 diff --git a/kernel/power/amax_hummer.S b/kernel/power/amax_hummer.S index 0d8b97db8..a3de92bd4 100644 --- a/kernel/power/amax_hummer.S +++ b/kernel/power/amax_hummer.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define INCX2 r6 #define X2 r7 @@ -86,7 +86,7 @@ stfpdux f14, SP, r10 stfpdux f15, SP, r10 - + stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 diff --git a/kernel/power/amax_ppc440.S b/kernel/power/amax_ppc440.S index 018449304..68de61c2a 100644 --- a/kernel/power/amax_ppc440.S +++ b/kernel/power/amax_ppc440.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define PREX r8 diff --git a/kernel/power/amin.S b/kernel/power/amin.S index 01056c3d9..4aeb95276 100644 --- a/kernel/power/amin.S +++ b/kernel/power/amin.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define PREA r8 diff --git a/kernel/power/amin_cell.S b/kernel/power/amin_cell.S index e4179f52a..d95f503bc 100644 --- a/kernel/power/amin_cell.S +++ b/kernel/power/amin_cell.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define PREA r8 @@ -640,7 +640,7 @@ LL(28): fsub f16, f0, f8 fsel f0, f16, f8, f0 .align 4 - + LL(999): fsub f8, f0, f1 fsub f9, f2, f3 diff --git a/kernel/power/amin_hummer.S b/kernel/power/amin_hummer.S index f4bbf070b..b16faae98 100644 --- a/kernel/power/amin_hummer.S +++ b/kernel/power/amin_hummer.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define INCX2 r6 #define X2 r7 @@ -86,7 +86,7 @@ stfpdux f14, SP, r10 stfpdux f15, SP, r10 - + stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 diff --git a/kernel/power/amin_ppc440.S b/kernel/power/amin_ppc440.S index b47742bb7..6328629fe 100644 --- a/kernel/power/amin_ppc440.S +++ b/kernel/power/amin_ppc440.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define INC1 r6 #define PREX r8 diff --git a/kernel/power/asum.S b/kernel/power/asum.S index 1188aa5c1..e5dc9a64c 100644 --- a/kernel/power/asum.S +++ b/kernel/power/asum.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define PREA r8 diff --git a/kernel/power/asum_cell.S b/kernel/power/asum_cell.S index 076651f33..f409d0b44 100644 --- a/kernel/power/asum_cell.S +++ b/kernel/power/asum_cell.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define PREA r8 diff --git a/kernel/power/asum_hummer.S b/kernel/power/asum_hummer.S index 9906a4447..c3985fa3d 100644 --- a/kernel/power/asum_hummer.S +++ b/kernel/power/asum_hummer.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define INCX2 r6 #define X2 r7 @@ -72,7 +72,7 @@ stfpdux f14, SP, r10 stfpdux f15, SP, r10 - + li r10, 0 stwu r10, -4(SP) stwu r10, -4(SP) diff --git a/kernel/power/asum_ppc440.S b/kernel/power/asum_ppc440.S index c6ad0f066..ec929f45b 100644 --- a/kernel/power/asum_ppc440.S +++ b/kernel/power/asum_ppc440.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define PREX r6 #define ATTR r7 diff --git a/kernel/power/axpy.S b/kernel/power/axpy.S index 9f9605f91..190f82d6b 100644 --- a/kernel/power/axpy.S +++ b/kernel/power/axpy.S @@ -38,12 +38,12 @@ #define ASSEMBLER #include "common.h" - + #ifdef linux #ifndef __64BIT__ #define N r3 #define X r6 -#define INCX r7 +#define INCX r7 #define Y r8 #define INCY r9 #define PREA r4 @@ -51,7 +51,7 @@ #else #define N r3 #define X r7 -#define INCX r8 +#define INCX r8 #define Y r9 #define INCY r10 #define PREA r4 @@ -63,7 +63,7 @@ #if !defined(__64BIT__) && defined(DOUBLE) #define N r3 #define X r8 -#define INCX r9 +#define INCX r9 #define Y r10 #define INCY r4 #define PREA r5 @@ -71,7 +71,7 @@ #else #define N r3 #define X r7 -#define INCX r8 +#define INCX r8 #define Y r9 #define INCY r10 #define PREA r4 @@ -82,7 +82,7 @@ #define ALPHA f24 #ifndef NEEDPARAM - + #define STACKSIZE 96 PROLOGUE @@ -108,7 +108,7 @@ #if (defined(_AIX) || defined(__APPLE__)) && !defined(__64BIT__) && defined(DOUBLE) lwz INCY, 56 + STACKSIZE(SP) #endif - + fmr ALPHA, f1 slwi INCX, INCX, BASE_SHIFT slwi INCY, INCY, BASE_SHIFT @@ -116,7 +116,7 @@ #ifdef L1_DUALFETCH li PREA, (L1_PREFETCHSIZE) / 2 #else - li PREA, (L1_PREFETCHSIZE) + li PREA, (L1_PREFETCHSIZE) #endif cmpwi cr0, N, 0 diff --git a/kernel/power/axpy_hummer.S b/kernel/power/axpy_hummer.S index 372a846f6..f66b6529f 100644 --- a/kernel/power/axpy_hummer.S +++ b/kernel/power/axpy_hummer.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r6 -#define INCX r7 +#define INCX r7 #define Y r8 #define INCY r9 @@ -634,7 +634,7 @@ LL(118): LL(999): li r10, 16 subi SP, SP, 16 - + lfpdux f25, SP, r10 lfpdux f24, SP, r10 lfpdux f23, SP, r10 diff --git a/kernel/power/axpy_ppc440.S b/kernel/power/axpy_ppc440.S index cc2605cc0..df3f25e5f 100644 --- a/kernel/power/axpy_ppc440.S +++ b/kernel/power/axpy_ppc440.S @@ -38,12 +38,12 @@ #define ASSEMBLER #include "common.h" - + #ifdef linux #ifndef __64BIT__ #define N r3 #define X r6 -#define INCX r7 +#define INCX r7 #define Y r8 #define INCY r9 #define YY r5 @@ -51,7 +51,7 @@ #else #define N r3 #define X r7 -#define INCX r8 +#define INCX r8 #define Y r9 #define INCY r10 #define YY r5 @@ -63,7 +63,7 @@ #if !defined(__64BIT__) && defined(DOUBLE) #define N r3 #define X r8 -#define INCX r9 +#define INCX r9 #define Y r10 #define INCY r4 #define YY r6 @@ -71,7 +71,7 @@ #else #define N r3 #define X r7 -#define INCX r8 +#define INCX r8 #define Y r9 #define INCY r10 #define YY r5 @@ -106,7 +106,7 @@ #if (defined(_AIX) || defined(__APPLE__)) && !defined(__64BIT__) && defined(DOUBLE) lwz INCY, 56 + STACKSIZE(SP) #endif - + fmr ALPHA, f1 slwi INCX, INCX, BASE_SHIFT slwi INCY, INCY, BASE_SHIFT diff --git a/kernel/power/cnrm2.S b/kernel/power/cnrm2.S index 930ea29e2..c115650fd 100644 --- a/kernel/power/cnrm2.S +++ b/kernel/power/cnrm2.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define PREA r8 #define INCXM1 r9 diff --git a/kernel/power/cnrm2_hummer.S b/kernel/power/cnrm2_hummer.S index e6b022f11..46c29c654 100644 --- a/kernel/power/cnrm2_hummer.S +++ b/kernel/power/cnrm2_hummer.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define INCX2 r6 #define X2 r7 @@ -336,7 +336,7 @@ LL(98): lfpdux f14, SP, r10 addi SP, SP, 16 blr -#endif +#endif .align 4 LL(99): @@ -517,7 +517,7 @@ LL(118): LL(198): LFDX A1, X, INCX2 fmadd C4, A1, A1, C4 - + fpadd C1, C1, C5 lis r3, 0x3f00 fpadd C2, C2, C6 diff --git a/kernel/power/cnrm2_ppc440.S b/kernel/power/cnrm2_ppc440.S index 5ead68157..c71c34b7c 100644 --- a/kernel/power/cnrm2_ppc440.S +++ b/kernel/power/cnrm2_ppc440.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define PRE r8 #define INC1 r9 @@ -99,8 +99,8 @@ slwi INCX, INCX, ZBASE_SHIFT li INC1, SIZE - li PRE, 3 * 16 * SIZE - + li PRE, 3 * 16 * SIZE + cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, 0 diff --git a/kernel/power/copy.S b/kernel/power/copy.S index 5a6c610c2..8f6773379 100644 --- a/kernel/power/copy.S +++ b/kernel/power/copy.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define Y r6 #define INCY r7 #define PREA r8 @@ -63,7 +63,7 @@ #ifdef L1_DUALFETCH li PREA, (L1_PREFETCHSIZE) / 2 #else - li PREA, (L1_PREFETCHSIZE) + li PREA, (L1_PREFETCHSIZE) #endif cmpwi cr0, N, 0 diff --git a/kernel/power/copy_hummer.S b/kernel/power/copy_hummer.S index 1efa6fb6d..19646bf02 100644 --- a/kernel/power/copy_hummer.S +++ b/kernel/power/copy_hummer.S @@ -38,12 +38,12 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define Y r6 -#define INCY r7 +#define INCY r7 #define INCX2 r8 #define INCY2 r9 @@ -75,7 +75,7 @@ stfpdux f14, SP, r10 stfpdux f15, SP, r10 - + slwi INCX, INCX, BASE_SHIFT slwi INCY, INCY, BASE_SHIFT add INCX2, INCX, INCX @@ -287,7 +287,7 @@ LL(23): LL(25): andi. r0, N, 15 beq LL(999) - + andi. r0, N, 8 beq LL(26) @@ -430,7 +430,7 @@ LL(33): LL(35): andi. r0, N, 15 beq LL(999) - + andi. r0, N, 8 beq LL(36) @@ -711,7 +711,7 @@ LL(58): STFDUX A1, Y, INCY2 b LL(999) .align 4 - + # INCX == 1, INCY != 1 LL(60): @@ -857,7 +857,7 @@ LL(68): STFDUX A1, Y, INCY b LL(999) .align 4 - + LL(100): sub X, X, INCX sub Y, Y, INCY @@ -951,7 +951,7 @@ LL(999): lfpdux f15, SP, r10 lfpdux f14, SP, r10 - + addi SP, SP, 16 blr diff --git a/kernel/power/dnrm2_hummer.S b/kernel/power/dnrm2_hummer.S index 4faa6c96c..4931f5ab1 100644 --- a/kernel/power/dnrm2_hummer.S +++ b/kernel/power/dnrm2_hummer.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define INCX2 r6 #define X2 r7 @@ -91,7 +91,7 @@ stfpdux f14, SP, r10 stfpdux f15, SP, r10 - + stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 @@ -330,7 +330,7 @@ LL(20): fdiv ALPHA_R, ALPHA_R, ALPHA lfpsx C1, SP, r10 # Zero clear - + fpmr C2, C1 fpmr C3, C1 fpmr C4, C1 @@ -795,7 +795,7 @@ LL(120): fdiv ALPHA_R, ALPHA_R, ALPHA lfpsx C1, SP, r10 # Zero clear - + fpmr C2, C1 fpmr C3, C1 fpmr C4, C1 diff --git a/kernel/power/dnrm2_ppc440.S b/kernel/power/dnrm2_ppc440.S index 6be9eadf3..849ca1f35 100644 --- a/kernel/power/dnrm2_ppc440.S +++ b/kernel/power/dnrm2_ppc440.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define NN r6 #define XX r7 @@ -106,7 +106,7 @@ slwi INCX, INCX, BASE_SHIFT sub X, X, INCX - li PRE, 3 * 16 * SIZE + li PRE, 3 * 16 * SIZE cmpwi cr0, N, 0 ble- LL(999) diff --git a/kernel/power/dot.S b/kernel/power/dot.S index 724b0c3c1..cf96c18b7 100644 --- a/kernel/power/dot.S +++ b/kernel/power/dot.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 #define INCX r5 @@ -92,7 +92,7 @@ #ifdef L1_DUALFETCH li PREA, (L1_PREFETCHSIZE) / 2 #else - li PREA, (L1_PREFETCHSIZE) + li PREA, (L1_PREFETCHSIZE) #endif cmpwi cr0, N, 0 diff --git a/kernel/power/dot_cell.S b/kernel/power/dot_cell.S index 617fb1356..f7bd077ba 100644 --- a/kernel/power/dot_cell.S +++ b/kernel/power/dot_cell.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 #define INCX r5 diff --git a/kernel/power/dot_hummer.S b/kernel/power/dot_hummer.S index 14a378090..1004f769c 100644 --- a/kernel/power/dot_hummer.S +++ b/kernel/power/dot_hummer.S @@ -38,12 +38,12 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define Y r6 -#define INCY r7 +#define INCY r7 #define INCX2 r8 #define INCY2 r9 @@ -81,7 +81,7 @@ stfpdux f14, SP, r10 stfpdux f15, SP, r10 - + stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 diff --git a/kernel/power/dot_ppc440.S b/kernel/power/dot_ppc440.S index b3f3efc0e..5317c57ab 100644 --- a/kernel/power/dot_ppc440.S +++ b/kernel/power/dot_ppc440.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 #define INCX r5 diff --git a/kernel/power/exfunc.S b/kernel/power/exfunc.S index 257736c94..6e90945a1 100644 --- a/kernel/power/exfunc.S +++ b/kernel/power/exfunc.S @@ -41,7 +41,7 @@ .machine "any" - .globl .rpcc + .globl .rpcc .rpcc: mftb r3 rlinm r3, r3, 3, 0, 31 # ldc(scaling) diff --git a/kernel/power/gemm_beta.S b/kernel/power/gemm_beta.S index e531bde6f..969f54c61 100644 --- a/kernel/power/gemm_beta.S +++ b/kernel/power/gemm_beta.S @@ -90,7 +90,7 @@ fmr ALPHA, f1 lfs f0, 24(SP) - + cmpwi cr0, M, 0 ble- LL(999) cmpwi cr0, N, 0 @@ -133,7 +133,7 @@ LL(12): addi CO1, CO1, 16 * SIZE bdnz LL(12) .align 4 - + LL(15): andi. r0, M, 15 mtspr CTR, r0 @@ -221,7 +221,7 @@ LL(22): dcbtst PRE, CO1 bdnz LL(22) .align 4 - + LL(25): andi. r0, M, 15 mtspr CTR, r0 diff --git a/kernel/power/gemm_kernel.S b/kernel/power/gemm_kernel.S index 2b7d1d99a..cae2fabca 100644 --- a/kernel/power/gemm_kernel.S +++ b/kernel/power/gemm_kernel.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -230,7 +230,7 @@ #ifdef linux #ifndef __64BIT__ - mr PREA, r10 + mr PREA, r10 lwz PREB, 8 + STACKSIZE(SP) lwz PREC, 12 + STACKSIZE(SP) #else @@ -322,7 +322,7 @@ LL(10): fmr f13, f0 fmr f14, f0 fmr f15, f0 - + srawi. I, M, 2 mr AO, A add C, CO4, LDC @@ -582,7 +582,7 @@ LL(12): LFD f30, 22 * SIZE(BO) LFD f31, 23 * SIZE(BO) #endif - + addi AO, AO, 16 * SIZE addi BO, BO, 16 * SIZE @@ -778,7 +778,7 @@ LL(18): addi CO2, CO2, 4 * SIZE addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE - + #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) @@ -1364,7 +1364,7 @@ LL(40): fmr f5, f0 fmr f6, f0 fmr f7, f0 - + srawi. I, M, 2 add C, CO2, LDC mr AO, A @@ -2273,7 +2273,7 @@ LL(78): fmr f1, f0 fmr f2, f0 fmr f3, f0 - + #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) diff --git a/kernel/power/gemm_kernel_altivec.S b/kernel/power/gemm_kernel_altivec.S index 6f5c3624f..8a525ef22 100644 --- a/kernel/power/gemm_kernel_altivec.S +++ b/kernel/power/gemm_kernel_altivec.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -707,7 +707,7 @@ LL(18): addi CO2, CO2, 16 * SIZE addi CO3, CO3, 16 * SIZE addi CO4, CO4, 16 * SIZE - + addic. I, I, -1 bgt+ LL(11) b LL(20) @@ -809,7 +809,7 @@ LL(19): addi CO2, CO2, 16 * SIZE addi CO3, CO3, 16 * SIZE addi CO4, CO4, 16 * SIZE - + addic. I, I, -1 bgt+ LL(11) .align 4 @@ -1917,7 +1917,7 @@ LL(118): FADD f0, f0, f2 FADD f1, f1, f3 - + FMADD f0, f0, f13, f8 FMADD f1, f1, f13, f9 @@ -2629,7 +2629,7 @@ LL(178): STFD f0, 0 * SIZE(CO1) .align 4 - + LL(999): mr SP, STACK diff --git a/kernel/power/gemm_kernel_altivec_cell.S b/kernel/power/gemm_kernel_altivec_cell.S index 010ed3945..ac750c2e8 100644 --- a/kernel/power/gemm_kernel_altivec_cell.S +++ b/kernel/power/gemm_kernel_altivec_cell.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -710,7 +710,7 @@ LL(18): addi CO2, CO2, 16 * SIZE addi CO3, CO3, 16 * SIZE addi CO4, CO4, 16 * SIZE - + addic. I, I, -1 bgt+ LL(11) b LL(20) @@ -812,7 +812,7 @@ LL(19): addi CO2, CO2, 16 * SIZE addi CO3, CO3, 16 * SIZE addi CO4, CO4, 16 * SIZE - + addic. I, I, -1 bgt+ LL(11) .align 4 @@ -1920,7 +1920,7 @@ LL(118): FADD f0, f0, f2 FADD f1, f1, f3 - + FMADD f0, f0, f13, f8 FMADD f1, f1, f13, f9 @@ -2632,7 +2632,7 @@ LL(178): STFD f0, 0 * SIZE(CO1) .align 4 - + LL(999): mr SP, STACK diff --git a/kernel/power/gemm_kernel_altivec_g4.S b/kernel/power/gemm_kernel_altivec_g4.S index 24d437d19..26339afeb 100644 --- a/kernel/power/gemm_kernel_altivec_g4.S +++ b/kernel/power/gemm_kernel_altivec_g4.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -646,7 +646,7 @@ LL(18): addi CO2, CO2, 16 * SIZE addi CO3, CO3, 16 * SIZE addi CO4, CO4, 16 * SIZE - + addic. I, I, -1 bgt+ LL(11) b LL(20) @@ -748,7 +748,7 @@ LL(19): addi CO2, CO2, 16 * SIZE addi CO3, CO3, 16 * SIZE addi CO4, CO4, 16 * SIZE - + addic. I, I, -1 bgt+ LL(11) .align 4 @@ -1856,7 +1856,7 @@ LL(118): FADD f0, f0, f2 FADD f1, f1, f3 - + FMADD f0, f0, f13, f8 FMADD f1, f1, f13, f9 @@ -2568,7 +2568,7 @@ LL(178): STFD f0, 0 * SIZE(CO1) .align 4 - + LL(999): mr SP, STACK diff --git a/kernel/power/gemm_kernel_cell.S b/kernel/power/gemm_kernel_cell.S index 0b0d75f50..1dbacc7f9 100644 --- a/kernel/power/gemm_kernel_cell.S +++ b/kernel/power/gemm_kernel_cell.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -228,7 +228,7 @@ #ifdef linux #ifndef __64BIT__ - mr PREA, r10 + mr PREA, r10 lwz PREB, 8 + STACKSIZE(SP) lwz PREC, 12 + STACKSIZE(SP) #else @@ -294,7 +294,7 @@ LL(10): fmr f13, f0 fmr f14, f0 fmr f15, f0 - + srawi. I, M, 2 mr AO, A add C, CO4, LDC @@ -715,7 +715,7 @@ LL(18): addi CO2, CO2, 4 * SIZE addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE - + #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) @@ -1301,7 +1301,7 @@ LL(40): fmr f5, f0 fmr f6, f0 fmr f7, f0 - + srawi. I, M, 2 add C, CO2, LDC mr AO, A @@ -2210,7 +2210,7 @@ LL(78): fmr f1, f0 fmr f2, f0 fmr f3, f0 - + #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) diff --git a/kernel/power/gemm_kernel_g4.S b/kernel/power/gemm_kernel_g4.S index 1ee4b2853..b6c849965 100644 --- a/kernel/power/gemm_kernel_g4.S +++ b/kernel/power/gemm_kernel_g4.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -263,7 +263,7 @@ fmr f13, f0 fmr f14, f0 fmr f15, f0 - + srawi. I, M, 2 mr AO, A add C, CO4, LDC @@ -475,7 +475,7 @@ FMADD f15, A4, B4, f15 addi AO, AO, 4 * SIZE addi BO, BO, 4 * SIZE - + .align 4 .L18: @@ -582,7 +582,7 @@ addi CO2, CO2, 4 * SIZE addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE - + #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) @@ -1149,7 +1149,7 @@ fmr f5, f0 fmr f6, f0 fmr f7, f0 - + srawi. I, M, 2 add C, CO2, LDC mr AO, A @@ -2001,7 +2001,7 @@ fmr f1, f0 fmr f2, f0 fmr f3, f0 - + #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) diff --git a/kernel/power/gemm_kernel_hummer.S b/kernel/power/gemm_kernel_hummer.S index 6b4e6b9a0..3a8e1edfa 100644 --- a/kernel/power/gemm_kernel_hummer.S +++ b/kernel/power/gemm_kernel_hummer.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define ALPHA 0 #define FZERO 8 @@ -70,7 +70,7 @@ #define BO r25 #define AO2 r26 #define BO2 r27 - + #define CO1 r28 #define CO2 r29 #define CO3 r30 @@ -122,7 +122,7 @@ stfpdux f29, SP, r0 stfpdux f30, SP, r0 stfpdux f31, SP, r0 - + stwu r31, -4(SP) stwu r30, -4(SP) stwu r29, -4(SP) @@ -197,7 +197,7 @@ #endif addi AO, A, -4 * SIZE - + li r0, FZERO lfpsx f0, SP, r0 @@ -431,7 +431,7 @@ fxcsmadd f7, B6, A4, f7 LFPDUX A9, AO, INC4 fxcpmadd f11, B4, A4, f11 - nop + nop fxcsmadd f15, B4, A4, f15 bdnz+ .L12 .align 4 @@ -1626,7 +1626,7 @@ fsmfp A1, A2 fsmfp A3, A4 - + fxcpmadd f0, AP, f0, A1 fxcpmadd f1, AP, f1, A3 #else @@ -1687,7 +1687,7 @@ #endif addi AO, A, -2 * SIZE - + li r0, FZERO lfpsx f0, SP, r0 @@ -2649,7 +2649,7 @@ mr CO1, C addi AO, A, -2 * SIZE - + li r0, FZERO lfpsx f0, SP, r0 @@ -3507,7 +3507,7 @@ #endif addi AO, A, -4 * SIZE - + li r0, FZERO lfpsx f0, SP, r0 @@ -3741,7 +3741,7 @@ fxcsmadd f7, B6, A4, f7 LFPDUX A9, AO, INC4 fxcpmadd f11, B4, A4, f11 - nop + nop fxcsmadd f15, B4, A4, f15 bdnz+ .L1012 .align 4 @@ -5120,7 +5120,7 @@ #endif addi AO, A, -2 * SIZE - + li r0, FZERO lfpsx f0, SP, r0 @@ -6147,7 +6147,7 @@ mr CO1, C addi AO, A, -2 * SIZE - + li r0, FZERO lfpsx f0, SP, r0 diff --git a/kernel/power/gemm_kernel_power3.S b/kernel/power/gemm_kernel_power3.S index 92e8e9f5f..6fe2def67 100644 --- a/kernel/power/gemm_kernel_power3.S +++ b/kernel/power/gemm_kernel_power3.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -189,7 +189,7 @@ #ifdef linux #ifndef __64BIT__ - mr PREA, r10 + mr PREA, r10 lwz PREB, 8 + STACKSIZE(SP) lwz PREC, 12 + STACKSIZE(SP) #else @@ -246,7 +246,7 @@ LL(10): fmr f13, f0 fmr f14, f0 fmr f15, f0 - + srawi. I, M, 2 mr AO, A add C, CO4, LDC @@ -522,7 +522,7 @@ LL(18): addi CO4, CO4, 4 * SIZE fmr f14, f31 fmr f15, f31 - + addic. I, I, -1 bgt+ LL(11) .align 4 @@ -880,7 +880,7 @@ LL(40): fmr f5, f0 fmr f6, f0 fmr f7, f0 - + srawi. I, M, 2 add C, CO2, LDC mr AO, A diff --git a/kernel/power/gemm_kernel_power6.S b/kernel/power/gemm_kernel_power6.S index b10a042dc..5f8fa76ce 100644 --- a/kernel/power/gemm_kernel_power6.S +++ b/kernel/power/gemm_kernel_power6.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -746,7 +746,7 @@ LL(18): addi CO2, CO2, 4 * SIZE addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE - + #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) @@ -1332,7 +1332,7 @@ LL(40): fmr f5, f0 fmr f6, f0 fmr f7, f0 - + srawi. I, M, 2 add C, CO2, LDC mr AO, A @@ -2238,7 +2238,7 @@ LL(78): fmr f1, f0 fmr f2, f0 fmr f3, f0 - + #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) diff --git a/kernel/power/gemm_kernel_ppc440.S b/kernel/power/gemm_kernel_ppc440.S index 5d3b3066b..2e86d5130 100644 --- a/kernel/power/gemm_kernel_ppc440.S +++ b/kernel/power/gemm_kernel_ppc440.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -259,7 +259,7 @@ fmr f13, f0 fmr f14, f0 fmr f15, f0 - + srawi. I, M, 2 mr AO, A add C, CO4, LDC @@ -640,7 +640,7 @@ addi CO2, CO2, 4 * SIZE addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE - + #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) @@ -1207,7 +1207,7 @@ fmr f5, f0 fmr f6, f0 fmr f7, f0 - + srawi. I, M, 2 add C, CO2, LDC mr AO, A @@ -2059,7 +2059,7 @@ fmr f1, f0 fmr f2, f0 fmr f3, f0 - + #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) diff --git a/kernel/power/gemm_ncopy_4.S b/kernel/power/gemm_ncopy_4.S index 93c687bff..a4dcc49c1 100644 --- a/kernel/power/gemm_ncopy_4.S +++ b/kernel/power/gemm_ncopy_4.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M r3 #define N r4 #define A r5 @@ -204,7 +204,7 @@ LL(12): dcbt PREA, AO3 dcbt PREA, AO4 #endif - + dcbtst PREB1, B addi AO1, AO1, 4 * SIZE @@ -214,7 +214,7 @@ LL(12): addi B, B, 16 * SIZE bdnz LL(12) .align 4 - + LL(15): andi. r0, M, 3 mtspr CTR, r0 @@ -284,7 +284,7 @@ LL(22): addi B, B, 8 * SIZE bdnz LL(22) .align 4 - + LL(25): andi. r0, M, 3 mtspr CTR, r0 @@ -330,7 +330,7 @@ LL(32): addi B, B, 4 * SIZE bdnz LL(32) .align 4 - + LL(35): andi. r0, M, 3 mtspr CTR, r0 diff --git a/kernel/power/gemm_ncopy_hummer_4.S b/kernel/power/gemm_ncopy_hummer_4.S index f05fdaae5..7f5a55c3f 100644 --- a/kernel/power/gemm_ncopy_hummer_4.S +++ b/kernel/power/gemm_ncopy_hummer_4.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M r3 #define N r4 #define A r5 @@ -91,7 +91,7 @@ stfpdux f17, SP, r0 stfpdux f18, SP, r0 stfpdux f19, SP, r0 - + stwu r31, -4(SP) stwu r30, -4(SP) @@ -201,7 +201,7 @@ STFXDUX c08, B, INC2 bdnz .L12 .align 4 - + .L15: andi. r0, M, 7 ble .L19 @@ -323,7 +323,7 @@ STFXDUX c04, B, INC2 bdnz .L22 .align 4 - + .L25: andi. r0, M, 7 ble .L30 @@ -395,7 +395,7 @@ STFPDUX c04, B, INC2 bdnz .L32 .align 4 - + .L35: andi. r0, M, 7 ble .L99 @@ -529,7 +529,7 @@ STFPDUX c16, B, INC2 bdnz .L112 .align 4 - + .L115: andi. r0, M, 7 ble .L119 @@ -656,7 +656,7 @@ STFPDUX c12, B, INC2 bdnz .L122 .align 4 - + .L125: andi. r0, M, 7 ble .L130 @@ -738,7 +738,7 @@ STFPDUX c07, B, INC2 bdnz .L132 .align 4 - + .L135: andi. r0, M, 7 ble .L999 diff --git a/kernel/power/gemm_ncopy_hummer_8.S b/kernel/power/gemm_ncopy_hummer_8.S index fec7c139c..1c8adc920 100644 --- a/kernel/power/gemm_ncopy_hummer_8.S +++ b/kernel/power/gemm_ncopy_hummer_8.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M r3 #define N r4 #define A r5 @@ -120,7 +120,7 @@ stfpdux f29, SP, r0 stfpdux f30, SP, r0 stfpdux f31, SP, r0 - + stwu r31, -4(SP) stwu r30, -4(SP) stwu r29, -4(SP) @@ -237,7 +237,7 @@ STFXDUX c08, B, INC2 bdnz .L12 .align 4 - + .L15: andi. r0, M, 3 ble .L19 @@ -373,7 +373,7 @@ STFXDUX c08, B, INC2 bdnz .L22 .align 4 - + .L25: andi. r0, M, 7 ble .L30 @@ -441,12 +441,12 @@ fsmfp c01, c02 fsmfp c03, c04 - + STFPDUX c01, B, INC2 STFPDUX c03, B, INC2 .align 4 - + .L30: andi. J, N, 2 ble .L40 @@ -490,7 +490,7 @@ STFXDUX c16, B, INC2 bdnz .L32 .align 4 - + .L35: andi. r0, M, 7 ble .L40 @@ -562,7 +562,7 @@ STFPDUX c04, B, INC2 bdnz .L42 .align 4 - + .L45: andi. r0, M, 7 ble .L999 @@ -734,7 +734,7 @@ STFPDUX c32, B, INC2 bdnz .L112 .align 4 - + .L115: andi. r0, M, 7 ble .L119 @@ -936,7 +936,7 @@ STFPDUX c16, B, INC2 bdnz .L122 .align 4 - + .L125: andi. r0, M, 7 ble .L130 @@ -1013,7 +1013,7 @@ STFPDUX c05, B, INC2 .align 4 - + .L130: andi. J, N, 2 ble .L140 @@ -1059,7 +1059,7 @@ STFPDUX c12, B, INC2 bdnz .L132 .align 4 - + .L135: andi. r0, M, 7 ble .L140 @@ -1141,7 +1141,7 @@ STFPDUX c07, B, INC2 bdnz .L142 .align 4 - + .L145: andi. r0, M, 7 ble .L999 diff --git a/kernel/power/gemm_tcopy_4.S b/kernel/power/gemm_tcopy_4.S index 712420f48..1b6af4801 100644 --- a/kernel/power/gemm_tcopy_4.S +++ b/kernel/power/gemm_tcopy_4.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M r3 #define N r4 #define A r5 @@ -58,7 +58,7 @@ #define B2 r17 #define B3 r18 #define M4 r19 - + #define c01 f0 #define c02 f1 #define c03 f2 @@ -235,7 +235,7 @@ LL(12): dcbt PREA, AO3 dcbt PREA, AO4 #endif - + dcbtst PREB1, B addi AO1, AO1, 4 * SIZE @@ -245,7 +245,7 @@ LL(12): add B1, B1, M4 bdnz LL(12) .align 4 - + LL(13): andi. r0, N, 2 ble LL(14) @@ -341,7 +341,7 @@ LL(22): add B1, B1, M4 bdnz LL(22) .align 4 - + LL(23): andi. r0, N, 2 ble LL(24) @@ -402,7 +402,7 @@ LL(32): add B1, B1, M4 bdnz LL(32) .align 4 - + LL(33): andi. r0, N, 2 ble LL(34) diff --git a/kernel/power/gemm_tcopy_hummer_4.S b/kernel/power/gemm_tcopy_hummer_4.S index dc94b046f..8352a8373 100644 --- a/kernel/power/gemm_tcopy_hummer_4.S +++ b/kernel/power/gemm_tcopy_hummer_4.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M r3 #define N r4 #define A r5 @@ -57,7 +57,7 @@ #define M4 r29 #define INC r30 #define INC2 r31 - + #define c01 f0 #define c02 f1 #define c03 f2 @@ -113,7 +113,7 @@ bne .L100 andi. r0, LDA, 2 * SIZE - 1 bne .L100 - + subi A, A, 2 * SIZE srawi. J, M, 2 ble .L20 @@ -154,7 +154,7 @@ STFPDUX c08, B1, INC2 bdnz .L12 .align 4 - + .L15: andi. r0, N, 3 ble .L19 @@ -224,7 +224,7 @@ STFPDUX c04, B1, INC2 bdnz .L22 .align 4 - + .L23: andi. r0, N, 2 ble .L24 @@ -268,7 +268,7 @@ STFPDUX c02, B1, INC2 bdnz .L32 .align 4 - + .L33: andi. r0, N, 2 ble .L34 @@ -353,7 +353,7 @@ STFPDUX c08, B1, INC2 bdnz .L112 .align 4 - + .L115: andi. r0, N, 3 ble .L119 @@ -433,7 +433,7 @@ STFPDUX c04, B1, INC2 bdnz .L122 .align 4 - + .L123: andi. r0, N, 2 ble .L124 @@ -484,7 +484,7 @@ STFPDUX c03, B1, INC2 bdnz .L132 .align 4 - + .L133: andi. r0, N, 2 ble .L134 diff --git a/kernel/power/gemm_tcopy_hummer_8.S b/kernel/power/gemm_tcopy_hummer_8.S index 5062f6536..e1770b314 100644 --- a/kernel/power/gemm_tcopy_hummer_8.S +++ b/kernel/power/gemm_tcopy_hummer_8.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M r3 #define N r4 #define A r5 @@ -64,7 +64,7 @@ #define AO8 r29 #define INC r30 #define INC2 r31 - + #define c01 f0 #define c02 f1 #define c03 f2 @@ -124,7 +124,7 @@ stfpdux f29, SP, r0 stfpdux f30, SP, r0 stfpdux f31, SP, r0 - + stwu r31, -4(SP) stwu r30, -4(SP) stwu r29, -4(SP) @@ -283,7 +283,7 @@ STFPDUX c32, B1, INC2 bdnz .L12 .align 4 - + .L15: andi. r0, N, 7 ble .L19 @@ -437,7 +437,7 @@ STFPDUX c16, B1, INC2 bdnz .L22 .align 4 - + .L25: andi. r0, N, 7 ble .L30 @@ -534,7 +534,7 @@ STFPDUX c08, B1, INC2 bdnz .L32 .align 4 - + .L35: andi. r0, N, 7 ble .L40 @@ -601,7 +601,7 @@ STFPDUX c04, B1, INC2 bdnz .L42 .align 4 - + .L45: andi. r0, N, 7 ble .L999 @@ -778,7 +778,7 @@ STFPDUX c32, B1, INC2 bdnz .L112 .align 4 - + .L115: andi. r0, N, 7 ble .L119 @@ -982,7 +982,7 @@ STFPDUX c16, B1, INC2 bdnz .L122 .align 4 - + .L125: andi. r0, N, 7 ble .L130 @@ -1111,7 +1111,7 @@ STFPDUX c15, B1, INC2 bdnz .L132 .align 4 - + .L135: andi. r0, N, 7 ble .L140 @@ -1202,7 +1202,7 @@ STFPDUX c07, B1, INC2 bdnz .L142 .align 4 - + .L145: andi. r0, N, 7 ble .L999 diff --git a/kernel/power/gemv_hummer_n.S b/kernel/power/gemv_hummer_n.S index a9340bebe..7f1b35ec1 100644 --- a/kernel/power/gemv_hummer_n.S +++ b/kernel/power/gemv_hummer_n.S @@ -375,7 +375,7 @@ cmpi cr0, 0, J, 0 bgt .L11 .align 4 - + .L20: andi. J, N, 2 ble .L30 @@ -870,7 +870,7 @@ cmpi cr0, 0, J, 0 bgt .L41 .align 4 - + .L50: andi. J, N, 2 ble .L60 @@ -1419,7 +1419,7 @@ cmpi cr0, 0, J, 0 bgt .L71 .align 4 - + .L80: andi. J, N, 2 ble .L90 diff --git a/kernel/power/gemv_n.S b/kernel/power/gemv_n.S index b66caa75c..2b19f0a4e 100644 --- a/kernel/power/gemv_n.S +++ b/kernel/power/gemv_n.S @@ -1559,7 +1559,7 @@ LL(19): cmpi cr0, 0, J, 0 bgt LL(11) .align 4 - + LL(20): andi. J, N, 4 mr AO1, A diff --git a/kernel/power/gemv_t.S b/kernel/power/gemv_t.S index a70e8b8a0..005e5d56c 100644 --- a/kernel/power/gemv_t.S +++ b/kernel/power/gemv_t.S @@ -797,7 +797,7 @@ LL(12): addi BO, BO, 16 * SIZE bdnz LL(12) - .align 4 + .align 4 LL(13): FMADD y01, a1, b1, y01 @@ -1551,7 +1551,7 @@ LL(19): cmpi cr0, 0, J, 0 bgt LL(11) .align 4 - + LL(20): andi. J, N, 7 ble LL(99) @@ -1778,7 +1778,7 @@ LL(22): addi BO, BO, 16 * SIZE bdnz LL(22) - .align 4 + .align 4 LL(23): FMADD y01, a1, b1, y01 @@ -2332,7 +2332,7 @@ LL(32): addi BO, BO, 16 * SIZE bdnz LL(32) - .align 4 + .align 4 LL(33): FMADD y01, a1, b1, y01 @@ -2594,7 +2594,7 @@ LL(40): mr AO1, A add A, A, LDA mr BO, XP - + lfd y01, FZERO fmr y02, y01 fmr y03, y01 @@ -2715,7 +2715,7 @@ LL(42): addi BO, BO, 16 * SIZE DCBT(AO1, PREA) bdnz LL(42) - .align 4 + .align 4 LL(43): FMADD y01, a1, b1, y01 diff --git a/kernel/power/gemv_t_ppc440.S b/kernel/power/gemv_t_ppc440.S index 1aa59b214..62433af19 100644 --- a/kernel/power/gemv_t_ppc440.S +++ b/kernel/power/gemv_t_ppc440.S @@ -452,7 +452,7 @@ LL(22): #endif bdnz LL(22) - .align 4 + .align 4 LL(23): FMADD y01, a1, b1, y01 @@ -756,7 +756,7 @@ LL(32): LFDU b4, 1 * SIZE(X1) bdnz LL(32) - .align 4 + .align 4 LL(33): FMADD y01, a1, b1, y01 @@ -960,7 +960,7 @@ LL(42): LFDU b4, 1 * SIZE(X1) bdnz LL(42) - .align 4 + .align 4 LL(43): FMADD y01, a1, b1, y01 diff --git a/kernel/power/ger.S b/kernel/power/ger.S index 00685693a..bc10bf40d 100644 --- a/kernel/power/ger.S +++ b/kernel/power/ger.S @@ -326,7 +326,7 @@ LL(06): addi X1, X1, SIZE bdnz+ LL(06) .align 4 - + LL(10): srawi. J, N, 1 ble LL(20) @@ -834,7 +834,7 @@ LL(19): cmpi cr0, 0, J, 0 bgt LL(11) .align 4 - + LL(20): andi. J, N, 1 ble LL(999) diff --git a/kernel/power/iamax.S b/kernel/power/iamax.S index cdc57fa38..45461ae85 100644 --- a/kernel/power/iamax.S +++ b/kernel/power/iamax.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define RET r3 #define X r4 -#define INCX r5 +#define INCX r5 #define N r6 #define NN r7 @@ -634,7 +634,7 @@ LL(1060): fabs f8, f8 addi RET, RET, 1 fcmpu cr0, f1, f8 - beq cr0, LL(9999) + beq cr0, LL(9999) bdnz LL(1060) b LL(9999) .align 4 @@ -768,7 +768,7 @@ LL(1160): fabs f8, f8 addi RET, RET, 1 fcmpu cr0, f1, f8 - beq cr0, LL(9999) + beq cr0, LL(9999) bdnz LL(1160) .align 4 diff --git a/kernel/power/iamax_hummer.S b/kernel/power/iamax_hummer.S index 9b2370970..9b3b225f8 100644 --- a/kernel/power/iamax_hummer.S +++ b/kernel/power/iamax_hummer.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define INCX2 r6 #define X2 r7 @@ -90,7 +90,7 @@ stfpdux f14, SP, r10 stfpdux f15, SP, r10 - + stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 diff --git a/kernel/power/iamax_ppc440.S b/kernel/power/iamax_ppc440.S index 11ea4cb74..a43cc773a 100644 --- a/kernel/power/iamax_ppc440.S +++ b/kernel/power/iamax_ppc440.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define RET r3 #define X r4 -#define INCX r5 +#define INCX r5 #define N r6 #define NN r7 @@ -95,7 +95,7 @@ slwi INCX, INCX, BASE_SHIFT sub X, X, INCX li PRE, 3 * 16 * SIZE - + mr NN, N mr XX, X @@ -448,7 +448,7 @@ LL(1160): fabs f8, f8 addi RET, RET, 1 fcmpu cr0, f1, f8 - beq cr0, LL(9999) + beq cr0, LL(9999) bdnz LL(1160) .align 4 diff --git a/kernel/power/iamin.S b/kernel/power/iamin.S index c3dbb848a..477fd75c2 100644 --- a/kernel/power/iamin.S +++ b/kernel/power/iamin.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define RET r3 #define X r4 -#define INCX r5 +#define INCX r5 #define N r6 #define NN r7 @@ -635,7 +635,7 @@ LL(1060): fabs f8, f8 addi RET, RET, 1 fcmpu cr0, f1, f8 - beq cr0, LL(9999) + beq cr0, LL(9999) bdnz LL(1060) b LL(9999) .align 4 @@ -769,7 +769,7 @@ LL(1160): fabs f8, f8 addi RET, RET, 1 fcmpu cr0, f1, f8 - beq cr0, LL(9999) + beq cr0, LL(9999) bdnz LL(1160) .align 4 diff --git a/kernel/power/iamin_hummer.S b/kernel/power/iamin_hummer.S index 6dad3bec5..7a2c29c08 100644 --- a/kernel/power/iamin_hummer.S +++ b/kernel/power/iamin_hummer.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define INCX2 r6 #define X2 r7 @@ -90,7 +90,7 @@ stfpdux f14, SP, r10 stfpdux f15, SP, r10 - + stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 diff --git a/kernel/power/iamin_ppc440.S b/kernel/power/iamin_ppc440.S index 888e74a24..bbcc3019a 100644 --- a/kernel/power/iamin_ppc440.S +++ b/kernel/power/iamin_ppc440.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define RET r3 #define X r4 -#define INCX r5 +#define INCX r5 #define N r6 #define NN r7 @@ -95,7 +95,7 @@ slwi INCX, INCX, BASE_SHIFT sub X, X, INCX li PRE, 3 * 16 * SIZE - + mr NN, N mr XX, X @@ -448,7 +448,7 @@ LL(1160): fabs f8, f8 addi RET, RET, 1 fcmpu cr0, f1, f8 - beq cr0, LL(9999) + beq cr0, LL(9999) bdnz LL(1160) .align 4 diff --git a/kernel/power/imax.S b/kernel/power/imax.S index 6b6cd4560..33762b99a 100644 --- a/kernel/power/imax.S +++ b/kernel/power/imax.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define RET r3 #define X r4 -#define INCX r5 +#define INCX r5 #define N r6 #define NN r7 diff --git a/kernel/power/imax_hummer.S b/kernel/power/imax_hummer.S index 110dc18b8..6ea6f5c00 100644 --- a/kernel/power/imax_hummer.S +++ b/kernel/power/imax_hummer.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define INCX2 r6 #define X2 r7 @@ -81,7 +81,7 @@ stfpdux f14, SP, r10 stfpdux f15, SP, r10 - + stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 diff --git a/kernel/power/imax_ppc440.S b/kernel/power/imax_ppc440.S index b4a644974..b6cea76e7 100644 --- a/kernel/power/imax_ppc440.S +++ b/kernel/power/imax_ppc440.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define RET r3 #define X r4 -#define INCX r5 +#define INCX r5 #define N r6 #define NN r7 diff --git a/kernel/power/imin.S b/kernel/power/imin.S index 2dd774d10..dc7667286 100644 --- a/kernel/power/imin.S +++ b/kernel/power/imin.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define RET r3 #define X r4 -#define INCX r5 +#define INCX r5 #define N r6 #define NN r7 diff --git a/kernel/power/imin_hummer.S b/kernel/power/imin_hummer.S index d333329f6..f86e79c04 100644 --- a/kernel/power/imin_hummer.S +++ b/kernel/power/imin_hummer.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define INCX2 r6 #define X2 r7 @@ -81,7 +81,7 @@ stfpdux f14, SP, r10 stfpdux f15, SP, r10 - + stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 diff --git a/kernel/power/imin_ppc440.S b/kernel/power/imin_ppc440.S index 4e1185d1a..c84bdfac3 100644 --- a/kernel/power/imin_ppc440.S +++ b/kernel/power/imin_ppc440.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define RET r3 #define X r4 -#define INCX r5 +#define INCX r5 #define N r6 #define NN r7 @@ -380,7 +380,7 @@ LL(1160): LFDUX f8, XX, INCX addi RET, RET, 1 fcmpu cr0, f1, f8 - beq cr0, LL(9999) + beq cr0, LL(9999) bdnz LL(1160) .align 4 diff --git a/kernel/power/izamax.S b/kernel/power/izamax.S index 48510477d..8e1e1ef69 100644 --- a/kernel/power/izamax.S +++ b/kernel/power/izamax.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define RET r3 #define X r4 -#define INCX r5 +#define INCX r5 #define N r6 #define NN r7 @@ -685,7 +685,7 @@ LL(1060): addi RET, RET, 1 fcmpu cr0, f1, f8 - beq cr0, LL(9999) + beq cr0, LL(9999) bdnz LL(1060) b LL(9999) .align 4 @@ -885,7 +885,7 @@ LL(1160): addi RET, RET, 1 fcmpu cr0, f1, f8 - beq cr0, LL(9999) + beq cr0, LL(9999) bdnz LL(1160) .align 4 diff --git a/kernel/power/izamax_hummer.S b/kernel/power/izamax_hummer.S index 8dffa0c0c..1f1e48cf3 100644 --- a/kernel/power/izamax_hummer.S +++ b/kernel/power/izamax_hummer.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define INCX2 r6 #define X2 r7 @@ -91,7 +91,7 @@ stfpdux f14, SP, r10 stfpdux f15, SP, r10 - + stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 diff --git a/kernel/power/izamax_ppc440.S b/kernel/power/izamax_ppc440.S index f80c9ad17..76f18be57 100644 --- a/kernel/power/izamax_ppc440.S +++ b/kernel/power/izamax_ppc440.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define RET r3 #define X r4 -#define INCX r5 +#define INCX r5 #define N r6 #define NN r7 @@ -96,7 +96,7 @@ slwi INCX, INCX, ZBASE_SHIFT sub X, X, INCX - li INC1, SIZE + li INC1, SIZE li PRE, 3 * 16 * SIZE mr NN, N diff --git a/kernel/power/izamin.S b/kernel/power/izamin.S index 17275fcec..ea1cdfa93 100644 --- a/kernel/power/izamin.S +++ b/kernel/power/izamin.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define RET r3 #define X r4 -#define INCX r5 +#define INCX r5 #define N r6 #define NN r7 diff --git a/kernel/power/izamin_hummer.S b/kernel/power/izamin_hummer.S index 75145abf5..f13cf1205 100644 --- a/kernel/power/izamin_hummer.S +++ b/kernel/power/izamin_hummer.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define INCX2 r6 #define X2 r7 @@ -91,7 +91,7 @@ stfpdux f14, SP, r10 stfpdux f15, SP, r10 - + stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 diff --git a/kernel/power/izamin_ppc440.S b/kernel/power/izamin_ppc440.S index 2cdb8bf38..eb902762b 100644 --- a/kernel/power/izamin_ppc440.S +++ b/kernel/power/izamin_ppc440.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define RET r3 #define X r4 -#define INCX r5 +#define INCX r5 #define N r6 #define NN r7 @@ -96,7 +96,7 @@ slwi INCX, INCX, ZBASE_SHIFT sub X, X, INCX - li INC1, SIZE + li INC1, SIZE li PRE, 3 * 16 * SIZE mr NN, N diff --git a/kernel/power/max.S b/kernel/power/max.S index 5862bc930..71f055d6d 100644 --- a/kernel/power/max.S +++ b/kernel/power/max.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define PREA r8 diff --git a/kernel/power/max_hummer.S b/kernel/power/max_hummer.S index 01ff907e6..7e226ed05 100644 --- a/kernel/power/max_hummer.S +++ b/kernel/power/max_hummer.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define INCX2 r6 #define X2 r7 @@ -86,7 +86,7 @@ stfpdux f14, SP, r10 stfpdux f15, SP, r10 - + stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 diff --git a/kernel/power/max_ppc440.S b/kernel/power/max_ppc440.S index 7afdf566e..9a1247055 100644 --- a/kernel/power/max_ppc440.S +++ b/kernel/power/max_ppc440.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define PREA r8 diff --git a/kernel/power/min.S b/kernel/power/min.S index 727a6a7b1..8a8054802 100644 --- a/kernel/power/min.S +++ b/kernel/power/min.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define PREA r8 diff --git a/kernel/power/min_hummer.S b/kernel/power/min_hummer.S index bd8268711..88a13a91c 100644 --- a/kernel/power/min_hummer.S +++ b/kernel/power/min_hummer.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define INCX2 r6 #define X2 r7 @@ -86,7 +86,7 @@ stfpdux f14, SP, r10 stfpdux f15, SP, r10 - + stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 diff --git a/kernel/power/min_ppc440.S b/kernel/power/min_ppc440.S index ab67bbc8e..5ffdfd026 100644 --- a/kernel/power/min_ppc440.S +++ b/kernel/power/min_ppc440.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define PREA r8 diff --git a/kernel/power/nrm2.S b/kernel/power/nrm2.S index e2b635ee7..bf8433001 100644 --- a/kernel/power/nrm2.S +++ b/kernel/power/nrm2.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define NN r6 #define XX r7 diff --git a/kernel/power/rot.S b/kernel/power/rot.S index b9e9338ac..3e6b8f7cc 100644 --- a/kernel/power/rot.S +++ b/kernel/power/rot.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define Y r6 #define INCY r7 #define PREA r8 diff --git a/kernel/power/rot_ppc440.S b/kernel/power/rot_ppc440.S index bb19583b7..7a115deb7 100644 --- a/kernel/power/rot_ppc440.S +++ b/kernel/power/rot_ppc440.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define Y r6 #define INCY r7 #define PRE r8 diff --git a/kernel/power/scal.S b/kernel/power/scal.S index f242f083c..7c65d1234 100644 --- a/kernel/power/scal.S +++ b/kernel/power/scal.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define XX r4 #define PREA r5 @@ -65,7 +65,7 @@ #define FZERO f0 #define ALPHA f1 - + PROLOGUE PROFCODE diff --git a/kernel/power/scal_hummer.S b/kernel/power/scal_hummer.S index 0b584862a..fd7c6697a 100644 --- a/kernel/power/scal_hummer.S +++ b/kernel/power/scal_hummer.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r6 -#define INCX r7 +#define INCX r7 #define INCX2 r4 #define X2 r5 @@ -74,7 +74,7 @@ stfpdux f14, SP, r10 stfpdux f15, SP, r10 stfpdux f16, SP, r10 - + li r10, 0 stwu r10, -4(SP) stwu r10, -4(SP) diff --git a/kernel/power/scal_ppc440.S b/kernel/power/scal_ppc440.S index 8b9e271cf..ed148834d 100644 --- a/kernel/power/scal_ppc440.S +++ b/kernel/power/scal_ppc440.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define XX r4 #define PRE r5 @@ -65,7 +65,7 @@ #define FZERO f0 #define ALPHA f1 - + PROLOGUE PROFCODE diff --git a/kernel/power/snrm2.S b/kernel/power/snrm2.S index f235c6768..be974cc48 100644 --- a/kernel/power/snrm2.S +++ b/kernel/power/snrm2.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define PREA r8 diff --git a/kernel/power/snrm2_hummer.S b/kernel/power/snrm2_hummer.S index a0024926f..a0ff3d1b2 100644 --- a/kernel/power/snrm2_hummer.S +++ b/kernel/power/snrm2_hummer.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define INCX2 r6 #define X2 r7 diff --git a/kernel/power/snrm2_ppc440.S b/kernel/power/snrm2_ppc440.S index ffda99ed4..0a80d1224 100644 --- a/kernel/power/snrm2_ppc440.S +++ b/kernel/power/snrm2_ppc440.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define PRE r8 @@ -98,7 +98,7 @@ #endif slwi INCX, INCX, BASE_SHIFT - li PRE, 3 * 16 * SIZE + li PRE, 3 * 16 * SIZE sub X, X, INCX diff --git a/kernel/power/swap.S b/kernel/power/swap.S index a0d150f3e..f8b56d472 100644 --- a/kernel/power/swap.S +++ b/kernel/power/swap.S @@ -38,12 +38,12 @@ #define ASSEMBLER #include "common.h" - + #ifdef linux #ifndef __64BIT__ #define N r3 #define X r6 -#define INCX r7 +#define INCX r7 #define Y r8 #define INCY r9 #define PREA r4 @@ -52,7 +52,7 @@ #else #define N r3 #define X r7 -#define INCX r8 +#define INCX r8 #define Y r9 #define INCY r10 #define PREA r4 @@ -65,7 +65,7 @@ #if !defined(__64BIT__) && defined(DOUBLE) #define N r3 #define X r8 -#define INCX r9 +#define INCX r9 #define Y r10 #define INCY r4 #define PREA r5 @@ -74,7 +74,7 @@ #else #define N r3 #define X r7 -#define INCX r8 +#define INCX r8 #define Y r9 #define INCY r10 #define PREA r4 @@ -124,7 +124,7 @@ #ifdef L1_DUALFETCH li PREA, (L1_PREFETCHSIZE) / 2 #else - li PREA, (L1_PREFETCHSIZE) + li PREA, (L1_PREFETCHSIZE) #endif cmpwi cr0, N, 0 diff --git a/kernel/power/swap_hummer.S b/kernel/power/swap_hummer.S index 293a28bec..fa65acdbf 100644 --- a/kernel/power/swap_hummer.S +++ b/kernel/power/swap_hummer.S @@ -41,9 +41,9 @@ #define N r3 #define X r6 -#define INCX r7 +#define INCX r7 #define Y r8 -#define INCY r9 +#define INCY r9 #define INCX2 r4 #define INCY2 r5 @@ -78,7 +78,7 @@ stfpdux f14, SP, r10 stfpdux f15, SP, r10 stfpdux f16, SP, r10 - + slwi INCX, INCX, BASE_SHIFT slwi INCY, INCY, BASE_SHIFT add INCX2, INCX, INCX @@ -293,7 +293,7 @@ LL(23): LL(25): andi. r0, N, 7 beq LL(29) - + andi. r0, N, 4 beq LL(27) @@ -447,7 +447,7 @@ LL(33): LL(35): andi. r0, N, 7 beq LL(39) - + andi. r0, N, 4 beq LL(37) @@ -696,7 +696,7 @@ LL(999): lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 - + addi SP, SP, 16 blr diff --git a/kernel/power/symv_L.S b/kernel/power/symv_L.S index 91bfb5e0b..fbf735abd 100644 --- a/kernel/power/symv_L.S +++ b/kernel/power/symv_L.S @@ -409,7 +409,7 @@ LL(11): LFD a16, 3 * SIZE(AO4) - LFD a5, ALPHA + LFD a5, ALPHA FMUL xsum1, atemp1, a1 FMUL xsum2, atemp1, a2 @@ -522,7 +522,7 @@ LL(12): FMADD y04, atemp2, a8, y04 # DCBT(X, PREX) NOP2 - + FMADD xsum1, xtemp3, a3, xsum1 LFD a3, 6 * SIZE(AO1) FMADD y01, atemp3, a9, y01 @@ -1211,7 +1211,7 @@ LL(18): LFD y03, 2 * SIZE(YY) LFD y04, 3 * SIZE(YY) - LFD xtemp1, ALPHA + LFD xtemp1, ALPHA FMUL xsum1, xtemp1, xsum1 FMUL xsum2, xtemp1, xsum2 @@ -1254,7 +1254,7 @@ LL(20): LFD a2, 1 * SIZE(AO1) LFD a6, 1 * SIZE(AO2) - LFD a5, ALPHA + LFD a5, ALPHA FMUL xsum1, atemp1, a1 FMUL xsum2, atemp1, a2 @@ -1288,7 +1288,7 @@ LL(28): LFD y01, 0 * SIZE(YY) LFD y02, 1 * SIZE(YY) - LFD xtemp1, ALPHA + LFD xtemp1, ALPHA FMUL xsum1, xtemp1, xsum1 FMUL xsum2, xtemp1, xsum2 @@ -1314,7 +1314,7 @@ LL(30): LFD atemp1, 0 * SIZE(XX) LFD a1, 0 * SIZE(AO1) - LFD xtemp1, ALPHA + LFD xtemp1, ALPHA LFD y01, 0 * SIZE(YY) FMUL xsum1, atemp1, a1 diff --git a/kernel/power/symv_U.S b/kernel/power/symv_U.S index 76cbd6461..ec1aeea39 100644 --- a/kernel/power/symv_U.S +++ b/kernel/power/symv_U.S @@ -280,7 +280,7 @@ li PREA, PREFETCHSIZE_A * SIZE sub IS, M, IS - + cmpwi cr0, M, 0 ble- LL(999) @@ -390,7 +390,7 @@ LL(11): slwi TEMP, IS, BASE_SHIFT add TEMP, X, TEMP - LFD a16, ALPHA + LFD a16, ALPHA lfd xsum1, FZERO LFD atemp1, 0 * SIZE(TEMP) @@ -484,7 +484,7 @@ LL(12): FMADD y04, atemp2, a8, y04 # DCBT(X, PREX) NOP2 - + FMADD xsum1, xtemp3, a3, xsum1 LFD a3, 6 * SIZE(AO1) FMADD y01, atemp3, a9, y01 @@ -1106,7 +1106,7 @@ LL(15): .align 4 LL(18): - LFD xtemp1, ALPHA + LFD xtemp1, ALPHA FMUL xsum1, xtemp1, xsum1 FMUL xsum2, xtemp1, xsum2 @@ -1163,7 +1163,7 @@ LL(20): LFD atemp1, 0 * SIZE(TEMP) LFD atemp2, 1 * SIZE(TEMP) - LFD a1, ALPHA + LFD a1, ALPHA FMUL atemp1, a1, atemp1 FMUL atemp2, a1, atemp2 @@ -1228,7 +1228,7 @@ LL(22): .align 4 LL(28): - LFD xtemp1, ALPHA + LFD xtemp1, ALPHA FMUL xsum1, xtemp1, xsum1 FMUL xsum2, xtemp1, xsum2 @@ -1246,7 +1246,7 @@ LL(28): addi IS, IS, 2 .align 4 - + LL(30): andi. TEMP, M, 1 ble LL(990) @@ -1258,7 +1258,7 @@ LL(30): LFD atemp1, 0 * SIZE(TEMP) - LFD a1, ALPHA + LFD a1, ALPHA FMUL atemp1, a1, atemp1 @@ -1299,7 +1299,7 @@ LL(32): .align 4 LL(38): - LFD xtemp1, ALPHA + LFD xtemp1, ALPHA FMUL xsum1, xtemp1, xsum1 diff --git a/kernel/power/trsm_kernel_LN.S b/kernel/power/trsm_kernel_LN.S index 6be8e286d..0c13a25a4 100644 --- a/kernel/power/trsm_kernel_LN.S +++ b/kernel/power/trsm_kernel_LN.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -238,7 +238,7 @@ #ifdef linux #ifndef __64BIT__ - mr PREA, r10 + mr PREA, r10 lwz PREB, 8 + STACKSIZE(SP) lwz PREC, 12 + STACKSIZE(SP) #else @@ -863,7 +863,7 @@ LL(28): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -1370,7 +1370,7 @@ LL(18): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -1410,7 +1410,7 @@ LL(18): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -1802,7 +1802,7 @@ LL(18): addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #endif - + #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 @@ -1881,7 +1881,7 @@ LL(40): fmr f5, f0 fmr f6, f0 fmr f7, f0 - + #if defined(LN) || defined(RT) mr AORIG, A #else @@ -2608,7 +2608,7 @@ LL(48): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -3448,7 +3448,7 @@ LL(78): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 diff --git a/kernel/power/trsm_kernel_LT.S b/kernel/power/trsm_kernel_LT.S index 0d287440b..06481e5e9 100644 --- a/kernel/power/trsm_kernel_LT.S +++ b/kernel/power/trsm_kernel_LT.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -259,7 +259,7 @@ #ifdef linux #ifndef __64BIT__ - mr PREA, r10 + mr PREA, r10 lwz PREB, 8 + STACKSIZE(SP) lwz PREC, 12 + STACKSIZE(SP) #else @@ -367,7 +367,7 @@ LL(10): fmr f13, f0 fmr f14, f0 fmr f15, f0 - + srawi. I, M, 2 #if defined(LN) || defined(RT) @@ -639,7 +639,7 @@ LL(18): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -679,7 +679,7 @@ LL(18): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -1071,7 +1071,7 @@ LL(18): addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #endif - + #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 @@ -1292,7 +1292,7 @@ LL(28): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -1904,7 +1904,7 @@ LL(40): fmr f5, f0 fmr f6, f0 fmr f7, f0 - + srawi. I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A @@ -2117,7 +2117,7 @@ LL(48): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -3061,7 +3061,7 @@ LL(78): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 diff --git a/kernel/power/trsm_kernel_RT.S b/kernel/power/trsm_kernel_RT.S index 533f29953..1777ba86d 100644 --- a/kernel/power/trsm_kernel_RT.S +++ b/kernel/power/trsm_kernel_RT.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -256,7 +256,7 @@ #ifdef linux #ifndef __64BIT__ - mr PREA, r10 + mr PREA, r10 lwz PREB, 8 + STACKSIZE(SP) lwz PREC, 12 + STACKSIZE(SP) #else @@ -511,7 +511,7 @@ LL(78): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 @@ -1100,7 +1100,7 @@ LL(40): fmr f5, f0 fmr f6, f0 fmr f7, f0 - + srawi. I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A @@ -1313,7 +1313,7 @@ LL(48): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -2106,7 +2106,7 @@ LL(10): fmr f13, f0 fmr f14, f0 fmr f15, f0 - + srawi. I, M, 2 #if defined(LN) || defined(RT) @@ -2378,7 +2378,7 @@ LL(18): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -2418,7 +2418,7 @@ LL(18): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -2810,7 +2810,7 @@ LL(18): addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #endif - + #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 @@ -3031,7 +3031,7 @@ LL(28): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) diff --git a/kernel/power/trsm_kernel_cell_LN.S b/kernel/power/trsm_kernel_cell_LN.S index 179db31d2..b5ed925ed 100644 --- a/kernel/power/trsm_kernel_cell_LN.S +++ b/kernel/power/trsm_kernel_cell_LN.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -233,7 +233,7 @@ #ifdef linux #ifndef __64BIT__ - mr PREA, r10 + mr PREA, r10 lwz PREB, 8 + STACKSIZE(SP) lwz PREC, 12 + STACKSIZE(SP) #else @@ -862,7 +862,7 @@ LL(28): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -1166,7 +1166,7 @@ LL(11): dcbtst CO2, PREC dcbtst CO3, PREC dcbtst CO4, PREC - + srawi. r0, TEMP, 2 mtspr CTR, r0 #endif @@ -1384,7 +1384,7 @@ LL(18): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -1424,7 +1424,7 @@ LL(18): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -1816,7 +1816,7 @@ LL(18): addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #endif - + #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 @@ -1895,7 +1895,7 @@ LL(40): fmr f5, f0 fmr f6, f0 fmr f7, f0 - + #if defined(LN) || defined(RT) mr AORIG, A #else @@ -2622,7 +2622,7 @@ LL(48): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -3462,7 +3462,7 @@ LL(78): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 diff --git a/kernel/power/trsm_kernel_cell_LT.S b/kernel/power/trsm_kernel_cell_LT.S index 06b3d9ef1..cdc6f7514 100644 --- a/kernel/power/trsm_kernel_cell_LT.S +++ b/kernel/power/trsm_kernel_cell_LT.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -259,7 +259,7 @@ #ifdef linux #ifndef __64BIT__ - mr PREA, r10 + mr PREA, r10 lwz PREB, 8 + STACKSIZE(SP) lwz PREC, 12 + STACKSIZE(SP) #else @@ -367,7 +367,7 @@ LL(10): fmr f13, f0 fmr f14, f0 fmr f15, f0 - + srawi. I, M, 2 #if defined(LN) || defined(RT) @@ -405,7 +405,7 @@ LL(11): dcbtst CO2, PREC dcbtst CO3, PREC dcbtst CO4, PREC - + srawi. r0, KK, 2 mtspr CTR, r0 mr BO, B @@ -654,7 +654,7 @@ LL(18): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -694,7 +694,7 @@ LL(18): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -1086,7 +1086,7 @@ LL(18): addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #endif - + #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 @@ -1307,7 +1307,7 @@ LL(28): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -1919,7 +1919,7 @@ LL(40): fmr f5, f0 fmr f6, f0 fmr f7, f0 - + srawi. I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A @@ -2132,7 +2132,7 @@ LL(48): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -3076,7 +3076,7 @@ LL(78): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 diff --git a/kernel/power/trsm_kernel_cell_RT.S b/kernel/power/trsm_kernel_cell_RT.S index 51e7bc48b..731f52c19 100644 --- a/kernel/power/trsm_kernel_cell_RT.S +++ b/kernel/power/trsm_kernel_cell_RT.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -233,7 +233,7 @@ #ifdef linux #ifndef __64BIT__ - mr PREA, r10 + mr PREA, r10 lwz PREB, 8 + STACKSIZE(SP) lwz PREC, 12 + STACKSIZE(SP) #else @@ -492,7 +492,7 @@ LL(78): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 @@ -1081,7 +1081,7 @@ LL(40): fmr f5, f0 fmr f6, f0 fmr f7, f0 - + srawi. I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A @@ -1294,7 +1294,7 @@ LL(48): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -2087,7 +2087,7 @@ LL(10): fmr f13, f0 fmr f14, f0 fmr f15, f0 - + srawi. I, M, 2 #if defined(LN) || defined(RT) @@ -2374,7 +2374,7 @@ LL(18): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -2414,7 +2414,7 @@ LL(18): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -2806,7 +2806,7 @@ LL(18): addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #endif - + #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 @@ -3027,7 +3027,7 @@ LL(28): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) diff --git a/kernel/power/trsm_kernel_hummer_LN.S b/kernel/power/trsm_kernel_hummer_LN.S index 32f4d0d73..109dacb8c 100644 --- a/kernel/power/trsm_kernel_hummer_LN.S +++ b/kernel/power/trsm_kernel_hummer_LN.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define ALPHA 0 #define FZERO 8 @@ -70,7 +70,7 @@ #define BO r25 #define AO2 r26 #define BO2 r27 - + #define CO1 r28 #define CO2 r29 #define CO3 r30 @@ -122,7 +122,7 @@ stfpdux f29, SP, r0 stfpdux f30, SP, r0 stfpdux f31, SP, r0 - + stwu r31, -4(SP) stwu r30, -4(SP) stwu r29, -4(SP) @@ -272,7 +272,7 @@ mtspr CTR, r0 ble .L44 #endif - + LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 @@ -1774,7 +1774,7 @@ fxcsmadd f7, B6, A4, f7 LFPDUX A9, AO, INC4 fxcpmadd f11, B4, A4, f11 - nop + nop fxcsmadd f15, B4, A4, f15 bdnz+ .L12 .align 4 @@ -4597,7 +4597,7 @@ mtspr CTR, r0 ble .L114 #endif - + LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B1, BO, INC2 @@ -5428,7 +5428,7 @@ fsmfp f0, f4 fsmfp f1, f5 fsmfp f2, f6 - fsmfp f3, f7 + fsmfp f3, f7 #endif #ifdef LT @@ -5528,7 +5528,7 @@ fsmfp f0, f4 fsmfp f1, f5 fsmfp f2, f6 - fsmfp f3, f7 + fsmfp f3, f7 #endif #ifdef RN diff --git a/kernel/power/trsm_kernel_hummer_LT.S b/kernel/power/trsm_kernel_hummer_LT.S index 027fcf0f1..1ad062a7c 100644 --- a/kernel/power/trsm_kernel_hummer_LT.S +++ b/kernel/power/trsm_kernel_hummer_LT.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define ALPHA 0 #define FZERO 8 @@ -70,7 +70,7 @@ #define BO r25 #define AO2 r26 #define BO2 r27 - + #define CO1 r28 #define CO2 r29 #define CO3 r30 @@ -122,7 +122,7 @@ stfpdux f29, SP, r0 stfpdux f30, SP, r0 stfpdux f31, SP, r0 - + stwu r31, -4(SP) stwu r30, -4(SP) stwu r29, -4(SP) @@ -484,7 +484,7 @@ fxcsmadd f7, B6, A4, f7 LFPDUX A9, AO, INC4 fxcpmadd f11, B4, A4, f11 - nop + nop fxcsmadd f15, B4, A4, f15 bdnz+ .L12 .align 4 @@ -2465,7 +2465,7 @@ mtspr CTR, r0 ble .L44 #endif - + LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 @@ -4667,7 +4667,7 @@ fsmfp f0, f4 fsmfp f1, f5 fsmfp f2, f6 - fsmfp f3, f7 + fsmfp f3, f7 #endif #ifdef LT @@ -4767,7 +4767,7 @@ fsmfp f0, f4 fsmfp f1, f5 fsmfp f2, f6 - fsmfp f3, f7 + fsmfp f3, f7 #endif #ifdef RN @@ -5230,7 +5230,7 @@ mtspr CTR, r0 ble .L114 #endif - + LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B1, BO, INC2 diff --git a/kernel/power/trsm_kernel_hummer_RT.S b/kernel/power/trsm_kernel_hummer_RT.S index e0b5d21f8..94b3c0c85 100644 --- a/kernel/power/trsm_kernel_hummer_RT.S +++ b/kernel/power/trsm_kernel_hummer_RT.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define ALPHA 0 #define FZERO 8 @@ -70,7 +70,7 @@ #define BO r25 #define AO2 r26 #define BO2 r27 - + #define CO1 r28 #define CO2 r29 #define CO3 r30 @@ -122,7 +122,7 @@ stfpdux f29, SP, r0 stfpdux f30, SP, r0 stfpdux f31, SP, r0 - + stwu r31, -4(SP) stwu r30, -4(SP) stwu r29, -4(SP) @@ -527,7 +527,7 @@ fsmfp f0, f4 fsmfp f1, f5 fsmfp f2, f6 - fsmfp f3, f7 + fsmfp f3, f7 #endif #ifdef LT @@ -627,7 +627,7 @@ fsmfp f0, f4 fsmfp f1, f5 fsmfp f2, f6 - fsmfp f3, f7 + fsmfp f3, f7 #endif #ifdef RN @@ -1090,7 +1090,7 @@ mtspr CTR, r0 ble .L114 #endif - + LFPDUX A1, AO, INC2 LFPDUX A2, AO, INC2 LFPDUX B1, BO, INC2 @@ -3355,7 +3355,7 @@ fxcsmadd f7, B6, A4, f7 LFPDUX A9, AO, INC4 fxcpmadd f11, B4, A4, f11 - nop + nop fxcsmadd f15, B4, A4, f15 bdnz+ .L12 .align 4 @@ -5336,7 +5336,7 @@ mtspr CTR, r0 ble .L44 #endif - + LFPDUX A1, AO, INC4 LFPDUX B1, BO, INC4 LFPDUX B2, BO2, INC4 diff --git a/kernel/power/trsm_kernel_power6_LN.S b/kernel/power/trsm_kernel_power6_LN.S index 60ba58780..2f85cd14b 100644 --- a/kernel/power/trsm_kernel_power6_LN.S +++ b/kernel/power/trsm_kernel_power6_LN.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -794,7 +794,7 @@ LL(28): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -1406,7 +1406,7 @@ LL(18): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -1446,7 +1446,7 @@ LL(18): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -1838,7 +1838,7 @@ LL(18): addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #endif - + #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 @@ -1917,7 +1917,7 @@ LL(40): fmr f5, f0 fmr f6, f0 fmr f7, f0 - + #if defined(LN) || defined(RT) mr AORIG, A #else @@ -2644,7 +2644,7 @@ LL(48): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -3484,7 +3484,7 @@ LL(78): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 diff --git a/kernel/power/trsm_kernel_power6_LT.S b/kernel/power/trsm_kernel_power6_LT.S index 448b16369..6b3d21b14 100644 --- a/kernel/power/trsm_kernel_power6_LT.S +++ b/kernel/power/trsm_kernel_power6_LT.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -273,7 +273,7 @@ LL(10): fmr f13, f0 fmr f14, f0 fmr f15, f0 - + srawi. I, M, 2 #if defined(LN) || defined(RT) @@ -650,7 +650,7 @@ LL(18): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -690,7 +690,7 @@ LL(18): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -1082,7 +1082,7 @@ LL(18): addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #endif - + #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 @@ -1303,7 +1303,7 @@ LL(28): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -1915,7 +1915,7 @@ LL(40): fmr f5, f0 fmr f6, f0 fmr f7, f0 - + srawi. I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A @@ -2128,7 +2128,7 @@ LL(48): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -3072,7 +3072,7 @@ LL(78): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 diff --git a/kernel/power/trsm_kernel_power6_RT.S b/kernel/power/trsm_kernel_power6_RT.S index 1f36d17df..f6b2e5cfb 100644 --- a/kernel/power/trsm_kernel_power6_RT.S +++ b/kernel/power/trsm_kernel_power6_RT.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -423,7 +423,7 @@ LL(78): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 @@ -1012,7 +1012,7 @@ LL(40): fmr f5, f0 fmr f6, f0 fmr f7, f0 - + srawi. I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A @@ -1225,7 +1225,7 @@ LL(48): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -2018,7 +2018,7 @@ LL(10): fmr f13, f0 fmr f14, f0 fmr f15, f0 - + srawi. I, M, 2 #if defined(LN) || defined(RT) @@ -2395,7 +2395,7 @@ LL(18): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -2435,7 +2435,7 @@ LL(18): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -2827,7 +2827,7 @@ LL(18): addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #endif - + #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 @@ -3048,7 +3048,7 @@ LL(28): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) diff --git a/kernel/power/trsm_kernel_ppc440_LN.S b/kernel/power/trsm_kernel_ppc440_LN.S index 43354c690..265e79e0f 100644 --- a/kernel/power/trsm_kernel_ppc440_LN.S +++ b/kernel/power/trsm_kernel_ppc440_LN.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -786,7 +786,7 @@ LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -1286,7 +1286,7 @@ LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -1326,7 +1326,7 @@ LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -1718,7 +1718,7 @@ addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #endif - + #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 @@ -1797,7 +1797,7 @@ fmr f5, f0 fmr f6, f0 fmr f7, f0 - + #if defined(LN) || defined(RT) mr AORIG, A #else @@ -2482,7 +2482,7 @@ LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -3284,7 +3284,7 @@ LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 diff --git a/kernel/power/trsm_kernel_ppc440_LT.S b/kernel/power/trsm_kernel_ppc440_LT.S index eb0d4e413..de7ff7415 100644 --- a/kernel/power/trsm_kernel_ppc440_LT.S +++ b/kernel/power/trsm_kernel_ppc440_LT.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -284,7 +284,7 @@ fmr f13, f0 fmr f14, f0 fmr f15, f0 - + srawi. I, M, 2 #if defined(LN) || defined(RT) @@ -548,7 +548,7 @@ LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -588,7 +588,7 @@ LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -980,7 +980,7 @@ addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #endif - + #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 @@ -1198,7 +1198,7 @@ LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -1796,7 +1796,7 @@ fmr f5, f0 fmr f6, f0 fmr f7, f0 - + srawi. I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A @@ -1989,7 +1989,7 @@ LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -2893,7 +2893,7 @@ LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 diff --git a/kernel/power/trsm_kernel_ppc440_RT.S b/kernel/power/trsm_kernel_ppc440_RT.S index 54c59c269..e8d202d97 100644 --- a/kernel/power/trsm_kernel_ppc440_RT.S +++ b/kernel/power/trsm_kernel_ppc440_RT.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -416,7 +416,7 @@ LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f2, f18, f2 @@ -985,7 +985,7 @@ fmr f5, f0 fmr f6, f0 fmr f7, f0 - + srawi. I, M, 2 #if defined(LN) || defined(RT) mr AORIG, A @@ -1178,7 +1178,7 @@ LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -1949,7 +1949,7 @@ fmr f13, f0 fmr f14, f0 fmr f15, f0 - + srawi. I, M, 2 #if defined(LN) || defined(RT) @@ -2213,7 +2213,7 @@ LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -2253,7 +2253,7 @@ LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -2645,7 +2645,7 @@ addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #endif - + #ifdef RT slwi r0, K, 2 + BASE_SHIFT add AORIG, AORIG, r0 @@ -2863,7 +2863,7 @@ LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) diff --git a/kernel/power/zamax.S b/kernel/power/zamax.S index 6acd96dcc..2c1e3b769 100644 --- a/kernel/power/zamax.S +++ b/kernel/power/zamax.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define PREA r8 #define INCXM1 r9 diff --git a/kernel/power/zamax_cell.S b/kernel/power/zamax_cell.S index 2af3d2411..a693d5f1b 100644 --- a/kernel/power/zamax_cell.S +++ b/kernel/power/zamax_cell.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define PREA r8 #define INCXM1 r9 diff --git a/kernel/power/zamax_hummer.S b/kernel/power/zamax_hummer.S index 84312395c..0f9705333 100644 --- a/kernel/power/zamax_hummer.S +++ b/kernel/power/zamax_hummer.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define INCX2 r6 @@ -86,7 +86,7 @@ stfpdux f14, SP, r10 stfpdux f15, SP, r10 - + stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 diff --git a/kernel/power/zamax_ppc440.S b/kernel/power/zamax_ppc440.S index 17372bbba..276ef421b 100644 --- a/kernel/power/zamax_ppc440.S +++ b/kernel/power/zamax_ppc440.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define PREX r8 #define INC1 r9 @@ -86,7 +86,7 @@ sub X, X, INCX li INC1, SIZE - + cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 diff --git a/kernel/power/zamin.S b/kernel/power/zamin.S index 1ab8b6b39..52168e4b5 100644 --- a/kernel/power/zamin.S +++ b/kernel/power/zamin.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define PREA r8 #define INCXM1 r9 diff --git a/kernel/power/zamin_cell.S b/kernel/power/zamin_cell.S index 6d32f60c8..45e40050f 100644 --- a/kernel/power/zamin_cell.S +++ b/kernel/power/zamin_cell.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define PREA r8 #define INCXM1 r9 diff --git a/kernel/power/zamin_hummer.S b/kernel/power/zamin_hummer.S index 5ac1b8960..ff685b6ce 100644 --- a/kernel/power/zamin_hummer.S +++ b/kernel/power/zamin_hummer.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define INCX2 r6 @@ -86,7 +86,7 @@ stfpdux f14, SP, r10 stfpdux f15, SP, r10 - + stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 diff --git a/kernel/power/zamin_ppc440.S b/kernel/power/zamin_ppc440.S index 9d70f7608..60888a897 100644 --- a/kernel/power/zamin_ppc440.S +++ b/kernel/power/zamin_ppc440.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define PREX r8 #define INC1 r9 diff --git a/kernel/power/zasum.S b/kernel/power/zasum.S index 14b58ce1a..e49011f39 100644 --- a/kernel/power/zasum.S +++ b/kernel/power/zasum.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define INCXM1 r9 #define PREA r8 diff --git a/kernel/power/zasum_cell.S b/kernel/power/zasum_cell.S index 7389468f3..111285d13 100644 --- a/kernel/power/zasum_cell.S +++ b/kernel/power/zasum_cell.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define PREA r8 #define INCXM1 r9 diff --git a/kernel/power/zasum_hummer.S b/kernel/power/zasum_hummer.S index f090e69f4..13b69701b 100644 --- a/kernel/power/zasum_hummer.S +++ b/kernel/power/zasum_hummer.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define INCX2 r6 #define X2 r7 @@ -73,7 +73,7 @@ stfpdux f14, SP, r10 stfpdux f15, SP, r10 - + li r10, 0 stwu r10, -4(SP) stwu r10, -4(SP) diff --git a/kernel/power/zasum_ppc440.S b/kernel/power/zasum_ppc440.S index 213c837bb..dd00c54f8 100644 --- a/kernel/power/zasum_ppc440.S +++ b/kernel/power/zasum_ppc440.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define INCXM1 r9 #define PREX r8 diff --git a/kernel/power/zaxpy.S b/kernel/power/zaxpy.S index 7eb591d1b..1acd729ae 100644 --- a/kernel/power/zaxpy.S +++ b/kernel/power/zaxpy.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifdef linux #ifndef __64BIT__ #define N r3 @@ -53,7 +53,7 @@ #else #define N r3 #define X r8 -#define INCX r9 +#define INCX r9 #define Y r10 #define INCY r4 #define INCXM1 r5 @@ -152,9 +152,9 @@ #ifdef L1_DUALFETCH li PREA, (L1_PREFETCHSIZE) / 2 #else - li PREA, (L1_PREFETCHSIZE) + li PREA, (L1_PREFETCHSIZE) #endif - + cmpwi cr0, N, 0 ble- LL(999) diff --git a/kernel/power/zaxpy_hummer.S b/kernel/power/zaxpy_hummer.S index 41b34954e..23e702ee8 100644 --- a/kernel/power/zaxpy_hummer.S +++ b/kernel/power/zaxpy_hummer.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r6 -#define INCX r7 +#define INCX r7 #define Y r8 #define INCY r9 @@ -481,7 +481,7 @@ LL(117): LL(999): li r10, 16 subi SP, SP, 16 - + lfpdux f25, SP, r10 lfpdux f24, SP, r10 lfpdux f23, SP, r10 diff --git a/kernel/power/zaxpy_ppc440.S b/kernel/power/zaxpy_ppc440.S index 5100e9442..1ac232444 100644 --- a/kernel/power/zaxpy_ppc440.S +++ b/kernel/power/zaxpy_ppc440.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifdef linux #ifndef __64BIT__ #define N r3 @@ -51,7 +51,7 @@ #else #define N r3 #define X r8 -#define INCX r9 +#define INCX r9 #define Y r5 #define INCY r4 #define YY r6 @@ -96,7 +96,7 @@ PROFCODE subi SP, SP, STACKSIZE - + stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) diff --git a/kernel/power/zcopy.S b/kernel/power/zcopy.S index f5ed2f99d..b75550370 100644 --- a/kernel/power/zcopy.S +++ b/kernel/power/zcopy.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define Y r6 #define INCY r7 #define PREA r8 @@ -69,7 +69,7 @@ #ifdef L1_DUALFETCH li PREA, (L1_PREFETCHSIZE) / 2 #else - li PREA, (L1_PREFETCHSIZE) + li PREA, (L1_PREFETCHSIZE) #endif cmpwi cr0, N, 0 diff --git a/kernel/power/zcopy_hummer.S b/kernel/power/zcopy_hummer.S index 825b44016..cbf4acda0 100644 --- a/kernel/power/zcopy_hummer.S +++ b/kernel/power/zcopy_hummer.S @@ -38,12 +38,12 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define Y r6 -#define INCY r7 +#define INCY r7 #define INCX2 r8 #define INCY2 r9 @@ -75,7 +75,7 @@ stfpdux f14, SP, r10 stfpdux f15, SP, r10 - + slwi INCX, INCX, BASE_SHIFT slwi INCY, INCY, BASE_SHIFT add INCX2, INCX, INCX @@ -277,7 +277,7 @@ LL(23): LL(25): andi. r0, N, 7 beq LL(29) - + andi. r0, N, 4 beq LL(26) @@ -416,7 +416,7 @@ LL(33): LL(35): andi. r0, N, 7 beq LL(999) - + andi. r0, N, 4 beq LL(36) @@ -645,7 +645,7 @@ LL(999): lfpdux f15, SP, r10 lfpdux f14, SP, r10 - + addi SP, SP, 16 blr diff --git a/kernel/power/zdot.S b/kernel/power/zdot.S index dab7eaa49..f6a68aab4 100644 --- a/kernel/power/zdot.S +++ b/kernel/power/zdot.S @@ -38,19 +38,19 @@ #define ASSEMBLER #include "common.h" - + #if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) #define RESULT r3 #define N r4 #define X r5 -#define INCX r6 +#define INCX r6 #define Y r7 #define INCY r8 #define PREA r9 #else #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define Y r6 #define INCY r7 #define PREA r8 @@ -127,7 +127,7 @@ #ifdef L1_DUALFETCH li PREA, (L1_PREFETCHSIZE) / 2 #else - li PREA, (L1_PREFETCHSIZE) + li PREA, (L1_PREFETCHSIZE) #endif cmpwi cr0, N, 0 diff --git a/kernel/power/zdot_cell.S b/kernel/power/zdot_cell.S index 66b7dfa09..1fe15df9b 100644 --- a/kernel/power/zdot_cell.S +++ b/kernel/power/zdot_cell.S @@ -38,19 +38,19 @@ #define ASSEMBLER #include "common.h" - + #if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) #define RESULT r3 #define N r4 #define X r5 -#define INCX r6 +#define INCX r6 #define Y r7 #define INCY r8 #define PREA r9 #else #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define Y r6 #define INCY r7 #define PREA r8 @@ -286,7 +286,7 @@ LL(20): addi Y, Y, 16 * SIZE FMADD f2, f9, f16, f2 nop - + FMADD f4, f10, f18, f4 FMADD f7, f10, f19, f7 FMADD f5, f11, f19, f5 diff --git a/kernel/power/zdot_hummer.S b/kernel/power/zdot_hummer.S index 83027cfd6..fa5003cd1 100644 --- a/kernel/power/zdot_hummer.S +++ b/kernel/power/zdot_hummer.S @@ -38,18 +38,18 @@ #define ASSEMBLER #include "common.h" - + #if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) #define RESULT r3 #define N r4 #define X r5 -#define INCX r6 +#define INCX r6 #define Y r7 #define INCY r8 #else #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define Y r6 #define INCY r7 #endif @@ -97,7 +97,7 @@ stfpdux f14, SP, r10 stfpdux f15, SP, r10 - + stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 diff --git a/kernel/power/zdot_ppc440.S b/kernel/power/zdot_ppc440.S index 3340e6596..490418cfb 100644 --- a/kernel/power/zdot_ppc440.S +++ b/kernel/power/zdot_ppc440.S @@ -38,19 +38,19 @@ #define ASSEMBLER #include "common.h" - + #if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) #define RESULT r3 #define N r4 #define X r5 -#define INCX r6 +#define INCX r6 #define Y r7 #define INCY r8 #define PRE r9 #else #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define Y r6 #define INCY r7 #define PRE r8 diff --git a/kernel/power/zgemm_beta.S b/kernel/power/zgemm_beta.S index c936a3d43..4a9cbd8bb 100644 --- a/kernel/power/zgemm_beta.S +++ b/kernel/power/zgemm_beta.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M r3 #define N r4 #define C r10 @@ -86,11 +86,11 @@ #endif #endif - + slwi LDC, LDC, ZBASE_SHIFT lfs f0, 16(SP) - + fmr ALPHA_R, f1 fmr ALPHA_I, f2 @@ -138,7 +138,7 @@ LL(12): addi CO1, CO1, 16 * SIZE bdnz LL(12) .align 4 - + LL(15): andi. r0, M, 7 mtspr CTR, r0 @@ -211,7 +211,7 @@ LL(22): dcbtst PRE, CO1 bdnz LL(22) .align 4 - + LL(25): andi. r0, M, 3 mtspr CTR, r0 diff --git a/kernel/power/zgemm_kernel.S b/kernel/power/zgemm_kernel.S index 5fef0da3d..3d6689531 100644 --- a/kernel/power/zgemm_kernel.S +++ b/kernel/power/zgemm_kernel.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -105,7 +105,7 @@ #define PREA r30 #define PREC r31 #define PREB PREA - + #ifndef NEEDPARAM PROLOGUE @@ -335,7 +335,7 @@ LL(11): LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) - + #ifdef POWER5 LFD f28, 4 * SIZE(B) LFD f29, 5 * SIZE(B) @@ -564,7 +564,7 @@ LL(12): LFD f30, 22 * SIZE(BO) LFD f31, 23 * SIZE(BO) #endif - + addi AO, AO, 16 * SIZE addi BO, BO, 16 * SIZE @@ -831,7 +831,7 @@ LL(KERNEL_MainFinish): addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE - + #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) diff --git a/kernel/power/zgemm_kernel_altivec.S b/kernel/power/zgemm_kernel_altivec.S index b55300ef6..2267e975a 100644 --- a/kernel/power/zgemm_kernel_altivec.S +++ b/kernel/power/zgemm_kernel_altivec.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -1624,7 +1624,7 @@ LL(98): STFD f8, 0 * SIZE(CO1) STFD f9, 1 * SIZE(CO1) .align 4 - + LL(999): mr SP, STACK diff --git a/kernel/power/zgemm_kernel_altivec_cell.S b/kernel/power/zgemm_kernel_altivec_cell.S index 7b80e6601..9a1407d6e 100644 --- a/kernel/power/zgemm_kernel_altivec_cell.S +++ b/kernel/power/zgemm_kernel_altivec_cell.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -1779,7 +1779,7 @@ LL(98): STFD f8, 0 * SIZE(CO1) STFD f9, 1 * SIZE(CO1) .align 4 - + LL(999): mr SP, STACK diff --git a/kernel/power/zgemm_kernel_altivec_g4.S b/kernel/power/zgemm_kernel_altivec_g4.S index f82734833..4c774a1e3 100644 --- a/kernel/power/zgemm_kernel_altivec_g4.S +++ b/kernel/power/zgemm_kernel_altivec_g4.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -1678,7 +1678,7 @@ LL(98): STFD f8, 0 * SIZE(CO1) STFD f9, 1 * SIZE(CO1) .align 4 - + LL(999): mr SP, STACK diff --git a/kernel/power/zgemm_kernel_cell.S b/kernel/power/zgemm_kernel_cell.S index f0d32048b..5667b130d 100644 --- a/kernel/power/zgemm_kernel_cell.S +++ b/kernel/power/zgemm_kernel_cell.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -105,7 +105,7 @@ #define PREA r30 #define PREC r31 #define PREB PREA - + #ifndef NEEDPARAM #ifndef DOUBLE @@ -778,7 +778,7 @@ LL(KERNEL_MainFinish): addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE - + #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) diff --git a/kernel/power/zgemm_kernel_g4.S b/kernel/power/zgemm_kernel_g4.S index c652adf8a..af6f88e99 100644 --- a/kernel/power/zgemm_kernel_g4.S +++ b/kernel/power/zgemm_kernel_g4.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -660,7 +660,7 @@ addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE - + #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) diff --git a/kernel/power/zgemm_kernel_hummer.S b/kernel/power/zgemm_kernel_hummer.S index 7378950e8..991a64373 100644 --- a/kernel/power/zgemm_kernel_hummer.S +++ b/kernel/power/zgemm_kernel_hummer.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #undef ZERO #define ALPHA 0 @@ -72,7 +72,7 @@ #define BO r25 #define AO2 r26 #define BO2 r27 - + #define CO1 r28 #define CO2 r29 #define ZERO r31 @@ -131,7 +131,7 @@ stfpdux f29, SP, r0 stfpdux f30, SP, r0 stfpdux f31, SP, r0 - + stwu r31, -4(SP) stwu r30, -4(SP) stwu r29, -4(SP) @@ -201,7 +201,7 @@ #endif addi AO, A, -4 * SIZE - + li r0, FZERO lfpsx f0, SP, r0 @@ -1435,7 +1435,7 @@ #endif addi AO, A, -2 * SIZE - + li r0, FZERO lfpsx f0, SP, r0 @@ -2273,7 +2273,7 @@ #endif addi AO, A, -4 * SIZE - + li r0, FZERO lfpsx f0, SP, r0 @@ -2629,7 +2629,7 @@ #endif FXCSMADD f13, B2, A2, f13 nop - + FXCPMADD f2, B5, A8, f2 nop FXCSMADD f6, B5, A8, f6 @@ -3576,7 +3576,7 @@ #endif addi AO, A, -2 * SIZE - + li r0, FZERO lfpsx f0, SP, r0 diff --git a/kernel/power/zgemm_kernel_power3.S b/kernel/power/zgemm_kernel_power3.S index 716fa885f..d7d6e2aea 100644 --- a/kernel/power/zgemm_kernel_power3.S +++ b/kernel/power/zgemm_kernel_power3.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -99,7 +99,7 @@ #define PREA r30 #define PREC r31 #define PREB PREA - + #ifndef NEEDPARAM #ifndef DOUBLE @@ -291,7 +291,7 @@ LL(KERNEL_MainSubHead): LFD f21, 1 * SIZE(B) LFD f22, 2 * SIZE(B) LFD f23, 3 * SIZE(B) - + LFD f24, 4 * SIZE(AO) LFD f25, 5 * SIZE(AO) LFD f26, 6 * SIZE(AO) @@ -601,7 +601,7 @@ LL(KERNEL_MainFinish): addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE - + addic. I, I, -1 bgt LL(KERNEL_MainSubHead) .align 4 diff --git a/kernel/power/zgemm_kernel_power6.S b/kernel/power/zgemm_kernel_power6.S index 7f677dfec..3f79c0523 100644 --- a/kernel/power/zgemm_kernel_power6.S +++ b/kernel/power/zgemm_kernel_power6.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -107,7 +107,7 @@ #define PREA r30 #define PREC r31 - + #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define FMA1 FMADD #define FMA2 FMADD @@ -1056,7 +1056,7 @@ LL(18): addi CO2, CO2, 4 * SIZE addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE - + #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) @@ -1445,7 +1445,7 @@ LL(28): addi CO2, CO2, 2 * SIZE addi CO3, CO3, 2 * SIZE addi CO4, CO4, 2 * SIZE - + #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) @@ -2016,7 +2016,7 @@ LL(38): addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE - + #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) @@ -2271,7 +2271,7 @@ LL(48): addi CO1, CO1, 2 * SIZE addi CO2, CO2, 2 * SIZE - + #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) @@ -2644,7 +2644,7 @@ LL(58): fmr f11, f0 addi CO1, CO1, 4 * SIZE - + #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) @@ -2847,7 +2847,7 @@ LL(68): STFD f1, 1 * SIZE(CO1) addi CO1, CO1, 2 * SIZE - + #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) diff --git a/kernel/power/zgemm_kernel_ppc440.S b/kernel/power/zgemm_kernel_ppc440.S index 2a80c97f8..075fa2b4f 100644 --- a/kernel/power/zgemm_kernel_ppc440.S +++ b/kernel/power/zgemm_kernel_ppc440.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -723,7 +723,7 @@ addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE - + #ifdef TRMMKERNEL #if ( defined(LEFT) && defined(TRANSA)) || \ (!defined(LEFT) && !defined(TRANSA)) diff --git a/kernel/power/zgemm_ncopy_hummer_2.S b/kernel/power/zgemm_ncopy_hummer_2.S index 9a6f80269..8a2ac935a 100644 --- a/kernel/power/zgemm_ncopy_hummer_2.S +++ b/kernel/power/zgemm_ncopy_hummer_2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M r3 #define N r4 #define A r5 @@ -77,7 +77,7 @@ stfpdux f14, SP, r0 stfpdux f15, SP, r0 - + stwu r31, -4(SP) stwu r30, -4(SP) @@ -146,7 +146,7 @@ LL(12): STFPDUX c16, B, INC2 bdnz LL(12) .align 4 - + LL(15): andi. r0, M, 7 ble LL(19) @@ -227,7 +227,7 @@ LL(22): STFPDUX c07, B, INC2 bdnz LL(22) .align 4 - + LL(25): andi. r0, M, 3 ble LL(99) @@ -321,7 +321,7 @@ LL(112): STFPDUX c15, B, INC2 bdnz LL(112) .align 4 - + LL(115): andi. r0, M, 3 ble LL(119) @@ -404,7 +404,7 @@ LL(122): STFPDUX c07, B, INC2 bdnz LL(122) .align 4 - + LL(125): andi. r0, M, 3 ble LL(999) diff --git a/kernel/power/zgemm_ncopy_hummer_4.S b/kernel/power/zgemm_ncopy_hummer_4.S index 0a64d0d05..e3c4b7192 100644 --- a/kernel/power/zgemm_ncopy_hummer_4.S +++ b/kernel/power/zgemm_ncopy_hummer_4.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M r3 #define N r4 #define A r5 @@ -79,7 +79,7 @@ stfpdux f14, SP, r0 stfpdux f15, SP, r0 - + stwu r31, -4(SP) stwu r30, -4(SP) @@ -154,7 +154,7 @@ LL(12): STFPDUX c16, B, INC2 bdnz LL(12) .align 4 - + LL(15): andi. r0, M, 3 ble LL(19) @@ -237,7 +237,7 @@ LL(22): STFPDUX c08, B, INC2 bdnz LL(22) .align 4 - + LL(25): andi. r0, M, 3 ble LL(30) @@ -290,7 +290,7 @@ LL(32): STFPDUX c04, B, INC2 bdnz LL(32) .align 4 - + LL(35): andi. r0, M, 3 ble LL(99) @@ -406,7 +406,7 @@ LL(112): STFPDUX c16, B, INC2 bdnz LL(112) .align 4 - + LL(115): andi. r0, M, 3 ble LL(119) @@ -536,7 +536,7 @@ LL(122): STFPDUX c15, B, INC2 bdnz LL(122) .align 4 - + LL(125): andi. r0, M, 3 ble LL(130) @@ -614,7 +614,7 @@ LL(132): STFPDUX c07, B, INC2 bdnz LL(132) .align 4 - + LL(135): andi. r0, M, 3 ble LL(999) diff --git a/kernel/power/zgemm_tcopy_hummer_2.S b/kernel/power/zgemm_tcopy_hummer_2.S index bc2a083cf..d5dea2f24 100644 --- a/kernel/power/zgemm_tcopy_hummer_2.S +++ b/kernel/power/zgemm_tcopy_hummer_2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M r3 #define N r4 #define A r5 @@ -55,7 +55,7 @@ #define M4 r29 #define INC r30 #define INC2 r31 - + #define c01 f0 #define c02 f1 #define c03 f2 @@ -130,7 +130,7 @@ LL(12): STFPDUX c04, B1, INC2 bdnz LL(12) .align 4 - + LL(15): andi. r0, N, 1 ble LL(19) @@ -168,7 +168,7 @@ LL(22): STFPDUX c02, B1, INC2 bdnz LL(22) .align 4 - + LL(23): andi. r0, N, 1 ble LL(99) @@ -230,7 +230,7 @@ LL(112): STFPDUX c07, B1, INC2 bdnz LL(112) .align 4 - + LL(115): andi. r0, N, 1 ble LL(119) @@ -278,7 +278,7 @@ LL(122): STFPDUX c03, B1, INC2 bdnz LL(122) .align 4 - + LL(123): andi. r0, N, 1 ble LL(999) diff --git a/kernel/power/zgemm_tcopy_hummer_4.S b/kernel/power/zgemm_tcopy_hummer_4.S index 7011dc2d8..aae4e73c0 100644 --- a/kernel/power/zgemm_tcopy_hummer_4.S +++ b/kernel/power/zgemm_tcopy_hummer_4.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M r3 #define N r4 #define A r5 @@ -57,7 +57,7 @@ #define M4 r29 #define INC r30 #define INC2 r31 - + #define c01 f0 #define c02 f1 #define c03 f2 @@ -184,7 +184,7 @@ LL(12): STFPDUX c16, B1, INC2 bdnz LL(12) .align 4 - + LL(15): andi. r0, N, 3 ble LL(19) @@ -271,7 +271,7 @@ LL(22): STFPDUX c08, B1, INC2 bdnz LL(22) .align 4 - + LL(23): andi. r0, N, 2 ble LL(24) @@ -323,7 +323,7 @@ LL(32): STFPDUX c04, B1, INC2 bdnz LL(32) .align 4 - + LL(33): andi. r0, N, 2 ble LL(34) @@ -447,7 +447,7 @@ LL(112): STFPDUX c16, B1, INC2 bdnz LL(112) .align 4 - + LL(115): andi. r0, N, 3 ble LL(119) @@ -576,7 +576,7 @@ LL(122): STFPDUX c15, B1, INC2 bdnz LL(122) .align 4 - + LL(123): andi. r0, N, 2 ble LL(124) @@ -651,7 +651,7 @@ LL(132): STFPDUX c07, B1, INC2 bdnz LL(132) .align 4 - + LL(133): andi. r0, N, 2 ble LL(134) diff --git a/kernel/power/zgemv_n.S b/kernel/power/zgemv_n.S index 00ba966ac..ba4685dec 100644 --- a/kernel/power/zgemv_n.S +++ b/kernel/power/zgemv_n.S @@ -82,7 +82,7 @@ #define INCY r7 #endif #endif - + #define I r11 #define J r12 @@ -2155,7 +2155,7 @@ LL(37): add Y2, Y2, INCY b LL(999) .align 4 - + LL(100): srawi. J, N, 2 ble LL(120) diff --git a/kernel/power/zgemv_n_ppc440.S b/kernel/power/zgemv_n_ppc440.S index 690eb0d46..31e720261 100644 --- a/kernel/power/zgemv_n_ppc440.S +++ b/kernel/power/zgemv_n_ppc440.S @@ -82,7 +82,7 @@ #define INCY r7 #endif #endif - + #define I r11 #define J r12 @@ -1241,7 +1241,7 @@ LL(37): STFDU y01, 1 * SIZE(Y2) STFDU y02, 1 * SIZE(Y2) .align 4 - + LL(990): cmpi cr0, 0, INCY, SIZE beq LL(999) diff --git a/kernel/power/zgemv_t.S b/kernel/power/zgemv_t.S index 057c04d62..bd8ac4043 100644 --- a/kernel/power/zgemv_t.S +++ b/kernel/power/zgemv_t.S @@ -257,7 +257,7 @@ stfd f1, ALPHA_R stfd f2, ALPHA_I - + mullw PLDA_M, LDA, N li XP, P subf PLDA_M, XP, PLDA_M @@ -669,7 +669,7 @@ LL(MainKernel): addi BO, BO, 16 * SIZE bdnz LL(MainKernel) - .align 4 + .align 4 LL(MainKernelSkip): FMADD f0, f16, f24, f0 @@ -984,7 +984,7 @@ LL(MainN3Kernel): addi AO4, AO4, 2 * SIZE bdnz LL(MainN3Kernel) - .align 4 + .align 4 LL(MainN3KernelSkip): FMADD f0, f16, f24, f0 @@ -1159,7 +1159,7 @@ LL(FinishN1): cmpwi cr0, J, 0 bgt LL(MainHead) .align 4 - + LL(Remain): andi. J, N, 3 ble LL(ISEnd) @@ -1301,7 +1301,7 @@ LL(RemainKernel): DCBT(AO1, PREA) bdnz LL(RemainKernel) - .align 4 + .align 4 LL(RemainKernelSkip): FMADD f0, f16, f24, f0 @@ -1393,7 +1393,7 @@ LL(RemainN3Kernel): LFDU f25, 2 * SIZE(BO) addi AO1, AO1, 2 * SIZE bdnz LL(RemainN3Kernel) - .align 4 + .align 4 LL(RemainN3KernelSkip): FMADD f0, f16, f24, f0 diff --git a/kernel/power/zgemv_t_ppc440.S b/kernel/power/zgemv_t_ppc440.S index edb5183fc..043b9e37b 100644 --- a/kernel/power/zgemv_t_ppc440.S +++ b/kernel/power/zgemv_t_ppc440.S @@ -542,7 +542,7 @@ LL(12): LFDU a7, 1 * SIZE(AO4) FMADD4 y8, a8, b3, y8 bdnz LL(12) - .align 4 + .align 4 LL(13): FMADD1 y1, a1, b1, y1 @@ -804,7 +804,7 @@ LL(19): cmpwi cr0, J, 0 bgt LL(11) .align 4 - + LL(20): andi. J, N, 2 ble LL(30) @@ -920,7 +920,7 @@ LL(22): FMADD4 y4, a4, b3, y4 bdnz LL(22) - .align 4 + .align 4 LL(23): FMADD1 y1, a1, b1, y1 @@ -1147,7 +1147,7 @@ LL(32): LFDU a2, 1 * SIZE(AO1) bdnz LL(32) - .align 4 + .align 4 LL(33): FMADD1 y1, a1, b1, y1 diff --git a/kernel/power/zger.S b/kernel/power/zger.S index 03d0bca7b..01cb90731 100644 --- a/kernel/power/zger.S +++ b/kernel/power/zger.S @@ -342,7 +342,7 @@ LL(06): addi X1, X1, 2 * SIZE bdnz+ LL(06) .align 4 - + LL(10): srawi. J, N, 1 ble LL(20) @@ -937,7 +937,7 @@ LL(19): cmpi cr0, 0, J, 0 bgt LL(11) .align 4 - + LL(20): andi. J, N, 1 ble LL(999) diff --git a/kernel/power/znrm2.S b/kernel/power/znrm2.S index ded25fdd1..60f379d25 100644 --- a/kernel/power/znrm2.S +++ b/kernel/power/znrm2.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define NN r6 #define XX r7 diff --git a/kernel/power/znrm2_hummer.S b/kernel/power/znrm2_hummer.S index b6deb9447..1d0c598f8 100644 --- a/kernel/power/znrm2_hummer.S +++ b/kernel/power/znrm2_hummer.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define INCX2 r6 #define X2 r7 @@ -91,7 +91,7 @@ stfpdux f14, SP, r10 stfpdux f15, SP, r10 - + stfpdux f16, SP, r10 stfpdux f17, SP, r10 stfpdux f18, SP, r10 @@ -309,7 +309,7 @@ LL(20): fdiv ALPHA_R, ALPHA_R, ALPHA lfpsx C1, SP, r10 # Zero clear - + fpmr C2, C1 fpmr C3, C1 fpmr C4, C1 @@ -755,7 +755,7 @@ LL(120): fdiv ALPHA_R, ALPHA_R, ALPHA lfpsx C1, SP, r10 # Zero clear - + fpmr C2, C1 fpmr C3, C1 fpmr C4, C1 diff --git a/kernel/power/znrm2_ppc440.S b/kernel/power/znrm2_ppc440.S index 354227917..778b805de 100644 --- a/kernel/power/znrm2_ppc440.S +++ b/kernel/power/znrm2_ppc440.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define NN r6 #define XX r7 @@ -107,7 +107,7 @@ sub X, X, INCX li INC1, SIZE - li PRE, 3 * 16 * SIZE + li PRE, 3 * 16 * SIZE cmpwi cr0, N, 0 ble- LL(999) diff --git a/kernel/power/zrot.S b/kernel/power/zrot.S index aad28af05..3ec4277f6 100644 --- a/kernel/power/zrot.S +++ b/kernel/power/zrot.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define Y r6 #define INCY r7 #define PREA r8 @@ -88,7 +88,7 @@ srawi. r0, N, 3 mtspr CTR, r0 beq- cr0, LL(50) - + LFD f0, 0 * SIZE(X) LFD f4, 1 * SIZE(X) LFD f6, 2 * SIZE(X) diff --git a/kernel/power/zrot_ppc440.S b/kernel/power/zrot_ppc440.S index fe1a99dc5..abde97ef3 100644 --- a/kernel/power/zrot_ppc440.S +++ b/kernel/power/zrot_ppc440.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r4 -#define INCX r5 +#define INCX r5 #define Y r6 #define INCY r7 #define PRE r8 diff --git a/kernel/power/zscal.S b/kernel/power/zscal.S index 7ffa80f19..2eb7b0df3 100644 --- a/kernel/power/zscal.S +++ b/kernel/power/zscal.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define XX r4 #define PREA r5 @@ -66,7 +66,7 @@ #define FZERO f0 #define ALPHA_R f1 #define ALPHA_I f2 - + PROLOGUE PROFCODE @@ -80,7 +80,7 @@ #if (defined(_AIX) || defined(__APPLE__)) && !defined(__64BIT__) && defined(DOUBLE) lwz INCX, 56(SP) #endif - + slwi INCX, INCX, ZBASE_SHIFT li PREA, L1_PREFETCHSIZE diff --git a/kernel/power/zscal_hummer.S b/kernel/power/zscal_hummer.S index 6c559f3f2..56fd5d182 100644 --- a/kernel/power/zscal_hummer.S +++ b/kernel/power/zscal_hummer.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define X r6 -#define INCX r7 +#define INCX r7 #define INCX2 r4 #define XX r5 @@ -78,7 +78,7 @@ stfpdux f15, SP, r10 stfpdux f16, SP, r10 stfpdux f17, SP, r10 - + li r10, 0 stwu r10, -4(SP) stwu r10, -4(SP) diff --git a/kernel/power/zscal_ppc440.S b/kernel/power/zscal_ppc440.S index 9f120acfa..d0e4c9bcf 100644 --- a/kernel/power/zscal_ppc440.S +++ b/kernel/power/zscal_ppc440.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define N r3 #define XX r4 #define PRE r5 @@ -68,7 +68,7 @@ #define FZERO f0 #define ALPHA_R f1 #define ALPHA_I f2 - + PROLOGUE PROFCODE @@ -82,7 +82,7 @@ #if (defined(_AIX) || defined(__APPLE__)) && !defined(__64BIT__) && defined(DOUBLE) lwz INCX, 56(SP) #endif - + slwi INCX, INCX, ZBASE_SHIFT li INC1, SIZE sub X, X, INCX diff --git a/kernel/power/zswap.S b/kernel/power/zswap.S index 4c23c1d5e..048e8ac5f 100644 --- a/kernel/power/zswap.S +++ b/kernel/power/zswap.S @@ -38,12 +38,12 @@ #define ASSEMBLER #include "common.h" - + #ifdef linux #ifndef __64BIT__ #define N r3 #define X r6 -#define INCX r7 +#define INCX r7 #define Y r8 #define INCY r9 #define PREA r4 @@ -52,7 +52,7 @@ #else #define N r3 #define X r8 -#define INCX r9 +#define INCX r9 #define Y r10 #define INCY r4 #define PREA r5 @@ -65,7 +65,7 @@ #if !defined(__64BIT__) && defined(DOUBLE) #define N r3 #define X r10 -#define INCX r4 +#define INCX r4 #define Y r5 #define INCY r6 #define PREA r7 @@ -74,7 +74,7 @@ #else #define N r3 #define X r8 -#define INCX r9 +#define INCX r9 #define Y r10 #define INCY r4 #define PREA r5 @@ -120,7 +120,7 @@ #if defined(linux) && defined(__64BIT__) ld INCY, 112 + STACKSIZE(SP) #endif - + #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ ld INCY, 112 + STACKSIZE(SP) @@ -143,7 +143,7 @@ #ifdef L1_DUALFETCH li PREA, (L1_PREFETCHSIZE) / 2 #else - li PREA, (L1_PREFETCHSIZE) + li PREA, (L1_PREFETCHSIZE) #endif cmpwi cr0, N, 0 diff --git a/kernel/power/zswap_hummer.S b/kernel/power/zswap_hummer.S index 335eaa11c..3674cdc11 100644 --- a/kernel/power/zswap_hummer.S +++ b/kernel/power/zswap_hummer.S @@ -41,9 +41,9 @@ #define N r3 #define X r6 -#define INCX r7 +#define INCX r7 #define Y r8 -#define INCY r9 +#define INCY r9 #define INCX2 r4 #define INCY2 r5 @@ -78,7 +78,7 @@ stfpdux f14, SP, r10 stfpdux f15, SP, r10 stfpdux f16, SP, r10 - + slwi INCX, INCX, BASE_SHIFT slwi INCY, INCY, BASE_SHIFT add INCX2, INCX, INCX @@ -283,7 +283,7 @@ LL(23): LL(25): andi. r0, N, 3 beq LL(29) - + andi. r0, N, 2 beq LL(27) @@ -428,7 +428,7 @@ LL(33): LL(35): andi. r0, N, 3 beq LL(39) - + andi. r0, N, 2 beq LL(37) @@ -658,7 +658,7 @@ LL(999): lfpdux f16, SP, r10 lfpdux f15, SP, r10 lfpdux f14, SP, r10 - + addi SP, SP, 16 blr diff --git a/kernel/power/zsymv_L.S b/kernel/power/zsymv_L.S index 0dca84d51..ad4a8cd5c 100644 --- a/kernel/power/zsymv_L.S +++ b/kernel/power/zsymv_L.S @@ -100,7 +100,7 @@ #define TEMP r22 #define PREA r24 #define IS r25 - + #define y01 f0 #define y02 f1 #define y03 f2 @@ -1455,7 +1455,7 @@ LL(18): STFD y04, 3 * SIZE(YY) ble LL(11) .align 4 - + LL(20): andi. TEMP, N, 1 ble LL(990) @@ -1505,7 +1505,7 @@ LL(20): STFD y01, 0 * SIZE(YY) STFD y02, 1 * SIZE(YY) .align 4 - + LL(990): cmpwi cr0, INCY, 2 * SIZE beq LL(999) diff --git a/kernel/power/zsymv_U.S b/kernel/power/zsymv_U.S index dbf6ebb1d..4032b66bb 100644 --- a/kernel/power/zsymv_U.S +++ b/kernel/power/zsymv_U.S @@ -295,7 +295,7 @@ li PREA, PREFETCHSIZE_A * SIZE sub IS, M, IS - + cmpwi cr0, M, 0 ble- LL(999) @@ -1393,7 +1393,7 @@ LL(18): STFD y04, 3 * SIZE(YY) ble LL(11) .align 4 - + LL(20): andi. TEMP, M, 1 ble LL(990) @@ -1485,7 +1485,7 @@ LL(28): STFD y01, 0 * SIZE(YY) STFD y02, 1 * SIZE(YY) .align 4 - + LL(990): cmpwi cr0, INCY, 2 * SIZE beq LL(999) diff --git a/kernel/power/ztrsm_kernel_LN.S b/kernel/power/ztrsm_kernel_LN.S index e31a887bc..64fb96823 100644 --- a/kernel/power/ztrsm_kernel_LN.S +++ b/kernel/power/ztrsm_kernel_LN.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -106,7 +106,7 @@ #define PREA r30 #define PREC r31 #define PREB PREA - + #ifndef NEEDPARAM PROLOGUE @@ -1074,7 +1074,7 @@ LL(KERNEL_MainFinish): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -1094,7 +1094,7 @@ LL(KERNEL_MainFinish): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -1431,7 +1431,7 @@ LL(KERNEL_MainFinish): addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif - + #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 diff --git a/kernel/power/ztrsm_kernel_LT.S b/kernel/power/ztrsm_kernel_LT.S index f7153b789..ae4615cf5 100644 --- a/kernel/power/ztrsm_kernel_LT.S +++ b/kernel/power/ztrsm_kernel_LT.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -106,7 +106,7 @@ #define PREA r30 #define PREC r31 #define PREB PREA - + #ifndef NEEDPARAM PROLOGUE @@ -652,7 +652,7 @@ LL(KERNEL_MainFinish): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -672,7 +672,7 @@ LL(KERNEL_MainFinish): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -1009,7 +1009,7 @@ LL(KERNEL_MainFinish): addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif - + #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 diff --git a/kernel/power/ztrsm_kernel_RT.S b/kernel/power/ztrsm_kernel_RT.S index 55bc29b1d..f756dda77 100644 --- a/kernel/power/ztrsm_kernel_RT.S +++ b/kernel/power/ztrsm_kernel_RT.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -106,7 +106,7 @@ #define PREA r30 #define PREC r31 #define PREB PREA - + #ifndef NEEDPARAM PROLOGUE @@ -1049,7 +1049,7 @@ LL(49): .align 4 -LL(30): +LL(30): srawi. J, N, 1 ble LL(999) .align 4 @@ -1402,7 +1402,7 @@ LL(KERNEL_MainFinish): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -1422,7 +1422,7 @@ LL(KERNEL_MainFinish): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -1759,7 +1759,7 @@ LL(KERNEL_MainFinish): addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif - + #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 diff --git a/kernel/power/ztrsm_kernel_cell_LN.S b/kernel/power/ztrsm_kernel_cell_LN.S index c284a0ed7..2427a4ddd 100644 --- a/kernel/power/ztrsm_kernel_cell_LN.S +++ b/kernel/power/ztrsm_kernel_cell_LN.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -106,7 +106,7 @@ #define PREA r30 #define PREC r31 #define PREB PREA - + #ifndef NEEDPARAM #ifndef DOUBLE @@ -1038,7 +1038,7 @@ LL(KERNEL_MainFinish): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -1058,7 +1058,7 @@ LL(KERNEL_MainFinish): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -1395,7 +1395,7 @@ LL(KERNEL_MainFinish): addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif - + #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 diff --git a/kernel/power/ztrsm_kernel_cell_LT.S b/kernel/power/ztrsm_kernel_cell_LT.S index ca8010091..0d88ded9a 100644 --- a/kernel/power/ztrsm_kernel_cell_LT.S +++ b/kernel/power/ztrsm_kernel_cell_LT.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -106,7 +106,7 @@ #define PREA r30 #define PREC r31 #define PREB PREA - + #ifndef NEEDPARAM #ifndef DOUBLE @@ -641,7 +641,7 @@ LL(KERNEL_MainFinish): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -661,7 +661,7 @@ LL(KERNEL_MainFinish): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -998,7 +998,7 @@ LL(KERNEL_MainFinish): addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif - + #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 diff --git a/kernel/power/ztrsm_kernel_cell_RT.S b/kernel/power/ztrsm_kernel_cell_RT.S index f1139fd34..84f2089fa 100644 --- a/kernel/power/ztrsm_kernel_cell_RT.S +++ b/kernel/power/ztrsm_kernel_cell_RT.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -106,7 +106,7 @@ #define PREA r30 #define PREC r31 #define PREB PREA - + #ifndef NEEDPARAM #ifndef DOUBLE @@ -993,7 +993,7 @@ LL(49): .align 4 -LL(30): +LL(30): srawi. J, N, 1 ble LL(999) .align 4 @@ -1362,7 +1362,7 @@ LL(KERNEL_MainFinish): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -1382,7 +1382,7 @@ LL(KERNEL_MainFinish): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -1719,7 +1719,7 @@ LL(KERNEL_MainFinish): addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif - + #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 diff --git a/kernel/power/ztrsm_kernel_hummer_LN.S b/kernel/power/ztrsm_kernel_hummer_LN.S index 9e9697dac..bf3eafa45 100644 --- a/kernel/power/ztrsm_kernel_hummer_LN.S +++ b/kernel/power/ztrsm_kernel_hummer_LN.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #undef ZERO #define ALPHA 0 @@ -73,7 +73,7 @@ #define BO r25 #define AO2 r26 #define BO2 r27 - + #define CO1 r28 #define CO2 r29 #define ZERO r31 @@ -145,7 +145,7 @@ stfpdux f29, SP, r0 stfpdux f30, SP, r0 stfpdux f31, SP, r0 - + stwu r31, -4(SP) stwu r30, -4(SP) stwu r29, -4(SP) @@ -194,7 +194,7 @@ li INCM7, -7 * SIZE addi C, C, - 1 * SIZE - + #ifdef LN mullw r0, M, K slwi r0, r0, ZBASE_SHIFT @@ -255,7 +255,7 @@ li r0, FZERO lfpsx f0, SP, r0 - + andi. I, M, 1 beq .L20 @@ -539,7 +539,7 @@ li r0, FZERO lfpsx f0, SP, r0 .align 4 - + .L20: andi. I, M, 2 beq .L30 diff --git a/kernel/power/ztrsm_kernel_hummer_LT.S b/kernel/power/ztrsm_kernel_hummer_LT.S index 6da6c72ad..865c85f78 100644 --- a/kernel/power/ztrsm_kernel_hummer_LT.S +++ b/kernel/power/ztrsm_kernel_hummer_LT.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #undef ZERO #define ALPHA 0 @@ -73,7 +73,7 @@ #define BO r25 #define AO2 r26 #define BO2 r27 - + #define CO1 r28 #define CO2 r29 #define ZERO r31 @@ -145,7 +145,7 @@ stfpdux f29, SP, r0 stfpdux f30, SP, r0 stfpdux f31, SP, r0 - + stwu r31, -4(SP) stwu r30, -4(SP) stwu r29, -4(SP) @@ -194,7 +194,7 @@ li INCM7, -7 * SIZE addi C, C, - 1 * SIZE - + #ifdef LN mullw r0, M, K slwi r0, r0, ZBASE_SHIFT diff --git a/kernel/power/ztrsm_kernel_hummer_RT.S b/kernel/power/ztrsm_kernel_hummer_RT.S index 8670ceac5..99868f948 100644 --- a/kernel/power/ztrsm_kernel_hummer_RT.S +++ b/kernel/power/ztrsm_kernel_hummer_RT.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #undef ZERO #define ALPHA 0 @@ -73,7 +73,7 @@ #define BO r25 #define AO2 r26 #define BO2 r27 - + #define CO1 r28 #define CO2 r29 #define ZERO r31 @@ -145,7 +145,7 @@ stfpdux f29, SP, r0 stfpdux f30, SP, r0 stfpdux f31, SP, r0 - + stwu r31, -4(SP) stwu r30, -4(SP) stwu r29, -4(SP) @@ -194,7 +194,7 @@ li INCM7, -7 * SIZE addi C, C, - 1 * SIZE - + #ifdef LN mullw r0, M, K slwi r0, r0, ZBASE_SHIFT @@ -1266,7 +1266,7 @@ #endif .align 4 -.L50: +.L50: srawi. J, N, 1 ble .L999 .align 4 diff --git a/kernel/power/ztrsm_kernel_power6_LN.S b/kernel/power/ztrsm_kernel_power6_LN.S index 7a3b28636..42239bb55 100644 --- a/kernel/power/ztrsm_kernel_power6_LN.S +++ b/kernel/power/ztrsm_kernel_power6_LN.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -103,7 +103,7 @@ #define PREA r30 #define PREC r31 - + #ifndef CONJ #define FMA1 FMADD #define FMA2 FMADD @@ -580,7 +580,7 @@ LL(27): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f4, f18, f4 @@ -1049,7 +1049,7 @@ LL(27): addi CO3, CO3, 2 * SIZE addi CO4, CO4, 2 * SIZE #endif - + #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 @@ -1677,7 +1677,7 @@ LL(18): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f4, f18, f4 @@ -1697,7 +1697,7 @@ LL(18): LFD f25, 9 * SIZE(BO) LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) - + FSUB f2, f24, f2 FSUB f3, f25, f3 FSUB f6, f26, f6 @@ -1724,7 +1724,7 @@ LL(18): FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -1739,7 +1739,7 @@ LL(18): LFD f25, 9 * SIZE(AO) LFD f26, 10 * SIZE(AO) LFD f27, 11 * SIZE(AO) - + FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f10, f26, f10 @@ -2574,7 +2574,7 @@ LL(18): addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #endif - + #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 @@ -3505,7 +3505,7 @@ LL(38): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -3525,7 +3525,7 @@ LL(38): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -3862,7 +3862,7 @@ LL(38): addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif - + #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 diff --git a/kernel/power/ztrsm_kernel_power6_LT.S b/kernel/power/ztrsm_kernel_power6_LT.S index b7c34419b..dfae4d60b 100644 --- a/kernel/power/ztrsm_kernel_power6_LT.S +++ b/kernel/power/ztrsm_kernel_power6_LT.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -103,7 +103,7 @@ #define PREA r30 #define PREC r31 - + #ifndef CONJ #define FMA1 FMADD #define FMA2 FMADD @@ -914,7 +914,7 @@ LL(18): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f4, f18, f4 @@ -934,7 +934,7 @@ LL(18): LFD f25, 9 * SIZE(BO) LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) - + FSUB f2, f24, f2 FSUB f3, f25, f3 FSUB f6, f26, f6 @@ -961,7 +961,7 @@ LL(18): FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -976,7 +976,7 @@ LL(18): LFD f25, 9 * SIZE(AO) LFD f26, 10 * SIZE(AO) LFD f27, 11 * SIZE(AO) - + FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f10, f26, f10 @@ -1811,7 +1811,7 @@ LL(18): addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #endif - + #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 @@ -2107,7 +2107,7 @@ LL(27): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f4, f18, f4 @@ -2556,7 +2556,7 @@ LL(27): addi CO3, CO3, 2 * SIZE addi CO4, CO4, 2 * SIZE #endif - + #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 @@ -3060,7 +3060,7 @@ LL(38): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -3080,7 +3080,7 @@ LL(38): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -3417,7 +3417,7 @@ LL(38): addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif - + #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 diff --git a/kernel/power/ztrsm_kernel_power6_RT.S b/kernel/power/ztrsm_kernel_power6_RT.S index 069a73c21..79f8b70b8 100644 --- a/kernel/power/ztrsm_kernel_power6_RT.S +++ b/kernel/power/ztrsm_kernel_power6_RT.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -103,7 +103,7 @@ #define PREA r30 #define PREC r31 - + #ifndef CONJ #define FMA1 FMADD #define FMA2 FMADD @@ -1462,7 +1462,7 @@ LL(38): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -1482,7 +1482,7 @@ LL(38): LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -1819,7 +1819,7 @@ LL(38): addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif - + #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 @@ -2945,7 +2945,7 @@ LL(18): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f4, f18, f4 @@ -2965,7 +2965,7 @@ LL(18): LFD f25, 9 * SIZE(BO) LFD f26, 10 * SIZE(BO) LFD f27, 11 * SIZE(BO) - + FSUB f2, f24, f2 FSUB f3, f25, f3 FSUB f6, f26, f6 @@ -2992,7 +2992,7 @@ LL(18): FSUB f1, f17, f1 FSUB f2, f18, f2 FSUB f3, f19, f3 - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -3007,7 +3007,7 @@ LL(18): LFD f25, 9 * SIZE(AO) LFD f26, 10 * SIZE(AO) LFD f27, 11 * SIZE(AO) - + FSUB f8, f24, f8 FSUB f9, f25, f9 FSUB f10, f26, f10 @@ -3842,7 +3842,7 @@ LL(18): addi CO3, CO3, 4 * SIZE addi CO4, CO4, 4 * SIZE #endif - + #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 @@ -4138,7 +4138,7 @@ LL(27): LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + FSUB f0, f16, f0 FSUB f1, f17, f1 FSUB f4, f18, f4 @@ -4587,7 +4587,7 @@ LL(27): addi CO3, CO3, 2 * SIZE addi CO4, CO4, 2 * SIZE #endif - + #ifdef RT slwi r0, K, 0 + ZBASE_SHIFT add AORIG, AORIG, r0 diff --git a/kernel/power/ztrsm_kernel_ppc440_LN.S b/kernel/power/ztrsm_kernel_ppc440_LN.S index fdcf5beb0..51db71903 100644 --- a/kernel/power/ztrsm_kernel_ppc440_LN.S +++ b/kernel/power/ztrsm_kernel_ppc440_LN.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -1017,7 +1017,7 @@ LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -1037,7 +1037,7 @@ LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -1374,7 +1374,7 @@ addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif - + #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 diff --git a/kernel/power/ztrsm_kernel_ppc440_LT.S b/kernel/power/ztrsm_kernel_ppc440_LT.S index a9c98dd30..b5e23b3c6 100644 --- a/kernel/power/ztrsm_kernel_ppc440_LT.S +++ b/kernel/power/ztrsm_kernel_ppc440_LT.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -595,7 +595,7 @@ LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -615,7 +615,7 @@ LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -952,7 +952,7 @@ addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif - + #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 diff --git a/kernel/power/ztrsm_kernel_ppc440_RT.S b/kernel/power/ztrsm_kernel_ppc440_RT.S index c9b794ef4..2bb374d22 100644 --- a/kernel/power/ztrsm_kernel_ppc440_RT.S +++ b/kernel/power/ztrsm_kernel_ppc440_RT.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #ifndef __64BIT__ #define LOAD lwz #else @@ -975,7 +975,7 @@ .align 4 -.L30: +.L30: srawi. J, N, 1 ble .L999 .align 4 @@ -1325,7 +1325,7 @@ LFD f17, 1 * SIZE(BO) LFD f18, 2 * SIZE(BO) LFD f19, 3 * SIZE(BO) - + LFD f20, 4 * SIZE(BO) LFD f21, 5 * SIZE(BO) LFD f22, 6 * SIZE(BO) @@ -1345,7 +1345,7 @@ LFD f17, 1 * SIZE(AO) LFD f18, 2 * SIZE(AO) LFD f19, 3 * SIZE(AO) - + LFD f20, 4 * SIZE(AO) LFD f21, 5 * SIZE(AO) LFD f22, 6 * SIZE(AO) @@ -1682,7 +1682,7 @@ addi CO1, CO1, 4 * SIZE addi CO2, CO2, 4 * SIZE #endif - + #ifdef RT slwi r0, K, 1 + ZBASE_SHIFT add AORIG, AORIG, r0 diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 5df9a0b75..806c1928c 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -69,9 +69,9 @@ gotoblas_t TABLE_NAME = { sgemv_nTS, sgemv_tTS, sger_kTS, ssymv_LTS, ssymv_UTS, - sgemm_kernelTS, sgemm_betaTS, + sgemm_kernelTS, sgemm_betaTS, #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N - sgemm_incopyTS, sgemm_itcopyTS, + sgemm_incopyTS, sgemm_itcopyTS, #else sgemm_oncopyTS, sgemm_otcopyTS, #endif @@ -97,7 +97,7 @@ gotoblas_t TABLE_NAME = { strmm_ounucopyTS, strmm_ounncopyTS, strmm_outucopyTS, strmm_outncopyTS, strmm_olnucopyTS, strmm_olnncopyTS, strmm_oltucopyTS, strmm_oltncopyTS, #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N - ssymm_iutcopyTS, ssymm_iltcopyTS, + ssymm_iutcopyTS, ssymm_iltcopyTS, #else ssymm_outcopyTS, ssymm_oltcopyTS, #endif @@ -119,9 +119,9 @@ gotoblas_t TABLE_NAME = { dgemv_nTS, dgemv_tTS, dger_kTS, dsymv_LTS, dsymv_UTS, - dgemm_kernelTS, dgemm_betaTS, + dgemm_kernelTS, dgemm_betaTS, #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N - dgemm_incopyTS, dgemm_itcopyTS, + dgemm_incopyTS, dgemm_itcopyTS, #else dgemm_oncopyTS, dgemm_otcopyTS, #endif @@ -147,7 +147,7 @@ gotoblas_t TABLE_NAME = { dtrmm_ounucopyTS, dtrmm_ounncopyTS, dtrmm_outucopyTS, dtrmm_outncopyTS, dtrmm_olnucopyTS, dtrmm_olnncopyTS, dtrmm_oltucopyTS, dtrmm_oltncopyTS, #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N - dsymm_iutcopyTS, dsymm_iltcopyTS, + dsymm_iutcopyTS, dsymm_iltcopyTS, #else dsymm_outcopyTS, dsymm_oltcopyTS, #endif @@ -171,9 +171,9 @@ gotoblas_t TABLE_NAME = { qgemv_nTS, qgemv_tTS, qger_kTS, qsymv_LTS, qsymv_UTS, - qgemm_kernelTS, qgemm_betaTS, + qgemm_kernelTS, qgemm_betaTS, #if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N - qgemm_incopyTS, qgemm_itcopyTS, + qgemm_incopyTS, qgemm_itcopyTS, #else qgemm_oncopyTS, qgemm_otcopyTS, #endif @@ -199,7 +199,7 @@ gotoblas_t TABLE_NAME = { qtrmm_ounucopyTS, qtrmm_ounncopyTS, qtrmm_outucopyTS, qtrmm_outncopyTS, qtrmm_olnucopyTS, qtrmm_olnncopyTS, qtrmm_oltucopyTS, qtrmm_oltncopyTS, #if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N - qsymm_iutcopyTS, qsymm_iltcopyTS, + qsymm_iutcopyTS, qsymm_iltcopyTS, #else qsymm_outcopyTS, qsymm_oltcopyTS, #endif @@ -219,14 +219,14 @@ gotoblas_t TABLE_NAME = { camax_kTS, camin_kTS, icamax_kTS, icamin_kTS, cnrm2_kTS, casum_kTS, ccopy_kTS, cdotu_kTS, cdotc_kTS, csrot_kTS, - caxpy_kTS, caxpyc_kTS, cscal_kTS, cswap_kTS, + caxpy_kTS, caxpyc_kTS, cscal_kTS, cswap_kTS, - cgemv_nTS, cgemv_tTS, cgemv_rTS, cgemv_cTS, - cgemv_oTS, cgemv_uTS, cgemv_sTS, cgemv_dTS, - cgeru_kTS, cgerc_kTS, cgerv_kTS, cgerd_kTS, + cgemv_nTS, cgemv_tTS, cgemv_rTS, cgemv_cTS, + cgemv_oTS, cgemv_uTS, cgemv_sTS, cgemv_dTS, + cgeru_kTS, cgerc_kTS, cgerv_kTS, cgerd_kTS, csymv_LTS, csymv_UTS, chemv_LTS, chemv_UTS, chemv_MTS, chemv_VTS, - + cgemm_kernel_nTS, cgemm_kernel_lTS, cgemm_kernel_rTS, cgemm_kernel_bTS, cgemm_betaTS, @@ -236,10 +236,10 @@ gotoblas_t TABLE_NAME = { cgemm_oncopyTS, cgemm_otcopyTS, #endif cgemm_oncopyTS, cgemm_otcopyTS, - + ctrsm_kernel_LNTS, ctrsm_kernel_LTTS, ctrsm_kernel_LRTS, ctrsm_kernel_LCTS, ctrsm_kernel_RNTS, ctrsm_kernel_RTTS, ctrsm_kernel_RRTS, ctrsm_kernel_RCTS, - + #if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N ctrsm_iunucopyTS, ctrsm_iunncopyTS, ctrsm_iutucopyTS, ctrsm_iutncopyTS, ctrsm_ilnucopyTS, ctrsm_ilnncopyTS, ctrsm_iltucopyTS, ctrsm_iltncopyTS, @@ -249,10 +249,10 @@ gotoblas_t TABLE_NAME = { #endif ctrsm_ounucopyTS, ctrsm_ounncopyTS, ctrsm_outucopyTS, ctrsm_outncopyTS, ctrsm_olnucopyTS, ctrsm_olnncopyTS, ctrsm_oltucopyTS, ctrsm_oltncopyTS, - + ctrmm_kernel_RNTS, ctrmm_kernel_RTTS, ctrmm_kernel_RRTS, ctrmm_kernel_RCTS, ctrmm_kernel_LNTS, ctrmm_kernel_LTTS, ctrmm_kernel_LRTS, ctrmm_kernel_LCTS, - + #if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N ctrmm_iunucopyTS, ctrmm_iunncopyTS, ctrmm_iutucopyTS, ctrmm_iutncopyTS, ctrmm_ilnucopyTS, ctrmm_ilnncopyTS, ctrmm_iltucopyTS, ctrmm_iltncopyTS, @@ -262,7 +262,7 @@ gotoblas_t TABLE_NAME = { #endif ctrmm_ounucopyTS, ctrmm_ounncopyTS, ctrmm_outucopyTS, ctrmm_outncopyTS, ctrmm_olnucopyTS, ctrmm_olnncopyTS, ctrmm_oltucopyTS, ctrmm_oltncopyTS, - + #if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N csymm_iutcopyTS, csymm_iltcopyTS, #else @@ -275,16 +275,16 @@ gotoblas_t TABLE_NAME = { chemm_outcopyTS, chemm_oltcopyTS, #endif chemm_outcopyTS, chemm_oltcopyTS, - + cgemm3m_kernelTS, - + cgemm3m_incopybTS, cgemm3m_incopyrTS, cgemm3m_incopyiTS, cgemm3m_itcopybTS, cgemm3m_itcopyrTS, cgemm3m_itcopyiTS, cgemm3m_oncopybTS, cgemm3m_oncopyrTS, cgemm3m_oncopyiTS, cgemm3m_otcopybTS, cgemm3m_otcopyrTS, cgemm3m_otcopyiTS, - + csymm3m_iucopybTS, csymm3m_ilcopybTS, csymm3m_iucopyrTS, csymm3m_ilcopyrTS, csymm3m_iucopyiTS, csymm3m_ilcopyiTS, @@ -294,7 +294,7 @@ gotoblas_t TABLE_NAME = { chemm3m_iucopybTS, chemm3m_ilcopybTS, chemm3m_iucopyrTS, chemm3m_ilcopyrTS, - chemm3m_iucopyiTS, chemm3m_ilcopyiTS, + chemm3m_iucopyiTS, chemm3m_ilcopyiTS, chemm3m_oucopybTS, chemm3m_olcopybTS, chemm3m_oucopyrTS, chemm3m_olcopyrTS, @@ -312,11 +312,11 @@ gotoblas_t TABLE_NAME = { zamax_kTS, zamin_kTS, izamax_kTS, izamin_kTS, znrm2_kTS, zasum_kTS, zcopy_kTS, zdotu_kTS, zdotc_kTS, zdrot_kTS, - zaxpy_kTS, zaxpyc_kTS, zscal_kTS, zswap_kTS, + zaxpy_kTS, zaxpyc_kTS, zscal_kTS, zswap_kTS, - zgemv_nTS, zgemv_tTS, zgemv_rTS, zgemv_cTS, - zgemv_oTS, zgemv_uTS, zgemv_sTS, zgemv_dTS, - zgeru_kTS, zgerc_kTS, zgerv_kTS, zgerd_kTS, + zgemv_nTS, zgemv_tTS, zgemv_rTS, zgemv_cTS, + zgemv_oTS, zgemv_uTS, zgemv_sTS, zgemv_dTS, + zgeru_kTS, zgerc_kTS, zgerv_kTS, zgerd_kTS, zsymv_LTS, zsymv_UTS, zhemv_LTS, zhemv_UTS, zhemv_MTS, zhemv_VTS, @@ -329,10 +329,10 @@ gotoblas_t TABLE_NAME = { zgemm_oncopyTS, zgemm_otcopyTS, #endif zgemm_oncopyTS, zgemm_otcopyTS, - + ztrsm_kernel_LNTS, ztrsm_kernel_LTTS, ztrsm_kernel_LRTS, ztrsm_kernel_LCTS, ztrsm_kernel_RNTS, ztrsm_kernel_RTTS, ztrsm_kernel_RRTS, ztrsm_kernel_RCTS, - + #if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N ztrsm_iunucopyTS, ztrsm_iunncopyTS, ztrsm_iutucopyTS, ztrsm_iutncopyTS, ztrsm_ilnucopyTS, ztrsm_ilnncopyTS, ztrsm_iltucopyTS, ztrsm_iltncopyTS, @@ -342,10 +342,10 @@ gotoblas_t TABLE_NAME = { #endif ztrsm_ounucopyTS, ztrsm_ounncopyTS, ztrsm_outucopyTS, ztrsm_outncopyTS, ztrsm_olnucopyTS, ztrsm_olnncopyTS, ztrsm_oltucopyTS, ztrsm_oltncopyTS, - + ztrmm_kernel_RNTS, ztrmm_kernel_RTTS, ztrmm_kernel_RRTS, ztrmm_kernel_RCTS, ztrmm_kernel_LNTS, ztrmm_kernel_LTTS, ztrmm_kernel_LRTS, ztrmm_kernel_LCTS, - + #if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N ztrmm_iunucopyTS, ztrmm_iunncopyTS, ztrmm_iutucopyTS, ztrmm_iutncopyTS, ztrmm_ilnucopyTS, ztrmm_ilnncopyTS, ztrmm_iltucopyTS, ztrmm_iltncopyTS, @@ -355,7 +355,7 @@ gotoblas_t TABLE_NAME = { #endif ztrmm_ounucopyTS, ztrmm_ounncopyTS, ztrmm_outucopyTS, ztrmm_outncopyTS, ztrmm_olnucopyTS, ztrmm_olnncopyTS, ztrmm_oltucopyTS, ztrmm_oltncopyTS, - + #if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N zsymm_iutcopyTS, zsymm_iltcopyTS, #else @@ -368,16 +368,16 @@ gotoblas_t TABLE_NAME = { zhemm_outcopyTS, zhemm_oltcopyTS, #endif zhemm_outcopyTS, zhemm_oltcopyTS, - + zgemm3m_kernelTS, - + zgemm3m_incopybTS, zgemm3m_incopyrTS, zgemm3m_incopyiTS, zgemm3m_itcopybTS, zgemm3m_itcopyrTS, zgemm3m_itcopyiTS, zgemm3m_oncopybTS, zgemm3m_oncopyrTS, zgemm3m_oncopyiTS, zgemm3m_otcopybTS, zgemm3m_otcopyrTS, zgemm3m_otcopyiTS, - + zsymm3m_iucopybTS, zsymm3m_ilcopybTS, zsymm3m_iucopyrTS, zsymm3m_ilcopyrTS, zsymm3m_iucopyiTS, zsymm3m_ilcopyiTS, @@ -387,7 +387,7 @@ gotoblas_t TABLE_NAME = { zhemm3m_iucopybTS, zhemm3m_ilcopybTS, zhemm3m_iucopyrTS, zhemm3m_ilcopyrTS, - zhemm3m_iucopyiTS, zhemm3m_ilcopyiTS, + zhemm3m_iucopyiTS, zhemm3m_ilcopyiTS, zhemm3m_oucopybTS, zhemm3m_olcopybTS, zhemm3m_oucopyrTS, zhemm3m_olcopyrTS, @@ -407,11 +407,11 @@ gotoblas_t TABLE_NAME = { xamax_kTS, xamin_kTS, ixamax_kTS, ixamin_kTS, xnrm2_kTS, xasum_kTS, xcopy_kTS, xdotu_kTS, xdotc_kTS, xqrot_kTS, - xaxpy_kTS, xaxpyc_kTS, xscal_kTS, xswap_kTS, + xaxpy_kTS, xaxpyc_kTS, xscal_kTS, xswap_kTS, - xgemv_nTS, xgemv_tTS, xgemv_rTS, xgemv_cTS, - xgemv_oTS, xgemv_uTS, xgemv_sTS, xgemv_dTS, - xgeru_kTS, xgerc_kTS, xgerv_kTS, xgerd_kTS, + xgemv_nTS, xgemv_tTS, xgemv_rTS, xgemv_cTS, + xgemv_oTS, xgemv_uTS, xgemv_sTS, xgemv_dTS, + xgeru_kTS, xgerc_kTS, xgerv_kTS, xgerd_kTS, xsymv_LTS, xsymv_UTS, xhemv_LTS, xhemv_UTS, xhemv_MTS, xhemv_VTS, @@ -424,10 +424,10 @@ gotoblas_t TABLE_NAME = { xgemm_oncopyTS, xgemm_otcopyTS, #endif xgemm_oncopyTS, xgemm_otcopyTS, - + xtrsm_kernel_LNTS, xtrsm_kernel_LTTS, xtrsm_kernel_LRTS, xtrsm_kernel_LCTS, xtrsm_kernel_RNTS, xtrsm_kernel_RTTS, xtrsm_kernel_RRTS, xtrsm_kernel_RCTS, - + #if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N xtrsm_iunucopyTS, xtrsm_iunncopyTS, xtrsm_iutucopyTS, xtrsm_iutncopyTS, xtrsm_ilnucopyTS, xtrsm_ilnncopyTS, xtrsm_iltucopyTS, xtrsm_iltncopyTS, @@ -437,10 +437,10 @@ gotoblas_t TABLE_NAME = { #endif xtrsm_ounucopyTS, xtrsm_ounncopyTS, xtrsm_outucopyTS, xtrsm_outncopyTS, xtrsm_olnucopyTS, xtrsm_olnncopyTS, xtrsm_oltucopyTS, xtrsm_oltncopyTS, - + xtrmm_kernel_RNTS, xtrmm_kernel_RTTS, xtrmm_kernel_RRTS, xtrmm_kernel_RCTS, xtrmm_kernel_LNTS, xtrmm_kernel_LTTS, xtrmm_kernel_LRTS, xtrmm_kernel_LCTS, - + #if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N xtrmm_iunucopyTS, xtrmm_iunncopyTS, xtrmm_iutucopyTS, xtrmm_iutncopyTS, xtrmm_ilnucopyTS, xtrmm_ilnncopyTS, xtrmm_iltucopyTS, xtrmm_iltncopyTS, @@ -450,7 +450,7 @@ gotoblas_t TABLE_NAME = { #endif xtrmm_ounucopyTS, xtrmm_ounncopyTS, xtrmm_outucopyTS, xtrmm_outncopyTS, xtrmm_olnucopyTS, xtrmm_olnncopyTS, xtrmm_oltucopyTS, xtrmm_oltncopyTS, - + #if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N xsymm_iutcopyTS, xsymm_iltcopyTS, #else @@ -463,16 +463,16 @@ gotoblas_t TABLE_NAME = { xhemm_outcopyTS, xhemm_oltcopyTS, #endif xhemm_outcopyTS, xhemm_oltcopyTS, - + xgemm3m_kernelTS, - + xgemm3m_incopybTS, xgemm3m_incopyrTS, xgemm3m_incopyiTS, xgemm3m_itcopybTS, xgemm3m_itcopyrTS, xgemm3m_itcopyiTS, xgemm3m_oncopybTS, xgemm3m_oncopyrTS, xgemm3m_oncopyiTS, xgemm3m_otcopybTS, xgemm3m_otcopyrTS, xgemm3m_otcopyiTS, - + xsymm3m_iucopybTS, xsymm3m_ilcopybTS, xsymm3m_iucopyrTS, xsymm3m_ilcopyrTS, xsymm3m_iucopyiTS, xsymm3m_ilcopyiTS, @@ -482,7 +482,7 @@ gotoblas_t TABLE_NAME = { xhemm3m_iucopybTS, xhemm3m_ilcopybTS, xhemm3m_iucopyrTS, xhemm3m_ilcopyrTS, - xhemm3m_iucopyiTS, xhemm3m_ilcopyiTS, + xhemm3m_iucopyiTS, xhemm3m_ilcopyiTS, xhemm3m_oucopybTS, xhemm3m_olcopybTS, xhemm3m_oucopyrTS, xhemm3m_olcopyrTS, @@ -518,45 +518,45 @@ static int get_l2_size_old(void){ int info[15]; cpuid(2, &eax, &ebx, &ecx, &edx); - + info[ 0] = BITMASK(eax, 8, 0xff); info[ 1] = BITMASK(eax, 16, 0xff); info[ 2] = BITMASK(eax, 24, 0xff); - + info[ 3] = BITMASK(ebx, 0, 0xff); info[ 4] = BITMASK(ebx, 8, 0xff); info[ 5] = BITMASK(ebx, 16, 0xff); info[ 6] = BITMASK(ebx, 24, 0xff); - + info[ 7] = BITMASK(ecx, 0, 0xff); info[ 8] = BITMASK(ecx, 8, 0xff); info[ 9] = BITMASK(ecx, 16, 0xff); info[10] = BITMASK(ecx, 24, 0xff); - + info[11] = BITMASK(edx, 0, 0xff); info[12] = BITMASK(edx, 8, 0xff); info[13] = BITMASK(edx, 16, 0xff); info[14] = BITMASK(edx, 24, 0xff); - + for (i = 0; i < 15; i++){ - + switch (info[i]){ - + /* This table is from http://www.sandpile.org/ia32/cpuid.htm */ - + case 0x1a : return 96; - + case 0x39 : case 0x3b : case 0x41 : case 0x79 : case 0x81 : return 128; - + case 0x3a : return 192; - + case 0x21 : case 0x3c : case 0x42 : @@ -564,10 +564,10 @@ static int get_l2_size_old(void){ case 0x7e : case 0x82 : return 256; - + case 0x3d : return 384; - + case 0x3e : case 0x43 : case 0x7b : @@ -575,14 +575,14 @@ static int get_l2_size_old(void){ case 0x83 : case 0x86 : return 512; - + case 0x44 : case 0x78 : case 0x7c : case 0x84 : case 0x87 : return 1024; - + case 0x45 : case 0x7d : case 0x85 : @@ -590,10 +590,10 @@ static int get_l2_size_old(void){ case 0x48 : return 3184; - + case 0x49 : return 4096; - + case 0x4e : return 6144; } @@ -899,37 +899,37 @@ static void init_parameter(void) { fprintf(stderr, "L2 = %8d DGEMM_P .. %d\n", l2, TABLE_NAME.dgemm_p); #endif - TABLE_NAME.sgemm_r = (((BUFFER_SIZE - - ((TABLE_NAME.sgemm_p * TABLE_NAME.sgemm_q * 4 + TABLE_NAME.offsetA + TABLE_NAME.sgemm_r = (((BUFFER_SIZE - + ((TABLE_NAME.sgemm_p * TABLE_NAME.sgemm_q * 4 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.sgemm_q * 4) - 15) & ~15); - TABLE_NAME.dgemm_r = (((BUFFER_SIZE - - ((TABLE_NAME.dgemm_p * TABLE_NAME.dgemm_q * 8 + TABLE_NAME.offsetA + TABLE_NAME.dgemm_r = (((BUFFER_SIZE - + ((TABLE_NAME.dgemm_p * TABLE_NAME.dgemm_q * 8 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.dgemm_q * 8) - 15) & ~15); #ifdef EXPRECISION - TABLE_NAME.qgemm_r = (((BUFFER_SIZE - - ((TABLE_NAME.qgemm_p * TABLE_NAME.qgemm_q * 16 + TABLE_NAME.offsetA + TABLE_NAME.qgemm_r = (((BUFFER_SIZE - + ((TABLE_NAME.qgemm_p * TABLE_NAME.qgemm_q * 16 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.qgemm_q * 16) - 15) & ~15); #endif - TABLE_NAME.cgemm_r = (((BUFFER_SIZE - - ((TABLE_NAME.cgemm_p * TABLE_NAME.cgemm_q * 8 + TABLE_NAME.offsetA + TABLE_NAME.cgemm_r = (((BUFFER_SIZE - + ((TABLE_NAME.cgemm_p * TABLE_NAME.cgemm_q * 8 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.cgemm_q * 8) - 15) & ~15); - TABLE_NAME.zgemm_r = (((BUFFER_SIZE - - ((TABLE_NAME.zgemm_p * TABLE_NAME.zgemm_q * 16 + TABLE_NAME.offsetA + TABLE_NAME.zgemm_r = (((BUFFER_SIZE - + ((TABLE_NAME.zgemm_p * TABLE_NAME.zgemm_q * 16 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.zgemm_q * 16) - 15) & ~15); #ifdef EXPRECISION - TABLE_NAME.xgemm_r = (((BUFFER_SIZE - - ((TABLE_NAME.xgemm_p * TABLE_NAME.xgemm_q * 32 + TABLE_NAME.offsetA - + TABLE_NAME.align) & ~TABLE_NAME.align) + TABLE_NAME.xgemm_r = (((BUFFER_SIZE - + ((TABLE_NAME.xgemm_p * TABLE_NAME.xgemm_q * 32 + TABLE_NAME.offsetA + + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.xgemm_q * 32) - 15) & ~15); #endif diff --git a/kernel/sparc/KERNEL.sparc b/kernel/sparc/KERNEL.sparc index fb6cc2b75..2e8319ce5 100644 --- a/kernel/sparc/KERNEL.sparc +++ b/kernel/sparc/KERNEL.sparc @@ -5,8 +5,8 @@ SGEMMONCOPY = gemm_ncopy.S SGEMMOTCOPY = gemm_tcopy.S SGEMMINCOPYOBJ = SGEMMITCOPYOBJ = -SGEMMONCOPYOBJ = sgemm_oncopy.$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy.$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy.$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy.$(SUFFIX) DGEMMKERNEL = gemm_kernel.S DGEMMINCOPY = DGEMMITCOPY = diff --git a/kernel/sparc/axpy.S b/kernel/sparc/axpy.S index 997f9e099..2ada917c8 100644 --- a/kernel/sparc/axpy.S +++ b/kernel/sparc/axpy.S @@ -499,5 +499,5 @@ .LL59: return %i7 + 8 clr %o0 - + EPILOGUE diff --git a/kernel/sparc/cabs.S b/kernel/sparc/cabs.S index 119293e98..d186faba6 100644 --- a/kernel/sparc/cabs.S +++ b/kernel/sparc/cabs.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + PROLOGUE add %sp, -128, %sp diff --git a/kernel/sparc/dnrm2.S b/kernel/sparc/dnrm2.S index 8063e23da..04810f652 100644 --- a/kernel/sparc/dnrm2.S +++ b/kernel/sparc/dnrm2.S @@ -258,7 +258,7 @@ FCMP c1, fzero fbe .LL99 nop - + FMOV c1, fmax add N, 1, N FDIV fone, c1, fone diff --git a/kernel/sparc/dot.S b/kernel/sparc/dot.S index f89d5f95e..103f0872a 100644 --- a/kernel/sparc/dot.S +++ b/kernel/sparc/dot.S @@ -108,11 +108,11 @@ FCLR(4) FCLR(5) #endif - + cmp N, 0 ble .LL19 nop - + sll INCX, BASE_SHIFT, INCX sll INCY, BASE_SHIFT, INCY @@ -257,7 +257,7 @@ return %i7 + 8 nop - + .LL50: sra N, 3, I cmp I, 0 diff --git a/kernel/sparc/gemm_kernel_2x8.S b/kernel/sparc/gemm_kernel_2x8.S index c0d257aa0..3d94476da 100644 --- a/kernel/sparc/gemm_kernel_2x8.S +++ b/kernel/sparc/gemm_kernel_2x8.S @@ -1140,7 +1140,7 @@ cmp I, 0 ble,pn %icc, .LL29 nop - + #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) mov B, BO #else @@ -1414,7 +1414,7 @@ mov BO, B .align 4 -.LL30: +.LL30: and N, 4, J cmp J, 0 ble,pn %icc, .LL50 diff --git a/kernel/sparc/gemv_n.S b/kernel/sparc/gemv_n.S index 649ef1617..640a96bd3 100644 --- a/kernel/sparc/gemv_n.S +++ b/kernel/sparc/gemv_n.S @@ -46,12 +46,12 @@ #define A %i5 #define LDA %i2 #define X %i3 -#define INCX %i4 +#define INCX %i4 #else #define A %i4 #define LDA %i5 #define X %i2 -#define INCX %i3 +#define INCX %i3 #endif #define Y %l0 @@ -234,7 +234,7 @@ STF FZERO, [Y1 + 7 * SIZE] bg,pn %icc, .LL01 add Y1, 8 * SIZE, Y1 - + .LL10: sra N, 2, J cmp J, 0 @@ -1314,7 +1314,7 @@ add Y1, INCY, Y1 STF y8, [Y1] add Y1, INCY, Y1 - + deccc I bg,pn %icc, .LL991 add BUFFER, 8 * SIZE, BUFFER @@ -1356,7 +1356,7 @@ add Y1, INCY, Y1 STF y4, [Y1] add Y1, INCY, Y1 - + .LL996: andcc M, 2, I ble,pn %icc, .LL997 @@ -1378,7 +1378,7 @@ add Y1, INCY, Y1 STF y2, [Y1] add Y1, INCY, Y1 - + .LL997: andcc M, 1, I ble,pn %icc, .LL999 diff --git a/kernel/sparc/gemv_t.S b/kernel/sparc/gemv_t.S index fad006ade..fc001e4e6 100644 --- a/kernel/sparc/gemv_t.S +++ b/kernel/sparc/gemv_t.S @@ -48,12 +48,12 @@ #define A %i5 #define LDA %i2 #define X %i3 -#define INCX %i4 +#define INCX %i4 #else #define A %i4 #define LDA %i5 #define X %i2 -#define INCX %i3 +#define INCX %i3 #endif #define Y %l0 @@ -218,7 +218,7 @@ #else FCLR(30) #endif - + clr IS mov P, I sll LDA, BASE_SHIFT, LDA @@ -697,7 +697,7 @@ cmp IS, M bl %icc, .LL10 add A, PNLDA, A - + .LL999: return %i7 + 8 clr %o0 diff --git a/kernel/sparc/ger.S b/kernel/sparc/ger.S index 84cd525c4..70b5e2231 100644 --- a/kernel/sparc/ger.S +++ b/kernel/sparc/ger.S @@ -46,12 +46,12 @@ #define X %i5 #define INCX %i2 #define Y %i3 -#define INCY %i4 +#define INCY %i4 #else #define X %i4 #define INCX %i5 #define Y %i2 -#define INCY %i3 +#define INCY %i3 #endif #define A %l0 @@ -251,7 +251,7 @@ deccc J bg,pn %icc, .LL06 nop - + .LL10: mov N, J cmp N, 0 diff --git a/kernel/sparc/imax.S b/kernel/sparc/imax.S index c24e18252..1a2b9c51c 100644 --- a/kernel/sparc/imax.S +++ b/kernel/sparc/imax.S @@ -149,7 +149,7 @@ add I, -1, I cmp I, 0 ble,pt %icc, .LL12 - nop + nop #define PREFETCHSIZE 40 diff --git a/kernel/sparc/lsame.S b/kernel/sparc/lsame.S index 778301fab..c00b565a5 100644 --- a/kernel/sparc/lsame.S +++ b/kernel/sparc/lsame.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define A %o0 #define B %o1 #define AA %o4 diff --git a/kernel/sparc/max.S b/kernel/sparc/max.S index 1a4bc4463..252bc1921 100644 --- a/kernel/sparc/max.S +++ b/kernel/sparc/max.S @@ -135,7 +135,7 @@ add I, -1, I cmp I, 0 ble,pt %icc, .LL12 - nop + nop #define PREFETCHSIZE 40 diff --git a/kernel/sparc/rot.S b/kernel/sparc/rot.S index f5c577047..40e26e9c2 100644 --- a/kernel/sparc/rot.S +++ b/kernel/sparc/rot.S @@ -150,7 +150,7 @@ cmp N, 0 ble .LL19 nop - + sll INCX, BASE_SHIFT, INCX sll INCY, BASE_SHIFT, INCY @@ -480,7 +480,7 @@ .LL19: return %i7 + 8 nop - + .LL50: mov X, XX mov Y, YY diff --git a/kernel/sparc/scal.S b/kernel/sparc/scal.S index 1414a0930..36d9ce2ab 100644 --- a/kernel/sparc/scal.S +++ b/kernel/sparc/scal.S @@ -119,7 +119,7 @@ #endif FCLR(29) - + FCMP ALPHA, FZERO fbne .LL100 sll INCX, BASE_SHIFT, INCX diff --git a/kernel/sparc/swap.S b/kernel/sparc/swap.S index 1d7950cd8..580eff7bb 100644 --- a/kernel/sparc/swap.S +++ b/kernel/sparc/swap.S @@ -116,7 +116,7 @@ ldx [%sp+ STACK_START + 56], Y ldx [%sp+ STACK_START + 64], INCY #endif - + sll INCX, BASE_SHIFT, INCX sll INCY, BASE_SHIFT, INCY diff --git a/kernel/sparc/trsm_kernel_LN_2x8.S b/kernel/sparc/trsm_kernel_LN_2x8.S index a70f0e428..16e352d18 100644 --- a/kernel/sparc/trsm_kernel_LN_2x8.S +++ b/kernel/sparc/trsm_kernel_LN_2x8.S @@ -2106,7 +2106,7 @@ nop .align 4 -.LL30: +.LL30: and N, 4, J cmp J, 0 ble,pn %icc, .LL50 diff --git a/kernel/sparc/trsm_kernel_LT_2x8.S b/kernel/sparc/trsm_kernel_LT_2x8.S index 39015d72e..425a4787f 100644 --- a/kernel/sparc/trsm_kernel_LT_2x8.S +++ b/kernel/sparc/trsm_kernel_LT_2x8.S @@ -2105,7 +2105,7 @@ nop .align 4 -.LL30: +.LL30: and N, 4, J cmp J, 0 ble,pn %icc, .LL50 diff --git a/kernel/sparc/trsm_kernel_RT.S b/kernel/sparc/trsm_kernel_RT.S index 3e1a2b90a..eaa6fb2b3 100644 --- a/kernel/sparc/trsm_kernel_RT.S +++ b/kernel/sparc/trsm_kernel_RT.S @@ -2211,7 +2211,7 @@ sub KK, 2, KK #endif -.LL200: +.LL200: sra N, 2, J cmp J, 0 ble,pn %icc, .LL999 diff --git a/kernel/sparc/trsm_kernel_RT_2x8.S b/kernel/sparc/trsm_kernel_RT_2x8.S index c9f68abc0..a030741bc 100644 --- a/kernel/sparc/trsm_kernel_RT_2x8.S +++ b/kernel/sparc/trsm_kernel_RT_2x8.S @@ -217,7 +217,7 @@ .register %g2, #scratch .register %g3, #scratch - + PROLOGUE SAVESP nop diff --git a/kernel/sparc/zamax.S b/kernel/sparc/zamax.S index b156c5a24..ac0966fbb 100644 --- a/kernel/sparc/zamax.S +++ b/kernel/sparc/zamax.S @@ -104,7 +104,7 @@ cmp N, 0 ble .LL20 nop - + cmp INCX, 0 ble .LL20 sll INCX, ZBASE_SHIFT, INCX diff --git a/kernel/sparc/zasum.S b/kernel/sparc/zasum.S index 53bd3c0b0..580b689c3 100644 --- a/kernel/sparc/zasum.S +++ b/kernel/sparc/zasum.S @@ -84,7 +84,7 @@ FCLR(0) sll INCX, ZBASE_SHIFT, INCX - + FMOV c1, c2 FMOV c1, t1 FMOV c1, t2 diff --git a/kernel/sparc/zgemm_kernel.S b/kernel/sparc/zgemm_kernel.S index b02c942e3..444d3a69a 100644 --- a/kernel/sparc/zgemm_kernel.S +++ b/kernel/sparc/zgemm_kernel.S @@ -171,7 +171,7 @@ PROLOGUE SAVESP - + #ifndef __64BIT__ #ifdef DOUBLE #define STACK_ALPHA [%sp + STACK_START + 24] @@ -239,7 +239,7 @@ #else FCLR(29) #endif - + #if defined(TRMMKERNEL) && !defined(LEFT) neg OFFSET, KK #endif diff --git a/kernel/sparc/zgemm_kernel_1x4.S b/kernel/sparc/zgemm_kernel_1x4.S index 03397fd5c..8b0c8bd84 100644 --- a/kernel/sparc/zgemm_kernel_1x4.S +++ b/kernel/sparc/zgemm_kernel_1x4.S @@ -239,10 +239,10 @@ .register %g2, #scratch .register %g3, #scratch - + PROLOGUE SAVESP - + #ifndef __64BIT__ #ifdef DOUBLE st %i3, [%sp + STACK_START + 16] @@ -1123,7 +1123,7 @@ mov BO, B .align 4 -.LL20: +.LL20: and N, 2, J cmp J, 0 ble,pn %icc, .LL30 diff --git a/kernel/sparc/zgemv_n.S b/kernel/sparc/zgemv_n.S index 46ff43888..3d4ce635f 100644 --- a/kernel/sparc/zgemv_n.S +++ b/kernel/sparc/zgemv_n.S @@ -50,7 +50,7 @@ #define A %i5 #define LDA %i2 #define X %i3 -#define INCX %i4 +#define INCX %i4 #define Y %l0 #define INCY %l1 @@ -255,7 +255,7 @@ STF FZERO, [Y1 + 7 * SIZE] bg,pn %icc, .LL01 add Y1, 8 * SIZE, Y1 - + .LL20: sra N, 1, J cmp J, 0 @@ -1152,7 +1152,7 @@ add Y1, INCY, Y1 add BUFFER, 4 * SIZE, BUFFER - + .LL996: andcc M, 1, I ble,pn %icc, .LL999 diff --git a/kernel/sparc/zgemv_t.S b/kernel/sparc/zgemv_t.S index 2b4a64cad..0007a30d0 100644 --- a/kernel/sparc/zgemv_t.S +++ b/kernel/sparc/zgemv_t.S @@ -46,7 +46,7 @@ #define A %i5 #define LDA %i2 #define X %i3 -#define INCX %i4 +#define INCX %i4 #define Y %l0 #define INCY %l1 @@ -1500,7 +1500,7 @@ add Y2, INCY, Y2 STF a3, [Y2 + 0 * SIZE] STF a4, [Y2 + 1 * SIZE] - + .LL300: andcc N, 1, J FCLR(0) @@ -1729,7 +1729,7 @@ cmp IS, M bl %icc, .LL10 add A, PNLDA, A - + .LL999: return %i7 + 8 clr %o0 diff --git a/kernel/sparc/znrm2.S b/kernel/sparc/znrm2.S index 28e9e074d..065d22784 100644 --- a/kernel/sparc/znrm2.S +++ b/kernel/sparc/znrm2.S @@ -255,7 +255,7 @@ FCMP c1, fzero fbe .LL99 nop - + FMOV c1, fmax FDIV fone, c1, fone diff --git a/kernel/sparc/zrot.S b/kernel/sparc/zrot.S index ec274ca16..a8609fe78 100644 --- a/kernel/sparc/zrot.S +++ b/kernel/sparc/zrot.S @@ -149,7 +149,7 @@ cmp N, 0 ble .LL19 nop - + sll INCX, ZBASE_SHIFT, INCX sll INCY, ZBASE_SHIFT, INCY @@ -490,7 +490,7 @@ .LL19: return %i7 + 8 nop - + .LL50: mov X, XX mov Y, YY diff --git a/kernel/sparc/zscal.S b/kernel/sparc/zscal.S index 5c6ade382..46bb6b2f6 100644 --- a/kernel/sparc/zscal.S +++ b/kernel/sparc/zscal.S @@ -170,7 +170,7 @@ #else FCLR(24) #endif - + FCMP ALPHA_R, FZERO fbne .LL100 sll INCX, ZBASE_SHIFT, INCX diff --git a/kernel/sparc/zswap.S b/kernel/sparc/zswap.S index 88ed22169..70360d652 100644 --- a/kernel/sparc/zswap.S +++ b/kernel/sparc/zswap.S @@ -119,7 +119,7 @@ ldx [%sp + STACK_START + 64], Y ldx [%sp + STACK_START + 72], INCY #endif - + sll INCX, ZBASE_SHIFT, INCX sll INCY, ZBASE_SHIFT, INCY diff --git a/kernel/sparc/ztrsm_kernel_LN.S b/kernel/sparc/ztrsm_kernel_LN.S index 131284e8e..8d6f5e791 100644 --- a/kernel/sparc/ztrsm_kernel_LN.S +++ b/kernel/sparc/ztrsm_kernel_LN.S @@ -172,7 +172,7 @@ PROLOGUE SAVESP - + #ifndef __64BIT__ #ifdef DOUBLE ld [%sp + STACK_START + 32], A @@ -264,7 +264,7 @@ cmp I, 0 ble,pn %icc, .LL50 nop - + #if defined(LT) || defined(RN) sra KK, 2, L @@ -1094,7 +1094,7 @@ cmp L, 0 ble,pn %icc, .LL29 nop - + .LL26: FADD2 c04, t1, c04 LDF [AO + 3 * SIZE], a4 diff --git a/kernel/sparc/ztrsm_kernel_LT.S b/kernel/sparc/ztrsm_kernel_LT.S index 2a8569850..cfd1c8cad 100644 --- a/kernel/sparc/ztrsm_kernel_LT.S +++ b/kernel/sparc/ztrsm_kernel_LT.S @@ -172,7 +172,7 @@ PROLOGUE SAVESP - + #ifndef __64BIT__ #ifdef DOUBLE ld [%sp + STACK_START + 32], A @@ -668,7 +668,7 @@ cmp L, 0 ble,pn %icc, .LL29 nop - + .LL26: FADD2 c04, t1, c04 LDF [AO + 3 * SIZE], a4 @@ -1130,7 +1130,7 @@ FMOV FZERO, t1 ble,pn %icc, .LL99 FMOV FZERO, c04 - + #if defined(LT) || defined(RN) sra KK, 2, L diff --git a/kernel/sparc/ztrsm_kernel_LT_1x4.S b/kernel/sparc/ztrsm_kernel_LT_1x4.S index f7d9e38ed..8b6314368 100644 --- a/kernel/sparc/ztrsm_kernel_LT_1x4.S +++ b/kernel/sparc/ztrsm_kernel_LT_1x4.S @@ -66,7 +66,7 @@ #define TEMP1 %l6 #define TEMP2 %l7 #define AORIG %o7 - + #ifdef DOUBLE #define c01 %f0 #define c02 %f2 @@ -223,10 +223,10 @@ .register %g2, #scratch .register %g3, #scratch - + PROLOGUE SAVESP - + #ifndef __64BIT__ #ifdef DOUBLE ld [%sp + STACK_START + 32], A @@ -1429,7 +1429,7 @@ nop .align 4 -.LL20: +.LL20: and N, 2, J cmp J, 0 ble,pn %icc, .LL30 diff --git a/kernel/sparc/ztrsm_kernel_RT.S b/kernel/sparc/ztrsm_kernel_RT.S index 2949e4843..5b36b58a2 100644 --- a/kernel/sparc/ztrsm_kernel_RT.S +++ b/kernel/sparc/ztrsm_kernel_RT.S @@ -172,7 +172,7 @@ PROLOGUE SAVESP - + #ifndef __64BIT__ #ifdef DOUBLE ld [%sp + STACK_START + 32], A @@ -1483,7 +1483,7 @@ cmp L, 0 ble,pn %icc, .LL29 nop - + .LL26: FADD2 c04, t1, c04 LDF [AO + 3 * SIZE], a4 @@ -1945,7 +1945,7 @@ FMOV FZERO, t1 ble,pn %icc, .LL99 FMOV FZERO, c04 - + #if defined(LT) || defined(RN) sra KK, 2, L diff --git a/kernel/sparc/ztrsm_kernel_RT_1x4.S b/kernel/sparc/ztrsm_kernel_RT_1x4.S index 49d449ab9..668974bcd 100644 --- a/kernel/sparc/ztrsm_kernel_RT_1x4.S +++ b/kernel/sparc/ztrsm_kernel_RT_1x4.S @@ -66,7 +66,7 @@ #define TEMP1 %l6 #define TEMP2 %l7 #define AORIG %o7 - + #ifdef DOUBLE #define c01 %f0 #define c02 %f2 @@ -224,10 +224,10 @@ .register %g2, #scratch .register %g3, #scratch - + PROLOGUE SAVESP - + #ifndef __64BIT__ #ifdef DOUBLE ld [%sp + STACK_START + 32], A diff --git a/kernel/x86/KERNEL.ATOM b/kernel/x86/KERNEL.ATOM index b0f673350..7897cb940 100644 --- a/kernel/x86/KERNEL.ATOM +++ b/kernel/x86/KERNEL.ATOM @@ -1,10 +1,10 @@ SGEMMKERNEL = gemm_kernel_4x4_penryn.S -SGEMMINCOPY = -SGEMMITCOPY = +SGEMMINCOPY = +SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = -SGEMMITCOPYOBJ = +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_2x2_atom.S diff --git a/kernel/x86/KERNEL.BARCELONA b/kernel/x86/KERNEL.BARCELONA index 231350a62..d984f8fa0 100644 --- a/kernel/x86/KERNEL.BARCELONA +++ b/kernel/x86/KERNEL.BARCELONA @@ -1,10 +1,10 @@ SGEMMKERNEL = gemm_kernel_4x4_barcelona.S -SGEMMINCOPY = -SGEMMITCOPY = +SGEMMINCOPY = +SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = -SGEMMITCOPYOBJ = +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_2x4_barcelona.S @@ -17,12 +17,12 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S -CGEMMINCOPY = -CGEMMITCOPY = +CGEMMINCOPY = +CGEMMITCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = -CGEMMITCOPYOBJ = +CGEMMITCOPYOBJ = CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S diff --git a/kernel/x86/KERNEL.BOBCAT b/kernel/x86/KERNEL.BOBCAT index 231350a62..d984f8fa0 100644 --- a/kernel/x86/KERNEL.BOBCAT +++ b/kernel/x86/KERNEL.BOBCAT @@ -1,10 +1,10 @@ SGEMMKERNEL = gemm_kernel_4x4_barcelona.S -SGEMMINCOPY = -SGEMMITCOPY = +SGEMMINCOPY = +SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = -SGEMMITCOPYOBJ = +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_2x4_barcelona.S @@ -17,12 +17,12 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S -CGEMMINCOPY = -CGEMMITCOPY = +CGEMMINCOPY = +CGEMMITCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = -CGEMMITCOPYOBJ = +CGEMMITCOPYOBJ = CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S diff --git a/kernel/x86/KERNEL.BULLDOZER b/kernel/x86/KERNEL.BULLDOZER index 231350a62..d984f8fa0 100644 --- a/kernel/x86/KERNEL.BULLDOZER +++ b/kernel/x86/KERNEL.BULLDOZER @@ -1,10 +1,10 @@ SGEMMKERNEL = gemm_kernel_4x4_barcelona.S -SGEMMINCOPY = -SGEMMITCOPY = +SGEMMINCOPY = +SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = -SGEMMITCOPYOBJ = +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_2x4_barcelona.S @@ -17,12 +17,12 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S -CGEMMINCOPY = -CGEMMITCOPY = +CGEMMINCOPY = +CGEMMITCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = -CGEMMITCOPYOBJ = +CGEMMITCOPYOBJ = CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S diff --git a/kernel/x86/KERNEL.DUNNINGTON b/kernel/x86/KERNEL.DUNNINGTON index 08e35438f..f2b0f9611 100644 --- a/kernel/x86/KERNEL.DUNNINGTON +++ b/kernel/x86/KERNEL.DUNNINGTON @@ -1,10 +1,10 @@ SGEMMKERNEL = gemm_kernel_4x4_penryn.S -SGEMMINCOPY = -SGEMMITCOPY = +SGEMMINCOPY = +SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = -SGEMMITCOPYOBJ = +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_2x4_penryn.S diff --git a/kernel/x86/KERNEL.OPTERON b/kernel/x86/KERNEL.OPTERON index 7b8b1373f..c065bf784 100644 --- a/kernel/x86/KERNEL.OPTERON +++ b/kernel/x86/KERNEL.OPTERON @@ -1,10 +1,10 @@ SGEMMKERNEL = gemm_kernel_4x4_sse.S -SGEMMINCOPY = -SGEMMITCOPY = +SGEMMINCOPY = +SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = -SGEMMITCOPYOBJ = +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_2x4_sse2.S @@ -17,12 +17,12 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_2x2_sse.S -CGEMMINCOPY = -CGEMMITCOPY = +CGEMMINCOPY = +CGEMMITCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = -CGEMMITCOPYOBJ = +CGEMMITCOPYOBJ = CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_1x2_sse2.S diff --git a/kernel/x86/KERNEL.PENRYN b/kernel/x86/KERNEL.PENRYN index 08e35438f..f2b0f9611 100644 --- a/kernel/x86/KERNEL.PENRYN +++ b/kernel/x86/KERNEL.PENRYN @@ -1,10 +1,10 @@ SGEMMKERNEL = gemm_kernel_4x4_penryn.S -SGEMMINCOPY = -SGEMMITCOPY = +SGEMMINCOPY = +SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = -SGEMMITCOPYOBJ = +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_2x4_penryn.S diff --git a/kernel/x86/KERNEL.PILEDRIVER b/kernel/x86/KERNEL.PILEDRIVER index 231350a62..d984f8fa0 100644 --- a/kernel/x86/KERNEL.PILEDRIVER +++ b/kernel/x86/KERNEL.PILEDRIVER @@ -1,10 +1,10 @@ SGEMMKERNEL = gemm_kernel_4x4_barcelona.S -SGEMMINCOPY = -SGEMMITCOPY = +SGEMMINCOPY = +SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = -SGEMMITCOPYOBJ = +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_2x4_barcelona.S @@ -17,12 +17,12 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S -CGEMMINCOPY = -CGEMMITCOPY = +CGEMMINCOPY = +CGEMMITCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = -CGEMMITCOPYOBJ = +CGEMMITCOPYOBJ = CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S diff --git a/kernel/x86/KERNEL.PRESCOTT b/kernel/x86/KERNEL.PRESCOTT index 355e00fcf..b8e1e7502 100644 --- a/kernel/x86/KERNEL.PRESCOTT +++ b/kernel/x86/KERNEL.PRESCOTT @@ -1,10 +1,10 @@ SGEMMKERNEL = gemm_kernel_4x4_sse3.S -SGEMMINCOPY = -SGEMMITCOPY = +SGEMMINCOPY = +SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = -SGEMMITCOPYOBJ = +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_2x4_sse3.S @@ -17,12 +17,12 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_2x2_sse3.S -CGEMMINCOPY = -CGEMMITCOPY = +CGEMMINCOPY = +CGEMMITCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = -CGEMMITCOPYOBJ = +CGEMMITCOPYOBJ = CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_1x2_sse3.S diff --git a/kernel/x86/KERNEL.YONAH b/kernel/x86/KERNEL.YONAH index 5b3ecaea6..5149f67ab 100644 --- a/kernel/x86/KERNEL.YONAH +++ b/kernel/x86/KERNEL.YONAH @@ -1,10 +1,10 @@ SGEMMKERNEL = gemm_kernel_4x4_sse3.S -SGEMMINCOPY = -SGEMMITCOPY = +SGEMMINCOPY = +SGEMMITCOPY = SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = -SGEMMITCOPYOBJ = +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_2x4_sse3.S @@ -17,12 +17,12 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = zgemm_kernel_2x2_sse3.S -CGEMMINCOPY = -CGEMMITCOPY = +CGEMMINCOPY = +CGEMMITCOPY = CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c CGEMMINCOPYOBJ = -CGEMMITCOPYOBJ = +CGEMMITCOPYOBJ = CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_1x2_sse3.S diff --git a/kernel/x86/amax.S b/kernel/x86/amax.S index 01c2bd60e..2a3404c18 100644 --- a/kernel/x86/amax.S +++ b/kernel/x86/amax.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define STACK 8 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) @@ -92,7 +92,7 @@ FLD (X) #ifdef USE_ABS - fabs + fabs #endif addl INCX, X decl M @@ -105,7 +105,7 @@ sarl $3, I jle .L20 ALIGN_4 - + .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -113,7 +113,7 @@ FLD 0 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -121,7 +121,7 @@ FLD 1 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -129,7 +129,7 @@ FLD 2 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -137,7 +137,7 @@ FLD 3 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -145,7 +145,7 @@ FLD 4 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -153,7 +153,7 @@ FLD 5 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -161,7 +161,7 @@ FLD 6 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -169,7 +169,7 @@ FLD 7 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -191,7 +191,7 @@ .L21: FLD 0 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -208,12 +208,12 @@ sarl $3, I jle .L60 ALIGN_4 - + .L50: FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -222,7 +222,7 @@ FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -231,7 +231,7 @@ FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -240,7 +240,7 @@ FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -249,7 +249,7 @@ FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -258,7 +258,7 @@ FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -267,7 +267,7 @@ FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -276,7 +276,7 @@ FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -296,7 +296,7 @@ .L61: FLD 0 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) diff --git a/kernel/x86/amax_sse.S b/kernel/x86/amax_sse.S index 05d21a7eb..e988660e5 100644 --- a/kernel/x86/amax_sse.S +++ b/kernel/x86/amax_sse.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) @@ -50,7 +50,7 @@ #define X %ecx #define INCX %edx #define I %eax - + #ifdef USE_MIN #define maxps minps #define maxss minss @@ -155,7 +155,7 @@ decl I jle .L12 ALIGN_4 - + .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -326,7 +326,7 @@ #endif maxps %xmm4, %xmm0 addl $4 * SIZE, X - ALIGN_3 + ALIGN_3 .L18: testl $2, M @@ -340,7 +340,7 @@ maxps %xmm4, %xmm1 addl $2 * SIZE, X ALIGN_3 - + .L19: testl $1, M je .L998 @@ -358,7 +358,7 @@ sarl $3, I jle .L45 ALIGN_4 - + .L41: movss (X), %xmm4 addl INCX, X @@ -451,7 +451,7 @@ andps %xmm3, %xmm7 #endif maxss %xmm7, %xmm1 - ALIGN_3 + ALIGN_3 .L46: testl $2, M @@ -471,7 +471,7 @@ #endif maxss %xmm5, %xmm1 ALIGN_3 - + .L47: testl $1, M je .L998 @@ -493,7 +493,7 @@ shufps $1, %xmm0, %xmm0 maxss %xmm1, %xmm0 ALIGN_4 - + .L999: subl $8, %esp diff --git a/kernel/x86/amax_sse2.S b/kernel/x86/amax_sse2.S index ad56244b2..e21927cb1 100644 --- a/kernel/x86/amax_sse2.S +++ b/kernel/x86/amax_sse2.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) @@ -50,7 +50,7 @@ #define X %ecx #define INCX %edx #define I %eax - + #ifdef USE_MIN #define maxpd minpd #define maxsd minsd @@ -128,7 +128,7 @@ decl I jle .L12 ALIGN_4 - + .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -286,7 +286,7 @@ maxpd %xmm5, %xmm1 addl $4 * SIZE, X - ALIGN_3 + ALIGN_3 .L17: testl $2, M @@ -298,8 +298,8 @@ #endif maxpd %xmm4, %xmm0 addl $2 * SIZE, X - ALIGN_3 - + ALIGN_3 + .L18: testl $1, M jle .L998 @@ -318,7 +318,7 @@ sarl $4, I jle .L45 ALIGN_4 - + .L41: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -469,7 +469,7 @@ andps %xmm3, %xmm5 #endif maxpd %xmm5, %xmm1 - ALIGN_3 + ALIGN_3 .L47: testl $2, M @@ -484,7 +484,7 @@ #endif maxpd %xmm6, %xmm0 ALIGN_3 - + .L48: testl $1, M je .L998 diff --git a/kernel/x86/asum.S b/kernel/x86/asum.S index e1b0a6eb7..8c90f351f 100644 --- a/kernel/x86/asum.S +++ b/kernel/x86/asum.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define STACK 8 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) @@ -49,7 +49,7 @@ #define M %edx #define X %ecx #define INCX %esi - + #define I %eax #include "l1param.h" @@ -91,7 +91,7 @@ sarl $3, I jle .L20 ALIGN_4 - + .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -153,7 +153,7 @@ sarl $3, I jle .L60 ALIGN_4 - + .L50: FLD (X) addl INCX, X diff --git a/kernel/x86/asum_sse.S b/kernel/x86/asum_sse.S index 4506f299c..fd2492c1f 100644 --- a/kernel/x86/asum_sse.S +++ b/kernel/x86/asum_sse.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define STACK 8 #define ARGS 0 @@ -123,7 +123,7 @@ decl I jle .L12 ALIGN_3 - + .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -278,7 +278,7 @@ sarl $3, I jle .L105 ALIGN_4 - + .L101: movss (X), %xmm4 addl INCX, X @@ -344,7 +344,7 @@ #ifndef HAVE_SSE3 movhlps %xmm0, %xmm1 addps %xmm1, %xmm0 - + movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 addss %xmm1, %xmm0 diff --git a/kernel/x86/asum_sse2.S b/kernel/x86/asum_sse2.S index cea350369..a522fdf4e 100644 --- a/kernel/x86/asum_sse2.S +++ b/kernel/x86/asum_sse2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define STACK 8 #define ARGS 0 @@ -105,7 +105,7 @@ decl I jle .L11 ALIGN_4 - + .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -217,7 +217,7 @@ addpd %xmm5, %xmm1 addl $4 * SIZE, X - ALIGN_3 + ALIGN_3 .L22: testl $2, M @@ -227,7 +227,7 @@ andps %xmm3, %xmm4 addpd %xmm4, %xmm0 addl $2 * SIZE, X - + .L23: testl $1, M je .L999 @@ -246,7 +246,7 @@ sarl $3, I jle .L60 ALIGN_4 - + .L50: movsd -16 * SIZE(X), %xmm4 addl INCX, X diff --git a/kernel/x86/axpy.S b/kernel/x86/axpy.S index 7f3d99e44..6d9da4e70 100644 --- a/kernel/x86/axpy.S +++ b/kernel/x86/axpy.S @@ -41,7 +41,7 @@ #define STACK 12 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_ALPHA 16 + STACK + ARGS(%esp) #ifdef DOUBLE @@ -153,7 +153,7 @@ #ifdef HAVE_3DNOW prefetchw 24 * SIZE(Y) #endif - + addl $8 * SIZE, X addl $8 * SIZE, Y decl %eax diff --git a/kernel/x86/axpy_sse.S b/kernel/x86/axpy_sse.S index e06d90184..590e9b194 100644 --- a/kernel/x86/axpy_sse.S +++ b/kernel/x86/axpy_sse.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_ALPHA 16 + STACK + ARGS(%esp) #define STACK_X 20 + STACK + ARGS(%esp) @@ -82,7 +82,7 @@ testl M, M jle .L19 - + cmpl $SIZE, INCX jne .L50 cmpl $SIZE, INCY @@ -354,7 +354,7 @@ .L20: #ifdef ALIGNED_ACCESS - + testl $SIZE, X jne .L30 diff --git a/kernel/x86/axpy_sse2.S b/kernel/x86/axpy_sse2.S index 9b2d5d808..058747c43 100644 --- a/kernel/x86/axpy_sse2.S +++ b/kernel/x86/axpy_sse2.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_ALPHA 16 + STACK + ARGS(%esp) #define STACK_X 24 + STACK + ARGS(%esp) @@ -55,7 +55,7 @@ #define INCX %ecx #define INCY %edx #define YY %ebp - + #define ALPHA %xmm7 #include "l1param.h" @@ -605,7 +605,7 @@ movsd -2 * SIZE(X), %xmm3 movhps -1 * SIZE(X), %xmm3 - + subl $-8 * SIZE, Y subl $-8 * SIZE, X decl %eax diff --git a/kernel/x86/axpy_sse2_opteron.S b/kernel/x86/axpy_sse2_opteron.S index fb22415ba..bc7e9ea1c 100644 --- a/kernel/x86/axpy_sse2_opteron.S +++ b/kernel/x86/axpy_sse2_opteron.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define ALPHA 16 + STACK + ARGS(%esp) #define STACK_X 24 + STACK + ARGS(%esp) @@ -54,7 +54,7 @@ #define Y %edi #define INCX %ecx #define INCY %edx - + #define PREFETCHSIZE 64 PROLOGUE diff --git a/kernel/x86/copy.S b/kernel/x86/copy.S index 721d5c5d9..cf4ab203e 100644 --- a/kernel/x86/copy.S +++ b/kernel/x86/copy.S @@ -41,13 +41,13 @@ #define STACK 12 #define ARGS 0 - + #define M 4 + STACK + ARGS(%esp) #define X 8 + STACK + ARGS(%esp) #define INCX 12 + STACK + ARGS(%esp) #define Y 16 + STACK + ARGS(%esp) #define INCY 20 + STACK + ARGS(%esp) - + PROLOGUE pushl %edi @@ -88,14 +88,14 @@ ALIGN_2 .L11: - FLD 7 * SIZE(%ecx) - FLD 6 * SIZE(%ecx) - FLD 5 * SIZE(%ecx) - FLD 4 * SIZE(%ecx) - FLD 3 * SIZE(%ecx) - FLD 2 * SIZE(%ecx) - FLD 1 * SIZE(%ecx) - FLD 0 * SIZE(%ecx) + FLD 7 * SIZE(%ecx) + FLD 6 * SIZE(%ecx) + FLD 5 * SIZE(%ecx) + FLD 4 * SIZE(%ecx) + FLD 3 * SIZE(%ecx) + FLD 2 * SIZE(%ecx) + FLD 1 * SIZE(%ecx) + FLD 0 * SIZE(%ecx) FST 0 * SIZE(%edx) FST 1 * SIZE(%edx) diff --git a/kernel/x86/copy_sse.S b/kernel/x86/copy_sse.S index 34902dcac..c6d17b1e5 100644 --- a/kernel/x86/copy_sse.S +++ b/kernel/x86/copy_sse.S @@ -41,7 +41,7 @@ #define STACK 12 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) diff --git a/kernel/x86/copy_sse2.S b/kernel/x86/copy_sse2.S index 11524aa1f..9a74fe95f 100644 --- a/kernel/x86/copy_sse2.S +++ b/kernel/x86/copy_sse2.S @@ -41,7 +41,7 @@ #define STACK 12 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) diff --git a/kernel/x86/cpuid.S b/kernel/x86/cpuid.S index 773b67dd9..749339159 100644 --- a/kernel/x86/cpuid.S +++ b/kernel/x86/cpuid.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + PROLOGUE PROFCODE diff --git a/kernel/x86/dot.S b/kernel/x86/dot.S index 5bd5d282e..7f717834d 100644 --- a/kernel/x86/dot.S +++ b/kernel/x86/dot.S @@ -41,7 +41,7 @@ #define STACK 12 #define ARGS 0 - + #define STACK_N 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) @@ -79,8 +79,8 @@ movl (INCY),INCY #endif - leal (, INCX, SIZE), INCX - leal (, INCY, SIZE), INCY + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY fldz fldz diff --git a/kernel/x86/dot_amd.S b/kernel/x86/dot_amd.S index 75ad36ee6..35f0066c5 100644 --- a/kernel/x86/dot_amd.S +++ b/kernel/x86/dot_amd.S @@ -41,7 +41,7 @@ #define STACK 12 #define ARGS 0 - + #define STACK_N 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) @@ -78,8 +78,8 @@ movl (INCY),INCY #endif - leal (, INCX, SIZE), INCX - leal (, INCY, SIZE), INCY + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY fldz fldz diff --git a/kernel/x86/dot_sse.S b/kernel/x86/dot_sse.S index 181192119..392ac49c7 100644 --- a/kernel/x86/dot_sse.S +++ b/kernel/x86/dot_sse.S @@ -41,7 +41,7 @@ #define STACK 12 #define ARGS 0 - + #define STACK_N 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) @@ -75,8 +75,8 @@ movl (INCY),INCY # INCY #endif - leal (, INCX, SIZE), INCX - leal (, INCY, SIZE), INCY + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 @@ -1297,13 +1297,13 @@ #elif defined(HAVE_SSE2) movhlps %xmm0, %xmm1 addps %xmm1, %xmm0 - + PSHUFD2($1, %xmm0, %xmm1) addss %xmm1, %xmm0 #else movhlps %xmm0, %xmm1 addps %xmm1, %xmm0 - + movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 addss %xmm1, %xmm0 diff --git a/kernel/x86/dot_sse2.S b/kernel/x86/dot_sse2.S index f2053d2ea..9f5fa4201 100644 --- a/kernel/x86/dot_sse2.S +++ b/kernel/x86/dot_sse2.S @@ -41,7 +41,7 @@ #define STACK 12 #define ARGS 0 - + #define STACK_N 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) @@ -69,8 +69,8 @@ movl STACK_Y, Y movl STACK_INCY, INCY - leal (, INCX, SIZE), INCX - leal (, INCY, SIZE), INCY + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 diff --git a/kernel/x86/dot_sse2_opteron.S b/kernel/x86/dot_sse2_opteron.S index 7ac059f63..0b9da6b53 100644 --- a/kernel/x86/dot_sse2_opteron.S +++ b/kernel/x86/dot_sse2_opteron.S @@ -41,7 +41,7 @@ #define STACK 12 #define ARGS 0 - + #define STACK_N 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) @@ -76,8 +76,8 @@ movl (INCY),INCY # INCY #endif - leal (, INCX, SIZE), INCX - leal (, INCY, SIZE), INCY + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 diff --git a/kernel/x86/dot_sse_opteron.S b/kernel/x86/dot_sse_opteron.S index fc632193f..0d8dfc00f 100644 --- a/kernel/x86/dot_sse_opteron.S +++ b/kernel/x86/dot_sse_opteron.S @@ -41,7 +41,7 @@ #define STACK 12 #define ARGS 0 - + #define STACK_N 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) @@ -76,8 +76,8 @@ movl (INCY),INCY # INCY #endif - leal (, INCX, SIZE), INCX - leal (, INCY, SIZE), INCY + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 @@ -392,7 +392,7 @@ #if !defined(HAVE_SSE3) || defined(__INTERIX) movhlps %xmm0, %xmm1 addps %xmm1, %xmm0 - + pshufd $1, %xmm0, %xmm1 addss %xmm1, %xmm0 #else diff --git a/kernel/x86/gemm_beta.S b/kernel/x86/gemm_beta.S index b68dcf3d9..8592fe500 100644 --- a/kernel/x86/gemm_beta.S +++ b/kernel/x86/gemm_beta.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #ifdef DOUBLE @@ -169,7 +169,7 @@ FLD 4 * SIZE(%eax) fmul %st(1),%st FST 4 * SIZE(%eax) - + FLD 5 * SIZE(%eax) fmul %st(1),%st FST 5 * SIZE(%eax) diff --git a/kernel/x86/gemm_kernel_1x4.S b/kernel/x86/gemm_kernel_1x4.S index e1ff4e809..8e248b8a7 100644 --- a/kernel/x86/gemm_kernel_1x4.S +++ b/kernel/x86/gemm_kernel_1x4.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define J 0 + STACK(%esp) #define I 4 + STACK(%esp) #define KK 8 + STACK(%esp) @@ -70,7 +70,7 @@ #define BB %ebx #define LDC %ebp #define BX %esi - + #define PREFETCHSIZE (8 * 5 + 4) #define AOFFSET 1 @@ -334,7 +334,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl STACK_A, A movl STACK_B, B @@ -393,7 +393,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -529,7 +529,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl STACK_A, A movl STACK_B, B @@ -568,7 +568,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -733,7 +733,7 @@ #endif leal (, LDC, 2), %eax - addl %eax, C + addl %eax, C movl B, STACK_B ALIGN_4 @@ -747,7 +747,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl STACK_A, A movl STACK_B, B @@ -782,7 +782,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -892,7 +892,7 @@ addl $1, KK #endif - addl LDC, C + addl LDC, C movl B, STACK_B ALIGN_4 diff --git a/kernel/x86/gemm_kernel_2x2.S b/kernel/x86/gemm_kernel_2x2.S index 1483bc4d9..f513f6d6a 100644 --- a/kernel/x86/gemm_kernel_2x2.S +++ b/kernel/x86/gemm_kernel_2x2.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define J 0 + STACK(%esp) #define BX 4 + STACK(%esp) #define KK 8 + STACK(%esp) @@ -72,7 +72,7 @@ #else #define REP rep #endif - + PROLOGUE subl $ARGS, %esp # Generate Stack Frame @@ -89,12 +89,12 @@ negl %eax movl %eax, KK #endif - + movl N, %eax # j = (n >> 1) # MEMORY movl LDC, %ebp # ldc # MEMORY movl B, %ebx - sarl $1, %eax + sarl $1, %eax leal (, %ebp, SIZE), %ebp leal 0(%ecx) , %ecx # NOP movl %eax, J # j = (n >> 1) # MEMORY @@ -106,7 +106,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl %ebx, BX @@ -127,7 +127,7 @@ leal (, %eax, SIZE), %eax leal (%edx, %eax, 2), %edx leal (%ebx, %eax, 2), %ecx -#endif +#endif #ifdef HAVE_SSE movl BX, %eax @@ -164,7 +164,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -327,7 +327,7 @@ ffreep %st(0) ffreep %st(0) - FLD ALPHA + FLD ALPHA fmul %st, %st(4) fmul %st, %st(1) fmul %st, %st(2) @@ -384,7 +384,7 @@ leal (, %eax, SIZE), %eax leal (%edx, %eax, 1), %edx leal (%ebx, %eax, 2), %ecx -#endif +#endif fldz fldz @@ -395,7 +395,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -504,11 +504,11 @@ movl N, %eax # n # MEMORY andl $1, %eax je .End - + #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl C, %edi # c # MEMORY movl A, %edx # a # MEMORY @@ -517,7 +517,7 @@ sarl $1, %esi # m >> 1 je .L36 ALIGN_4 - + .L46: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ @@ -528,14 +528,14 @@ leal (, %eax, SIZE), %eax leal (%edx, %eax, 2), %edx leal (%ebx, %eax, 1), %ecx -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -648,14 +648,14 @@ leal (, %eax, SIZE), %eax leal (%edx, %eax, 1), %edx leal (%ebx, %eax, 1), %ecx -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT diff --git a/kernel/x86/gemm_kernel_2x2_atom.S b/kernel/x86/gemm_kernel_2x2_atom.S index f8954128a..14f0d20bd 100644 --- a/kernel/x86/gemm_kernel_2x2_atom.S +++ b/kernel/x86/gemm_kernel_2x2_atom.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -84,7 +84,7 @@ movl OFFSET, %eax #ifndef LEFT negl %eax -#endif +#endif movl %eax, KK #endif @@ -100,7 +100,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax sall $BASE_SHIFT + 1, %eax @@ -129,7 +129,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), BB -#endif +#endif movl BX, %eax prefetcht0 0 * SIZE(%eax) @@ -151,7 +151,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -319,7 +319,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), BB -#endif +#endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm2, %xmm2 @@ -332,7 +332,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -467,7 +467,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl C, CO1 addl LDC, C @@ -490,7 +490,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), BB -#endif +#endif movsd 0 * SIZE(BB), %xmm1 xorps %xmm0, %xmm0 @@ -504,7 +504,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -635,7 +635,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), BB -#endif +#endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 @@ -647,7 +647,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT diff --git a/kernel/x86/gemm_kernel_2x4_3dnow.S b/kernel/x86/gemm_kernel_2x4_3dnow.S index a86efda26..207ae625f 100644 --- a/kernel/x86/gemm_kernel_2x4_3dnow.S +++ b/kernel/x86/gemm_kernel_2x4_3dnow.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define OLD_M 4 + STACK + ARGS(%esi) #define OLD_N 8 + STACK + ARGS(%esi) #define OLD_K 12 + STACK + ARGS(%esi) @@ -114,7 +114,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla movl OLD_B, %edi movl OLD_C, %ebx punpckldq %mm3, %mm3 - + movq %mm3, ALPHA movl %ebx, C @@ -143,13 +143,13 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax sarl $2, %eax jle .L03 ALIGN_3 - + .L02: movd 0 * SIZE(%edi), %mm0 movd 1 * SIZE(%edi), %mm1 @@ -239,7 +239,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla decl %eax jne .L04 ALIGN_4 - + .L10: movl C, %esi # coffset = c movl A, %edx # aoffset = a @@ -282,7 +282,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -621,7 +621,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -948,13 +948,13 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax sarl $2, %eax jle .L33 ALIGN_3 - + .L32: movd 0 * SIZE(%edi), %mm0 movd 1 * SIZE(%edi), %mm1 @@ -1012,7 +1012,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla decl %eax jne .L34 ALIGN_4 - + .L40: movl C, %esi # coffset = c movl A, %edx # aoffset = a @@ -1051,7 +1051,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1279,7 +1279,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1495,13 +1495,13 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax sarl $3, %eax jle .L63 ALIGN_3 - + .L62: movd 0 * SIZE(%edi), %mm0 movd 1 * SIZE(%edi), %mm1 @@ -1554,7 +1554,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla decl %eax jne .L64 ALIGN_4 - + .L70: movl C, %esi # coffset = c movl A, %edx # aoffset = a @@ -1592,7 +1592,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1763,7 +1763,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT diff --git a/kernel/x86/gemm_kernel_2x4_barcelona.S b/kernel/x86/gemm_kernel_2x4_barcelona.S index 1acdc16c5..04035c78a 100644 --- a/kernel/x86/gemm_kernel_2x4_barcelona.S +++ b/kernel/x86/gemm_kernel_2x4_barcelona.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -203,7 +203,7 @@ #ifndef LEFT negl %eax -#endif +#endif movl %eax, KK #endif @@ -223,7 +223,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif leal GEMM_DEFAULT_Q * GEMM_DEFAULT_UNROLL_N * SIZE(B), %eax movl %eax, BX @@ -247,7 +247,7 @@ leal (, %eax, SIZE), %eax leal (AO, %eax, 2), AO leal (B, %eax, 4), BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm1 @@ -275,7 +275,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -492,7 +492,7 @@ leal (, %eax, SIZE), %eax leal (AO, %eax, 1), AO leal (B, %eax, 4), BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm1 @@ -508,7 +508,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -665,7 +665,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl C, CO # coffset = c movl A, AO # aoffset = a @@ -686,7 +686,7 @@ leal (, %eax, SIZE), %eax leal (AO, %eax, 2), AO leal (B, %eax, 2), BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 pxor %xmm4, %xmm4 @@ -701,7 +701,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -866,7 +866,7 @@ leal (, %eax, SIZE), %eax leal (AO, %eax, 1), AO leal (B, %eax, 2), BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 pxor %xmm4, %xmm4 @@ -879,7 +879,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1005,7 +1005,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl C, CO # coffset = c movl A, AO # aoffset = a @@ -1026,7 +1026,7 @@ leal (, %eax, SIZE), %eax leal (AO, %eax, 2), AO leal (B, %eax, 1), BO -#endif +#endif movddup -16 * SIZE(BO), %xmm0 pxor %xmm4, %xmm4 @@ -1041,7 +1041,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1167,7 +1167,7 @@ leal (, %eax, SIZE), %eax leal (AO, %eax, 1), AO leal (B, %eax, 1), BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 pxor %xmm4, %xmm4 @@ -1180,7 +1180,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT diff --git a/kernel/x86/gemm_kernel_2x4_core2.S b/kernel/x86/gemm_kernel_2x4_core2.S index 9907131d6..bc2775e31 100644 --- a/kernel/x86/gemm_kernel_2x4_core2.S +++ b/kernel/x86/gemm_kernel_2x4_core2.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -87,7 +87,7 @@ movl OFFSET, %eax #ifndef LEFT negl %eax -#endif +#endif movl %eax, KK #endif @@ -106,7 +106,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl B, BX @@ -160,7 +160,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -446,7 +446,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -617,7 +617,7 @@ addl $1, KK #endif ALIGN_4 - + .L29: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK @@ -639,7 +639,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl C, C1 movl A, AA @@ -677,7 +677,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -875,7 +875,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1008,7 +1008,7 @@ addl $1, KK #endif ALIGN_4 - + .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK @@ -1028,7 +1028,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl C, C1 movl A, AA @@ -1063,7 +1063,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1219,7 +1219,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1305,7 +1305,7 @@ movsd %xmm4, 0 * SIZE(C1) ALIGN_4 - + .L999: popl %ebx popl %esi diff --git a/kernel/x86/gemm_kernel_2x4_penryn.S b/kernel/x86/gemm_kernel_2x4_penryn.S index 0bdc9185c..b3bfa9a17 100644 --- a/kernel/x86/gemm_kernel_2x4_penryn.S +++ b/kernel/x86/gemm_kernel_2x4_penryn.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -116,7 +116,7 @@ movl OFFSET, %eax #ifndef LEFT negl %eax -#endif +#endif movl %eax, KK #endif @@ -135,7 +135,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax sall $BASE_SHIFT + 2, %eax @@ -190,7 +190,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -501,7 +501,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -672,7 +672,7 @@ addl $1, KK #endif ALIGN_4 - + .L29: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK @@ -694,7 +694,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl C, C1 movl A, AA @@ -732,7 +732,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -930,7 +930,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1063,7 +1063,7 @@ addl $1, KK #endif ALIGN_4 - + .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK @@ -1083,7 +1083,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl C, C1 movl A, AA @@ -1118,7 +1118,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1274,7 +1274,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1360,7 +1360,7 @@ movsd %xmm4, 0 * SIZE(C1) ALIGN_4 - + .L999: popl %ebx popl %esi diff --git a/kernel/x86/gemm_kernel_2x4_sse2.S b/kernel/x86/gemm_kernel_2x4_sse2.S index be58235ee..c587fba33 100644 --- a/kernel/x86/gemm_kernel_2x4_sse2.S +++ b/kernel/x86/gemm_kernel_2x4_sse2.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define OLD_M 4 + STACK + ARGS(%esi) #define OLD_N 8 + STACK + ARGS(%esi) #define OLD_K 12 + STACK + ARGS(%esi) @@ -238,7 +238,7 @@ movss %xmm4, KK #ifndef LEFT negl KK -#endif +#endif #endif leal (, LDC, SIZE), LDC @@ -252,7 +252,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif /* Copying to Sub Buffer */ movl K, %eax @@ -260,7 +260,7 @@ sarl $1, %eax jle .L05 ALIGN_4 - + .L02: #define COPYPREFETCH 40 @@ -321,7 +321,7 @@ addl $4 * SIZE, %edi ALIGN_4 - + .L10: movl %edi, BX @@ -344,7 +344,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 8), BB -#endif +#endif movl BX, %eax @@ -375,7 +375,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -390,7 +390,7 @@ andl $-8, %eax sall $4, %eax je .L15 -.L1X: +.L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ -606,7 +606,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -624,7 +624,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -841,7 +841,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif /* Copying to Sub Buffer */ movl K, %eax @@ -849,7 +849,7 @@ sarl $2, %eax jle .L35 ALIGN_4 - + .L32: #ifdef PENTIUM4 #ifdef HAVE_SSE3 @@ -981,7 +981,7 @@ decl %eax jne .L36 ALIGN_4 - + .L40: movl C, %esi # coffset = c movl A, AA # aoffset = a @@ -1002,7 +1002,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -1029,7 +1029,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1196,7 +1196,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -1215,7 +1215,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1368,14 +1368,14 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax leal BUFFER, %ecx sarl $3, %eax jle .L65 ALIGN_4 - + .L62: #ifdef PENTIUM4 #ifdef HAVE_SSE3 @@ -1496,7 +1496,7 @@ decl %eax jne .L66 ALIGN_4 - + .L70: movl C, %esi # coffset = c movl A, AA # aoffset = a @@ -1517,7 +1517,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -1542,7 +1542,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1670,7 +1670,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -1689,7 +1689,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1780,7 +1780,7 @@ .L999: movl OLD_STACK, %esp EMMS - + popl %ebx popl %esi popl %edi diff --git a/kernel/x86/gemm_kernel_2x4_sse3.S b/kernel/x86/gemm_kernel_2x4_sse3.S index e2732daf8..dc2ff05e6 100644 --- a/kernel/x86/gemm_kernel_2x4_sse3.S +++ b/kernel/x86/gemm_kernel_2x4_sse3.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -213,7 +213,7 @@ movl OFFSET, %eax #ifndef LEFT negl %eax -#endif +#endif movl %eax, KK #endif @@ -229,7 +229,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax sall $BASE_SHIFT + 2, %eax @@ -255,7 +255,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 4), BB -#endif +#endif movl BX, %eax prefetcht2 0 * SIZE(%eax) @@ -284,7 +284,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -299,7 +299,7 @@ andl $-8, %eax sall $4, %eax je .L15 - + .L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) @@ -741,7 +741,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 4), BB -#endif +#endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -757,7 +757,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -968,7 +968,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl M, %ebx sarl $1, %ebx # i = (m >> 2) @@ -986,7 +986,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), BB -#endif +#endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1012,7 +1012,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1179,7 +1179,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), BB -#endif +#endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1195,7 +1195,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1348,7 +1348,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl M, %ebx sarl $1, %ebx # i = (m >> 2) @@ -1366,7 +1366,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), BB -#endif +#endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1386,7 +1386,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1513,7 +1513,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), BB -#endif +#endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1529,7 +1529,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT diff --git a/kernel/x86/gemm_kernel_4x2_core2.S b/kernel/x86/gemm_kernel_4x2_core2.S index 641b5fc46..edaebcba3 100644 --- a/kernel/x86/gemm_kernel_4x2_core2.S +++ b/kernel/x86/gemm_kernel_4x2_core2.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define OLD_M 4 + STACK + ARGS(%esi) #define OLD_N 8 + STACK + ARGS(%esi) #define OLD_K 12 + STACK + ARGS(%esi) @@ -121,7 +121,7 @@ movd %mm4, KK #ifndef LEFT negl KK -#endif +#endif #endif subl $-16 * SIZE, A @@ -140,13 +140,13 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax sarl $2, %eax jle .L05 ALIGN_4 - + .L02: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 @@ -193,7 +193,7 @@ decl %eax jne .L06 ALIGN_4 - + .L10: movl B, BX @@ -239,7 +239,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -500,7 +500,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax addl $2, %eax @@ -664,7 +664,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -807,13 +807,13 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax sarl $3, %eax jle .L45 ALIGN_4 - + .L42: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 @@ -855,7 +855,7 @@ decl %eax jne .L46 ALIGN_4 - + .L50: movl C, C1 movl A, AA @@ -893,7 +893,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1060,7 +1060,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1199,7 +1199,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax addl $1, %eax @@ -1292,7 +1292,7 @@ .L999: movl OLD_STACK, %esp - + EMMS popl %ebx diff --git a/kernel/x86/gemm_kernel_4x2_sse2.S b/kernel/x86/gemm_kernel_4x2_sse2.S index 2e67afaf9..ea93225d7 100644 --- a/kernel/x86/gemm_kernel_4x2_sse2.S +++ b/kernel/x86/gemm_kernel_4x2_sse2.S @@ -47,7 +47,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -57,7 +57,7 @@ #define STACK_C 32 + STACK + ARGS(%esi) #define STACK_LDC 36 + STACK + ARGS(%esi) #define STACK_OFFT 40 + STACK + ARGS(%esi) - + #define ALPHA 0(%esp) #define K 16(%esp) #define N 20(%esp) @@ -242,7 +242,7 @@ movd %mm4, KK #ifndef LEFT negl KK -#endif +#endif #endif sall $BASE_SHIFT, LDC @@ -250,12 +250,12 @@ movl %eax, J jle .L100 ALIGN_2 - + .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif /* Copying to Sub Buffer */ leal BUFFER, %ecx @@ -263,7 +263,7 @@ sarl $2, %eax jle .L03 ALIGN_2 - + .L02: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 @@ -324,7 +324,7 @@ BRANCH jne .L04 ALIGN_4 - + .L05: movl B, BX @@ -368,7 +368,7 @@ movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 -#endif +#endif prefetchnta 3 * SIZE(%esi) prefetchnta 3 * SIZE(%esi, LDC) @@ -383,7 +383,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -401,7 +401,7 @@ je .L12 sall $3, %eax .align 8 - + .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) @@ -494,7 +494,7 @@ subl $64 * 8, %eax BRANCH jg .L1X - + .L11: leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB @@ -503,7 +503,7 @@ sarl $3, %eax je .L12 -.L11: +.L11: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) @@ -548,7 +548,7 @@ addl $4 * SIZE, BB # boffset1 += 8 subl $1, %eax jg .L13 - ALIGN_4 + ALIGN_4 .L14: mulpd %xmm3, %xmm4 @@ -597,7 +597,7 @@ BRANCH jg .L10 jmp .L30 - ALIGN_2 + ALIGN_2 .L18x: #ifndef TRMMKERNEL @@ -683,14 +683,14 @@ pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax addl $2, %eax @@ -699,7 +699,7 @@ sarl $3, %eax je .L32 -.L31: +.L31: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 @@ -861,14 +861,14 @@ pxor %xmm6, %xmm6 movsd 4 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -881,7 +881,7 @@ sarl $3, %eax je .L52 -.L51: +.L51: mulsd %xmm0, %xmm2 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 @@ -1001,12 +1001,12 @@ testl $1, %eax jle .L999 ALIGN_2 - + .L101: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif /* Copying to Sub Buffer */ leal BUFFER, %ecx @@ -1015,7 +1015,7 @@ sarl $3, %eax jle .L103 ALIGN_4 - + .L102: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 @@ -1072,7 +1072,7 @@ decl %eax jne .L104 ALIGN_4 - + .L105: movl C, %esi # coffset = c movl A, %edx # aoffset = a @@ -1112,7 +1112,7 @@ pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL @@ -1120,7 +1120,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1133,7 +1133,7 @@ sarl $3, %eax je .L112 -.L111: +.L111: mulpd %xmm2, %xmm0 mulpd 2 * SIZE(AA), %xmm2 addpd %xmm0, %xmm4 @@ -1255,7 +1255,7 @@ BRANCH decl %ebx # i -- jg .L110 - ALIGN_2 + ALIGN_2 .L130: movl M, %ebx @@ -1292,7 +1292,7 @@ pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL @@ -1300,7 +1300,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1313,7 +1313,7 @@ sarl $3, %eax je .L132 -.L131: +.L131: mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 @@ -1438,14 +1438,14 @@ pxor %xmm6, %xmm6 movapd 4 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax addl $1, %eax @@ -1454,7 +1454,7 @@ sarl $3, %eax je .L152 -.L151: +.L151: mulsd %xmm0, %xmm2 movsd 1 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 diff --git a/kernel/x86/gemm_kernel_4x4_barcelona.S b/kernel/x86/gemm_kernel_4x4_barcelona.S index f081aec2a..df11ba5c3 100644 --- a/kernel/x86/gemm_kernel_4x4_barcelona.S +++ b/kernel/x86/gemm_kernel_4x4_barcelona.S @@ -40,7 +40,7 @@ #include "common.h" #define STACK 16 - + #define OLD_M 4 + STACK(%esi) #define OLD_N 8 + STACK(%esi) #define OLD_K 12 + STACK(%esi) @@ -203,7 +203,7 @@ andl $-1024, %esp # align stack STACK_TOUCHING - + movl OLD_N, %eax movl OLD_K, %ecx movl OLD_A, %edx @@ -230,7 +230,7 @@ movss %xmm4, KK #ifndef LEFT negl KK -#endif +#endif #endif leal (, LDC, SIZE), LDC @@ -242,7 +242,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif /* Copying to Sub Buffer */ leal BUFFER, %ecx @@ -251,7 +251,7 @@ sarl $1, %eax jle .L05 ALIGN_4 - + .L02: prefetch (RPREFETCHSIZE + 0) * SIZE(%edi) @@ -307,7 +307,7 @@ addl $4 * SIZE, %edi ALIGN_4 - + .L10: movl C, %esi # coffset = c movl A, %edx # aoffset = a @@ -328,7 +328,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 8), BB -#endif +#endif movaps 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 @@ -351,7 +351,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -365,7 +365,7 @@ andl $-8, %eax sall $4, %eax je .L15 -.L1X: +.L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) @@ -563,7 +563,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB -#endif +#endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 @@ -579,7 +579,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -809,7 +809,7 @@ leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB leal (BB, %eax, 8), BB -#endif +#endif movss 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 @@ -825,7 +825,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1047,14 +1047,14 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax leal BUFFER, %ecx sarl $2, %eax jle .L45 ALIGN_4 - + .L42: prefetch (RPREFETCHSIZE + 0) * SIZE(%edi) @@ -1111,7 +1111,7 @@ decl %eax jne .L46 ALIGN_4 - + .L50: movl C, %esi # coffset = c movl A, %edx # aoffset = a @@ -1132,7 +1132,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -1152,7 +1152,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1313,7 +1313,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -1332,7 +1332,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1496,7 +1496,7 @@ leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -1515,7 +1515,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1651,7 +1651,7 @@ addl $2, KK #endif leal (, LDC, 2), %eax - addl %eax, C + addl %eax, C ALIGN_4 .L80: @@ -1661,7 +1661,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif leal BUFFER, %ecx @@ -1669,7 +1669,7 @@ sarl $3, %eax jle .L85 ALIGN_4 - + .L82: prefetch (RPREFETCHSIZE + 0) * SIZE(%edi) @@ -1722,7 +1722,7 @@ decl %eax jne .L86 ALIGN_4 - + .L90: movl C, %esi # coffset = c movl A, %edx # aoffset = a @@ -1743,7 +1743,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -1762,7 +1762,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1889,7 +1889,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -1908,7 +1908,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -2034,7 +2034,7 @@ leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -2053,7 +2053,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT diff --git a/kernel/x86/gemm_kernel_4x4_penryn.S b/kernel/x86/gemm_kernel_4x4_penryn.S index 2d51d9711..e3f73843f 100644 --- a/kernel/x86/gemm_kernel_4x4_penryn.S +++ b/kernel/x86/gemm_kernel_4x4_penryn.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -116,7 +116,7 @@ movl OFFSET, %eax #ifndef LEFT negl %eax -#endif +#endif movl %eax, KK #endif @@ -135,7 +135,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax sall $BASE_SHIFT + 2, %eax @@ -190,7 +190,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -483,7 +483,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -691,7 +691,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -854,7 +854,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl C, C1 movl A, AA @@ -895,7 +895,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1091,7 +1091,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1251,7 +1251,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1397,7 +1397,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl C, C1 movl A, AA @@ -1434,7 +1434,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1593,7 +1593,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1743,7 +1743,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT diff --git a/kernel/x86/gemm_kernel_4x4_sse.S b/kernel/x86/gemm_kernel_4x4_sse.S index b360a58da..5503344cd 100644 --- a/kernel/x86/gemm_kernel_4x4_sse.S +++ b/kernel/x86/gemm_kernel_4x4_sse.S @@ -40,7 +40,7 @@ #include "common.h" #define STACK 16 - + #define OLD_M 4 + STACK(%esi) #define OLD_N 8 + STACK(%esi) #define OLD_K 12 + STACK(%esi) @@ -207,7 +207,7 @@ addps %xmm1, %xmm7; \ movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; #endif - + #ifdef PENTIUM4 #define KERNEL1(address) \ mulps %xmm0, %xmm2; \ @@ -333,7 +333,7 @@ PROFCODE EMMS - + movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE, %esp movl OLD_M, %ebx @@ -367,7 +367,7 @@ movss %xmm4, KK #ifndef LEFT negl KK -#endif +#endif #endif leal (, LDC, SIZE), LDC @@ -379,7 +379,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif /* Copying to Sub Buffer */ leal BUFFER, %ecx @@ -388,7 +388,7 @@ sarl $1, %eax jle .L05 ALIGN_4 - + .L02: #ifdef HAVE_SSE2 movss 0 * SIZE(%edi), %xmm0 @@ -524,7 +524,7 @@ #endif addl $4 * SIZE, %edi ALIGN_4 - + .L10: movl %edi, BX @@ -547,7 +547,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 8), BB -#endif +#endif movl BX, %eax @@ -607,7 +607,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -622,7 +622,7 @@ andl $-8, %eax sall $4, %eax je .L15 -.L1X: +.L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) @@ -841,7 +841,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB -#endif +#endif movaps 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 @@ -857,7 +857,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1099,7 +1099,7 @@ leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB leal (BB, %eax, 8), BB -#endif +#endif movss 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 @@ -1115,7 +1115,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1337,14 +1337,14 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax leal BUFFER, %ecx sarl $2, %eax jle .L45 ALIGN_4 - + .L42: prefetchnta 80 * SIZE(%edi) @@ -1469,7 +1469,7 @@ decl %eax jne .L46 ALIGN_4 - + .L50: movl C, %esi # coffset = c movl A, %edx # aoffset = a @@ -1490,7 +1490,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -1515,7 +1515,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1676,7 +1676,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -1695,7 +1695,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1865,7 +1865,7 @@ leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -1884,7 +1884,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -2020,7 +2020,7 @@ addl $2, KK #endif leal (, LDC, 2), %eax - addl %eax, C + addl %eax, C ALIGN_4 .L80: @@ -2030,14 +2030,14 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax leal BUFFER, %ecx sarl $3, %eax jle .L85 ALIGN_4 - + .L82: prefetchnta 80 * SIZE(%edi) @@ -2151,7 +2151,7 @@ decl %eax jne .L86 ALIGN_4 - + .L90: movl C, %esi # coffset = c movl A, %edx # aoffset = a @@ -2172,7 +2172,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -2195,7 +2195,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -2322,7 +2322,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -2341,7 +2341,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -2470,7 +2470,7 @@ leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -2489,7 +2489,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT diff --git a/kernel/x86/gemm_kernel_4x4_sse3.S b/kernel/x86/gemm_kernel_4x4_sse3.S index 78efab6c4..12581d9ee 100644 --- a/kernel/x86/gemm_kernel_4x4_sse3.S +++ b/kernel/x86/gemm_kernel_4x4_sse3.S @@ -40,7 +40,7 @@ #include "common.h" #define STACK 16 - + #define OLD_M 4 + STACK(%esi) #define OLD_N 8 + STACK(%esi) #define OLD_K 12 + STACK(%esi) @@ -247,7 +247,7 @@ movss %xmm4, KK #ifndef LEFT negl KK -#endif +#endif #endif leal (, LDC, SIZE), LDC @@ -259,7 +259,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif /* Copying to Sub Buffer */ leal BUFFER, %ecx @@ -268,7 +268,7 @@ sarl $2, %eax jle .L05 ALIGN_4 - + .L02: movddup 0 * SIZE(%edi), %xmm0 movddup 2 * SIZE(%edi), %xmm1 @@ -316,7 +316,7 @@ decl %eax jne .L06 ALIGN_4 - + .L10: movl C, %esi # coffset = c movl A, %edx # aoffset = a @@ -337,7 +337,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB -#endif +#endif movaps 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -360,7 +360,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -375,7 +375,7 @@ andl $-8, %eax sall $4, %eax je .L15 -.L1X: +.L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) @@ -677,7 +677,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB -#endif +#endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -691,7 +691,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -867,7 +867,7 @@ leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB -#endif +#endif movss 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -881,7 +881,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1046,14 +1046,14 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax leal BUFFER, %ecx sarl $3, %eax jle .L45 ALIGN_4 - + .L42: movddup 0 * SIZE(%edi), %xmm0 movddup 2 * SIZE(%edi), %xmm1 @@ -1098,7 +1098,7 @@ decl %eax jne .L46 ALIGN_4 - + .L50: movl C, %esi # coffset = c movl A, %edx # aoffset = a @@ -1119,7 +1119,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB -#endif +#endif movaps 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1138,7 +1138,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1298,7 +1298,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB -#endif +#endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1314,7 +1314,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1446,7 +1446,7 @@ leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB -#endif +#endif movss 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1462,7 +1462,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1590,7 +1590,7 @@ addl $2, KK #endif leal (, LDC, 2), %eax - addl %eax, C + addl %eax, C ALIGN_4 .L80: @@ -1600,14 +1600,14 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax leal BUFFER, %ecx sarl $3, %eax jle .L85 ALIGN_4 - + .L82: movss 0 * SIZE(%edi), %xmm0 movss 1 * SIZE(%edi), %xmm1 @@ -1661,7 +1661,7 @@ decl %eax jne .L86 ALIGN_4 - + .L90: movl C, %esi # coffset = c movl A, %edx # aoffset = a @@ -1682,7 +1682,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 1), BB -#endif +#endif movaps 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1702,7 +1702,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1828,7 +1828,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 1), BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -1847,7 +1847,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1971,7 +1971,7 @@ leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB -#endif +#endif movss 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1987,7 +1987,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT diff --git a/kernel/x86/gemm_kernel_8x1_sse2.S b/kernel/x86/gemm_kernel_8x1_sse2.S index 52a9ebc9c..fbeef0f11 100644 --- a/kernel/x86/gemm_kernel_8x1_sse2.S +++ b/kernel/x86/gemm_kernel_8x1_sse2.S @@ -45,7 +45,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -216,7 +216,7 @@ movl %eax, J jle .L999 ALIGN_2 - + .L01: /* Copying to Sub Buffer */ movl K, %eax @@ -224,7 +224,7 @@ sarl $3, %eax jle .L03 ALIGN_4 - + .L02: prefetchnta 96 * SIZE(B) @@ -279,7 +279,7 @@ decl %eax jne .L04 ALIGN_4 - + .L05: movl C, %esi # coffset = c movl A, %edx # aoffset = a @@ -306,7 +306,7 @@ andl $-8, %eax leal (, %eax, 8), %eax je .L12 - + KERNELMACRO(32 * 0) # 0 cmpl $64 * 1, %eax jle .L11 @@ -372,7 +372,7 @@ #define PRE 40 -.L11: +.L11: mulpd %xmm0, %xmm1 movd (PRE + 0) * SIZE(AA), %mm0 addpd %xmm1, %xmm4 @@ -544,7 +544,7 @@ BRANCH decl %ebx # i -- jg .L10 - ALIGN_2 + ALIGN_2 .L20: movl M, %ebx @@ -567,7 +567,7 @@ sarl $3, %eax je .L22 -.L21: +.L21: movapd 0 * SIZE(BB), %xmm0 movapd 0 * SIZE(AA), %xmm1 mulpd %xmm0, %xmm1 @@ -690,7 +690,7 @@ sarl $3, %eax je .L32 -.L31: +.L31: movapd 0 * SIZE(BB), %xmm0 movapd 0 * SIZE(AA), %xmm1 mulpd %xmm0, %xmm1 @@ -791,7 +791,7 @@ sarl $3, %eax je .L52 -.L51: +.L51: movsd 0 * SIZE(AA), %xmm0 mulsd 0 * SIZE(BB), %xmm0 addsd %xmm0, %xmm4 diff --git a/kernel/x86/gemm_kernel_8x2_core2.S b/kernel/x86/gemm_kernel_8x2_core2.S index 3fd8c566d..6e2edc48f 100644 --- a/kernel/x86/gemm_kernel_8x2_core2.S +++ b/kernel/x86/gemm_kernel_8x2_core2.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -114,7 +114,7 @@ movd %mm4, KK #ifndef LEFT negl KK -#endif +#endif #endif shufps $0, %xmm3, %xmm3 @@ -142,13 +142,13 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax sarl $2, %eax jle .L05 ALIGN_4 - + .L02: prefetcht0 (PREFETCH_R + 0) * SIZE(B) movss -32 * SIZE(B), %xmm0 @@ -207,7 +207,7 @@ decl %eax jne .L06 ALIGN_4 - + .L10: movl C, C1 movl A, AA @@ -244,7 +244,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -494,7 +494,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -662,7 +662,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -825,7 +825,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -977,13 +977,13 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax sarl $3, %eax jle .L55 ALIGN_4 - + .L52: movss -32 * SIZE(B), %xmm0 movss -31 * SIZE(B), %xmm1 @@ -1035,7 +1035,7 @@ decl %eax jne .L56 ALIGN_4 - + .L60: movl C, C1 movl A, AA @@ -1073,7 +1073,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1240,7 +1240,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1379,7 +1379,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1513,7 +1513,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1610,7 +1610,7 @@ .L999: movl OLD_STACK, %esp - + EMMS popl %ebx diff --git a/kernel/x86/gemm_kernel_8x2_sse.S b/kernel/x86/gemm_kernel_8x2_sse.S index c3897646b..f855263e7 100644 --- a/kernel/x86/gemm_kernel_8x2_sse.S +++ b/kernel/x86/gemm_kernel_8x2_sse.S @@ -45,7 +45,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -218,7 +218,7 @@ addl $STACK_OFFSET, %esp STACK_TOUCHING - + movd STACK_M, %mm0 movl STACK_N, %eax movd STACK_K, %mm1 @@ -247,7 +247,7 @@ movd %mm4, KK #ifndef LEFT negl KK -#endif +#endif #endif leal (, LDC, SIZE), LDC @@ -256,12 +256,12 @@ movl %eax, J jle .L100 ALIGN_2 - + .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif /* Copying to Sub Buffer */ movl K, %eax @@ -269,7 +269,7 @@ sarl $2, %eax jle .L03 ALIGN_4 - + .L02: movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 @@ -299,7 +299,7 @@ movaps %xmm7, 28 * SIZE(%ecx) prefetcht0 104 * SIZE(B) - + addl $ 8 * SIZE, B addl $32 * SIZE, %ecx decl %eax @@ -367,7 +367,7 @@ xorps %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 -#endif +#endif prefetchnta 7 * SIZE(%esi) prefetchnta 7 * SIZE(%esi, %ebp) @@ -377,7 +377,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -391,7 +391,7 @@ NOBRANCH je .L12 sall $3, %eax - + .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) @@ -518,7 +518,7 @@ xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 -#endif +#endif prefetchnta 8 * SIZE(%esi) prefetchnta 8 * SIZE(%esi, %ebp) @@ -528,7 +528,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -791,7 +791,7 @@ BRANCH decl %ebx # i -- jg .L10 - ALIGN_2 + ALIGN_2 .L30: movl M, %ebx @@ -830,14 +830,14 @@ xorps %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -936,14 +936,14 @@ xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1086,7 +1086,7 @@ #endif addl $4 * SIZE, %esi - ALIGN_2 + ALIGN_2 .L50: testl $2, %ebx @@ -1122,14 +1122,14 @@ xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1236,14 +1236,14 @@ xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1385,7 +1385,7 @@ #endif addl $2 * SIZE, %esi - ALIGN_2 + ALIGN_2 .L70: testl $1, %ebx @@ -1420,14 +1420,14 @@ xorps %xmm6, %xmm6 movss 4 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1526,14 +1526,14 @@ xorps %xmm6, %xmm6 movss 4 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1658,7 +1658,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) addl $1, KK #endif - ALIGN_2 + ALIGN_2 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) @@ -1677,12 +1677,12 @@ testl $1, %eax jle .L999 ALIGN_2 - + .L101: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif /* Copying to Sub Buffer */ movl K, %eax @@ -1690,10 +1690,10 @@ sarl $3, %eax jle .L103 ALIGN_4 - + .L102: prefetchnta 96 * SIZE(B) - + movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 movss 2 * SIZE(B), %xmm2 @@ -1785,14 +1785,14 @@ xorps %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1892,14 +1892,14 @@ xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -2045,7 +2045,7 @@ BRANCH decl %ebx # i -- jg .L110 - ALIGN_2 + ALIGN_2 .L130: movl M, %ebx @@ -2084,14 +2084,14 @@ xorps %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -2168,14 +2168,14 @@ xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -2278,7 +2278,7 @@ addl $4, KK #endif addl $4 * SIZE, %esi - ALIGN_2 + ALIGN_2 .L150: testl $2, %ebx @@ -2313,14 +2313,14 @@ xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -2403,14 +2403,14 @@ xorps %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -2518,7 +2518,7 @@ addl $2, KK #endif addl $2 * SIZE, %esi - ALIGN_2 + ALIGN_2 .L170: testl $1, %ebx @@ -2553,14 +2553,14 @@ xorps %xmm6, %xmm6 movss 4 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -2637,14 +2637,14 @@ xorps %xmm6, %xmm6 movss 4 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -2728,7 +2728,7 @@ addss 0 * SIZE(%esi), %xmm4 #endif movss %xmm4, 0 * SIZE(%esi) - ALIGN_2 + ALIGN_2 .L999: movl OLD_STACK, %esp diff --git a/kernel/x86/gemm_ncopy_2.S b/kernel/x86/gemm_ncopy_2.S index a2674c749..e8288255b 100644 --- a/kernel/x86/gemm_ncopy_2.S +++ b/kernel/x86/gemm_ncopy_2.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 8 - + #define J 0 + STACK(%esp) #define M 4 + STACK + ARGS(%esp) diff --git a/kernel/x86/gemm_ncopy_2_sse.S b/kernel/x86/gemm_ncopy_2_sse.S index 1a8262c96..7a6613d6e 100644 --- a/kernel/x86/gemm_ncopy_2_sse.S +++ b/kernel/x86/gemm_ncopy_2_sse.S @@ -46,7 +46,7 @@ #define STACK 16 #define ARGS 0 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define ARG_A 12 + STACK + ARGS(%esp) @@ -60,7 +60,7 @@ #define A2 %edx #define I %esi #define J %edi - + PROLOGUE pushl %ebp diff --git a/kernel/x86/gemm_ncopy_4_sse.S b/kernel/x86/gemm_ncopy_4_sse.S index 3e919b26b..4c26b955a 100644 --- a/kernel/x86/gemm_ncopy_4_sse.S +++ b/kernel/x86/gemm_ncopy_4_sse.S @@ -46,7 +46,7 @@ #define STACK 16 #define ARGS 0 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define ARG_A 12 + STACK + ARGS(%esp) @@ -60,7 +60,7 @@ #define A2 %edx #define I %esi #define J %edi - + PROLOGUE pushl %ebp diff --git a/kernel/x86/gemm_tcopy_2.S b/kernel/x86/gemm_tcopy_2.S index 61b775475..3d862b605 100644 --- a/kernel/x86/gemm_tcopy_2.S +++ b/kernel/x86/gemm_tcopy_2.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 8 - + #define J 0 + STACK(%esp) #define BOFFSET2 4 + STACK(%esp) @@ -60,7 +60,7 @@ pushl %ebx PROFCODE - + EMMS movl A, %ebp diff --git a/kernel/x86/gemm_tcopy_2_sse.S b/kernel/x86/gemm_tcopy_2_sse.S index de5f4ffe2..3a5b7c65f 100644 --- a/kernel/x86/gemm_tcopy_2_sse.S +++ b/kernel/x86/gemm_tcopy_2_sse.S @@ -46,7 +46,7 @@ #define STACK 16 #define ARGS 8 - + #define J 0 + STACK(%esp) #define BOFFSET2 4 + STACK(%esp) @@ -65,7 +65,7 @@ pushl %ebx PROFCODE - + movl A, %ebp movl B, %edi diff --git a/kernel/x86/gemm_tcopy_4_sse.S b/kernel/x86/gemm_tcopy_4_sse.S index 4e1e2e661..bc84f6951 100644 --- a/kernel/x86/gemm_tcopy_4_sse.S +++ b/kernel/x86/gemm_tcopy_4_sse.S @@ -46,7 +46,7 @@ #define STACK 16 #define ARGS 0 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define ARG_A 12 + STACK + ARGS(%esp) @@ -60,7 +60,7 @@ #define A2 %edx #define I %esi #define J %edi - + PROLOGUE pushl %ebp diff --git a/kernel/x86/gemv_n.S b/kernel/x86/gemv_n.S index 652c0bb0f..53dfd4ec7 100644 --- a/kernel/x86/gemv_n.S +++ b/kernel/x86/gemv_n.S @@ -53,7 +53,7 @@ #define STACK 16 #define ARGS 16 - + #define PLDA_M 0 + STACK(%esp) #define XP 4 + STACK(%esp) #define MIN_N 8 + STACK(%esp) @@ -190,7 +190,7 @@ ALIGN_2 .L48: - movl A, %edx # a_offset = a + movl A, %edx # a_offset = a fldz addl $4 * SIZE, A # a += 4 fldz @@ -261,7 +261,7 @@ FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 - + FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(3) # ct2 += at1 @@ -279,7 +279,7 @@ FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 - + FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(3) # ct2 += at1 @@ -306,7 +306,7 @@ FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 - + FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(3) # ct2 += at1 diff --git a/kernel/x86/gemv_n_atom.S b/kernel/x86/gemv_n_atom.S index e88409ce2..f30074071 100644 --- a/kernel/x86/gemv_n_atom.S +++ b/kernel/x86/gemv_n_atom.S @@ -57,7 +57,7 @@ #define Y 40 + STACKSIZE(%esp) #define STACK_INCY 44 + STACKSIZE(%esp) #define BUFFER 48 + STACKSIZE(%esp) - + #define I %eax #define J %ebx @@ -93,7 +93,7 @@ jle .L999 movl BUFFER, Y1 - + pxor %xmm7, %xmm7 movl M, %eax @@ -767,7 +767,7 @@ .L999: popl %ebx popl %esi - popl %edi + popl %edi popl %ebp ret diff --git a/kernel/x86/gemv_n_sse.S b/kernel/x86/gemv_n_sse.S index f3a388ffd..3c77a2aba 100644 --- a/kernel/x86/gemv_n_sse.S +++ b/kernel/x86/gemv_n_sse.S @@ -105,7 +105,7 @@ #define MMM 0+ARGS(%esp) #define YY 4+ARGS(%esp) #define AA 8+ARGS(%esp) - + #define I %eax #define J %ebx @@ -169,7 +169,7 @@ jle .L999 movl BUFFER, Y1 - + xorps %xmm7, %xmm7 movl M, %eax @@ -697,7 +697,7 @@ .L999x: popl %ebx popl %esi - popl %edi + popl %edi popl %ebp addl $ARGS,%esp ret diff --git a/kernel/x86/gemv_n_sse2.S b/kernel/x86/gemv_n_sse2.S index eeb3c259d..2b2cc6455 100644 --- a/kernel/x86/gemv_n_sse2.S +++ b/kernel/x86/gemv_n_sse2.S @@ -92,7 +92,7 @@ #define MMM 0+ARGS(%esp) #define YY 4+ARGS(%esp) #define AA 8+ARGS(%esp) - + #define I %eax #define J %ebx @@ -157,7 +157,7 @@ jle .L999 movl BUFFER, Y1 - + pxor %xmm7, %xmm7 movl M, %eax @@ -724,7 +724,7 @@ popl %ebx popl %esi - popl %edi + popl %edi popl %ebp addl $ARGS,%esp ret diff --git a/kernel/x86/gemv_t.S b/kernel/x86/gemv_t.S index 2eecd3fff..0d2a251dc 100644 --- a/kernel/x86/gemv_t.S +++ b/kernel/x86/gemv_t.S @@ -49,7 +49,7 @@ #define STACK 16 #define ARGS 24 - + #define NLDA 0 + STACK(%esp) #define XP 4 + STACK(%esp) #define MIN_M 8 + STACK(%esp) @@ -305,7 +305,7 @@ addl $4 * SIZE, %esi #else - + #if defined(HAS_PREFETCH) prefetcht0 PRESIZE * SIZE(%ebx) prefetcht0 PRESIZE * SIZE(%ebx, %edx, 2) diff --git a/kernel/x86/gemv_t_atom.S b/kernel/x86/gemv_t_atom.S index a21416d49..43ff0f098 100644 --- a/kernel/x86/gemv_t_atom.S +++ b/kernel/x86/gemv_t_atom.S @@ -57,7 +57,7 @@ #define Y 40 + STACKSIZE(%esp) #define STACK_INCY 44 + STACKSIZE(%esp) #define BUFFER 48 + STACKSIZE(%esp) - + #define I %eax #define J %ebx @@ -95,7 +95,7 @@ jle .L999 movl BUFFER, Y1 - + movl M, I sarl $3, I jle .L05 @@ -365,7 +365,7 @@ addsd %xmm6, %xmm0 addsd %xmm7, %xmm1 - + addl $4 * SIZE, A1 addl $4 * SIZE, X ALIGN_4 @@ -582,7 +582,7 @@ mulsd %xmm3, %xmm5 addsd %xmm4, %xmm0 addsd %xmm5, %xmm1 - + addl $2 * SIZE, A1 ALIGN_4 @@ -605,11 +605,11 @@ movsd %xmm0, (Y1) ALIGN_4 - + .L999: popl %ebx popl %esi - popl %edi + popl %edi popl %ebp ret diff --git a/kernel/x86/gemv_t_sse.S b/kernel/x86/gemv_t_sse.S index 48193f142..2c927aa33 100644 --- a/kernel/x86/gemv_t_sse.S +++ b/kernel/x86/gemv_t_sse.S @@ -105,7 +105,7 @@ #define MMM 0+ARGS(%esp) #define AA 4+ARGS(%esp) #define XX 8+ARGS(%esp) - + #define I %eax #define J %ebx @@ -139,10 +139,10 @@ sall $22,J # J=2^24*sizeof(float)=buffer size(16MB) subl $8, J # Don't use last 8 float in the buffer. subl J,MMM # MMM=MMM-J - movl J,M + movl J,M jge .L00t ALIGN_4 - + movl MMM,%eax addl J,%eax jle .L999x @@ -171,7 +171,7 @@ jle .L999 movl BUFFER, Y1 - + movl M, I sarl $3, I jle .L05 @@ -423,7 +423,7 @@ mulps %xmm2, %xmm5 addps %xmm5, %xmm1 movaps %xmm3, %xmm2 - + addl $4 * SIZE, A1 ALIGN_4 @@ -446,7 +446,7 @@ mulps %xmm2, %xmm5 addps %xmm5, %xmm1 movhlps %xmm2, %xmm2 - + addl $2 * SIZE, A1 ALIGN_4 @@ -621,7 +621,7 @@ mulps %xmm2, %xmm4 addps %xmm4, %xmm0 movaps %xmm3, %xmm2 - + addl $4 * SIZE, A1 ALIGN_4 @@ -637,7 +637,7 @@ mulps %xmm2, %xmm4 addps %xmm4, %xmm0 movhlps %xmm2, %xmm2 - + addl $2 * SIZE, A1 ALIGN_4 @@ -673,7 +673,7 @@ movss %xmm0, (Y1) ALIGN_4 - + .L999: movl M,J leal (,J,SIZE),%eax @@ -687,7 +687,7 @@ .L999x: popl %ebx popl %esi - popl %edi + popl %edi popl %ebp addl $ARGS,%esp diff --git a/kernel/x86/gemv_t_sse2.S b/kernel/x86/gemv_t_sse2.S index 75ed89a6f..b94723a8c 100644 --- a/kernel/x86/gemv_t_sse2.S +++ b/kernel/x86/gemv_t_sse2.S @@ -128,10 +128,10 @@ sall $21,J # J=2^21*sizeof(double)=buffer size(16MB) subl $4, J # Don't use last 4 double in the buffer. subl J,MMM # MMM=MMM-J - movl J,M + movl J,M jge .L00t ALIGN_4 - + movl MMM,%eax addl J,%eax jle .L999x @@ -161,7 +161,7 @@ jle .L999 movl BUFFER, Y1 - + movl M, I sarl $3, I jle .L05 @@ -391,7 +391,7 @@ mulpd %xmm2, %xmm5 addpd %xmm5, %xmm1 movapd %xmm3, %xmm2 - + addl $2 * SIZE, A1 ALIGN_4 @@ -562,7 +562,7 @@ mulpd %xmm2, %xmm4 addpd %xmm4, %xmm0 movapd %xmm3, %xmm2 - + addl $2 * SIZE, A1 ALIGN_4 @@ -594,7 +594,7 @@ movlpd %xmm0, (Y1) ALIGN_4 - + .L999: movl M,J leal (,J,SIZE),%eax @@ -608,7 +608,7 @@ .L999x: popl %ebx popl %esi - popl %edi + popl %edi popl %ebp addl $ARGS,%esp diff --git a/kernel/x86/iamax.S b/kernel/x86/iamax.S index 33204c07e..1a7378474 100644 --- a/kernel/x86/iamax.S +++ b/kernel/x86/iamax.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) @@ -101,7 +101,7 @@ FLD (X) #ifdef USE_ABS - fabs + fabs #endif addl INCX, X decl M @@ -114,7 +114,7 @@ sarl $3, I jle .L20 ALIGN_4 - + .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -122,7 +122,7 @@ FLD 0 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -132,7 +132,7 @@ FLD 1 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -142,7 +142,7 @@ FLD 2 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -152,7 +152,7 @@ FLD 3 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -162,7 +162,7 @@ FLD 4 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -172,7 +172,7 @@ FLD 5 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -182,7 +182,7 @@ FLD 6 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -192,7 +192,7 @@ FLD 7 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -216,7 +216,7 @@ .L21: FLD 0 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -235,12 +235,12 @@ sarl $3, I jle .L60 ALIGN_4 - + .L50: FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -251,7 +251,7 @@ FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -262,7 +262,7 @@ FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -273,7 +273,7 @@ FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -284,7 +284,7 @@ FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -295,7 +295,7 @@ FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -306,7 +306,7 @@ FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -317,7 +317,7 @@ FLD 0 * SIZE(X) addl INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) @@ -339,7 +339,7 @@ .L61: FLD 0 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi %st(1), %st FMOV %st(1), %st(0) diff --git a/kernel/x86/iamax_sse.S b/kernel/x86/iamax_sse.S index 3b64ebdac..dcd62bfc3 100644 --- a/kernel/x86/iamax_sse.S +++ b/kernel/x86/iamax_sse.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) @@ -54,7 +54,7 @@ #define MM %ebp #define XX %edi #define TEMP %ebx - + #ifdef USE_MIN #define maxps minps #define maxss minss @@ -163,7 +163,7 @@ sarl $4, I jle .L15 ALIGN_4 - + .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) @@ -229,7 +229,7 @@ #endif maxps %xmm4, %xmm2 addl $4 * SIZE, XX - ALIGN_3 + ALIGN_3 .L17: testl $2, MM @@ -242,7 +242,7 @@ #endif maxps %xmm4, %xmm3 addl $2 * SIZE, XX - + .L18: testl $1, MM je .L20 @@ -312,7 +312,7 @@ sarl $3, I jle .L25 ALIGN_4 - + .L23: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) @@ -418,7 +418,7 @@ incl RET comiss %xmm0, %xmm4 je .L999 - ALIGN_3 + ALIGN_3 .L26: testl $2, MM @@ -438,7 +438,7 @@ comiss %xmm0, %xmm2 je .L999 ALIGN_3 - + .L27: incl RET jmp .L999 @@ -450,7 +450,7 @@ sarl $4, I jle .L35 ALIGN_4 - + .L31: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) @@ -524,7 +524,7 @@ #endif maxps %xmm4, %xmm2 addl $4 * SIZE, XX - ALIGN_3 + ALIGN_3 .L37: testl $2, MM @@ -537,7 +537,7 @@ #endif maxps %xmm4, %xmm3 addl $2 * SIZE, XX - + .L38: testl $1, MM je .L40 @@ -569,7 +569,7 @@ sarl $3, I jle .L45 ALIGN_4 - + .L43: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) @@ -677,7 +677,7 @@ incl RET comiss %xmm0, %xmm4 je .L999 - ALIGN_3 + ALIGN_3 .L46: testl $2, MM @@ -697,7 +697,7 @@ comiss %xmm0, %xmm2 je .L999 ALIGN_3 - + .L47: incl RET jmp .L999 @@ -708,7 +708,7 @@ sarl $3, I jle .L85 ALIGN_4 - + .L81: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) @@ -808,7 +808,7 @@ andps %xmm7, %xmm4 #endif maxss %xmm4, %xmm3 - ALIGN_3 + ALIGN_3 .L86: testl $2, MM @@ -828,7 +828,7 @@ #endif maxss %xmm4, %xmm1 ALIGN_3 - + .L87: testl $1, MM je .L90 @@ -854,7 +854,7 @@ sarl $2, I jle .L96 ALIGN_4 - + .L92: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) @@ -932,7 +932,7 @@ incl RET comiss %xmm0, %xmm4 je .L999 - ALIGN_3 + ALIGN_3 .L96: testl $2, MM @@ -953,7 +953,7 @@ comiss %xmm0, %xmm2 je .L999 ALIGN_3 - + .L97: incl RET ALIGN_3 diff --git a/kernel/x86/iamax_sse2.S b/kernel/x86/iamax_sse2.S index a0ddb26dd..caa6fc87b 100644 --- a/kernel/x86/iamax_sse2.S +++ b/kernel/x86/iamax_sse2.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) @@ -54,7 +54,7 @@ #define MM %ebp #define XX %edi #define TEMP %ebx - + #ifdef USE_MIN #define maxpd minpd #define maxsd minsd @@ -137,7 +137,7 @@ sarl $4, I jle .L15 ALIGN_4 - + .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) @@ -249,7 +249,7 @@ #endif maxpd %xmm4, %xmm1 addl $4 * SIZE, XX - ALIGN_3 + ALIGN_3 .L17: testl $2, MM @@ -261,7 +261,7 @@ #endif maxpd %xmm4, %xmm2 addl $2 * SIZE, XX - + .L18: testl $1, MM je .L20 @@ -306,7 +306,7 @@ sarl $3, I jle .L25 ALIGN_4 - + .L22: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) @@ -425,7 +425,7 @@ incl RET comisd %xmm0, %xmm4 je .L999 - ALIGN_3 + ALIGN_3 .L27: testl $2, MM @@ -445,7 +445,7 @@ comisd %xmm0, %xmm2 je .L999 ALIGN_3 - + .L28: incl RET jmp .L999 @@ -584,7 +584,7 @@ #endif maxpd %xmm4, %xmm1 addl $4 * SIZE, XX - ALIGN_3 + ALIGN_3 .L57: testl $2, MM @@ -597,7 +597,7 @@ #endif maxpd %xmm4, %xmm2 addl $2 * SIZE, XX - + .L58: testl $1, MM je .L60 @@ -626,7 +626,7 @@ sarl $3, I jle .L65 ALIGN_4 - + .L62: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) @@ -704,13 +704,13 @@ movsd 4 * SIZE(XX), %xmm1 movsd 5 * SIZE(XX), %xmm2 movsd 6 * SIZE(XX), %xmm3 - + #ifdef USE_ABS andpd %xmm7, %xmm1 andpd %xmm7, %xmm2 andpd %xmm7, %xmm3 #endif - + comisd %xmm0, %xmm1 je .L999 incl RET @@ -750,7 +750,7 @@ incl RET comisd %xmm0, %xmm4 je .L999 - ALIGN_3 + ALIGN_3 .L67: testl $2, MM @@ -770,7 +770,7 @@ comisd %xmm0, %xmm2 je .L999 ALIGN_3 - + .L68: incl RET jmp .L999 @@ -781,7 +781,7 @@ sarl $4, I jle .L85 ALIGN_4 - + .L81: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) @@ -932,7 +932,7 @@ andpd %xmm7, %xmm4 #endif maxpd %xmm4, %xmm1 - ALIGN_3 + ALIGN_3 .L87: testl $2, MM @@ -947,7 +947,7 @@ #endif maxpd %xmm4, %xmm2 ALIGN_3 - + .L88: testl $1, MM je .L90 @@ -976,7 +976,7 @@ sarl $3, I jle .L95 ALIGN_4 - + .L92: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) @@ -1116,7 +1116,7 @@ incl RET comisd %xmm0, %xmm4 je .L999 - ALIGN_3 + ALIGN_3 .L97: testl $2, MM @@ -1137,7 +1137,7 @@ comisd %xmm0, %xmm2 je .L999 ALIGN_3 - + .L98: incl RET ALIGN_3 diff --git a/kernel/x86/izamax.S b/kernel/x86/izamax.S index 63bcaef14..de324ad7e 100644 --- a/kernel/x86/izamax.S +++ b/kernel/x86/izamax.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) @@ -99,9 +99,9 @@ movl $1, RET FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs faddp %st, %st(1) addl INCX, X decl M @@ -114,16 +114,16 @@ sarl $2, I jle .L20 ALIGN_4 - + .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) @@ -132,9 +132,9 @@ incl NUM FLD 2 * SIZE(X) - fabs + fabs FLD 3 * SIZE(X) - fabs + fabs faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) @@ -143,9 +143,9 @@ incl NUM FLD 4 * SIZE(X) - fabs + fabs FLD 5 * SIZE(X) - fabs + fabs faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) @@ -154,9 +154,9 @@ incl NUM FLD 6 * SIZE(X) - fabs + fabs FLD 7 * SIZE(X) - fabs + fabs faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) @@ -178,9 +178,9 @@ .L21: FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) @@ -199,12 +199,12 @@ sarl $2, I jle .L60 ALIGN_4 - + .L50: FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs addl INCX, X faddp %st, %st(1) fcomi %st(1), %st @@ -214,9 +214,9 @@ incl NUM FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs addl INCX, X faddp %st, %st(1) fcomi %st(1), %st @@ -226,9 +226,9 @@ incl NUM FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs addl INCX, X faddp %st, %st(1) fcomi %st(1), %st @@ -238,9 +238,9 @@ incl NUM FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs addl INCX, X faddp %st, %st(1) fcomi %st(1), %st @@ -262,9 +262,9 @@ .L61: FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) diff --git a/kernel/x86/izamax_sse.S b/kernel/x86/izamax_sse.S index 95223fe56..eed58be33 100644 --- a/kernel/x86/izamax_sse.S +++ b/kernel/x86/izamax_sse.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) @@ -54,12 +54,12 @@ #define MM %ebp #define XX %edi #define TEMP %ebx - + #ifdef USE_MIN #define maxps minps #define maxss minss #endif - + #ifndef HAVE_SSE2 #define pxor xorps #define movsd movlps @@ -126,7 +126,7 @@ sarl $3, I jle .L35 ALIGN_4 - + .L31: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) @@ -209,7 +209,7 @@ maxss %xmm1, %xmm0 maxss %xmm3, %xmm0 addl $4 * SIZE, XX - ALIGN_3 + ALIGN_3 .L37: testl $1, MM @@ -239,7 +239,7 @@ sarl $2, I jle .L45 ALIGN_4 - + .L41: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) @@ -338,7 +338,7 @@ incl RET comiss %xmm0, %xmm3 je .L999 - ALIGN_3 + ALIGN_3 .L47: incl RET @@ -350,7 +350,7 @@ sarl $3, I jle .L75 ALIGN_4 - + .L71: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) @@ -422,7 +422,7 @@ andps %xmm7, %xmm3 addps %xmm3, %xmm1 maxps %xmm1, %xmm0 - ALIGN_3 + ALIGN_3 .L76: testl $2, MM @@ -443,7 +443,7 @@ maxss %xmm1, %xmm0 maxss %xmm3, %xmm0 ALIGN_3 - + .L77: testl $1, MM je .L80 @@ -472,7 +472,7 @@ sarl $2, I jle .L85 ALIGN_4 - + .L81: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) @@ -580,7 +580,7 @@ incl RET comiss %xmm0, %xmm3 je .L999 - ALIGN_3 + ALIGN_3 .L87: incl RET diff --git a/kernel/x86/izamax_sse2.S b/kernel/x86/izamax_sse2.S index 0392e1d2e..d9e7a8bf0 100644 --- a/kernel/x86/izamax_sse2.S +++ b/kernel/x86/izamax_sse2.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) @@ -54,7 +54,7 @@ #define MM %ebp #define XX %edi #define TEMP %ebx - + #ifdef USE_MIN #define maxpd minpd #define maxsd minsd @@ -209,7 +209,7 @@ maxpd %xmm1, %xmm0 addl $4 * SIZE, XX - ALIGN_3 + ALIGN_3 .L27: testl $1, MM @@ -341,7 +341,7 @@ incl RET comisd %xmm0, %xmm3 je .L999 - ALIGN_3 + ALIGN_3 .L36: incl RET @@ -353,7 +353,7 @@ sarl $3, I jle .L65 ALIGN_4 - + .L61: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -462,7 +462,7 @@ andpd %xmm7, %xmm2 addpd %xmm2, %xmm1 maxpd %xmm1, %xmm0 - ALIGN_3 + ALIGN_3 .L67: testl $1, MM @@ -603,7 +603,7 @@ incl RET comisd %xmm0, %xmm3 je .L999 - ALIGN_3 + ALIGN_3 .L76: incl RET diff --git a/kernel/x86/nrm2.S b/kernel/x86/nrm2.S index c0982496a..7a14da862 100644 --- a/kernel/x86/nrm2.S +++ b/kernel/x86/nrm2.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define STACK 8 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) @@ -49,7 +49,7 @@ #define M %edx #define X %ecx #define INCX %esi - + #define I %eax #include "l1param.h" @@ -91,7 +91,7 @@ sarl $3, I jle .L20 ALIGN_4 - + .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -153,7 +153,7 @@ sarl $3, I jle .L60 ALIGN_4 - + .L50: FLD (X) addl INCX, X diff --git a/kernel/x86/nrm2_sse.S b/kernel/x86/nrm2_sse.S index e70460912..0f174c408 100644 --- a/kernel/x86/nrm2_sse.S +++ b/kernel/x86/nrm2_sse.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define STACK 8 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) @@ -49,7 +49,7 @@ #define M %edx #define X %ecx #define INCX %esi - + #define I %eax #include "l1param.h" @@ -79,7 +79,7 @@ testl $SIZE, X je .L05 - + movss -32 * SIZE(X), %xmm0 cvtss2sd %xmm0, %xmm0 mulsd %xmm0, %xmm0 @@ -93,7 +93,7 @@ movl M, I sarl $4, I jle .L13 - + movsd -32 * SIZE(X), %xmm4 movsd -30 * SIZE(X), %xmm5 movsd -28 * SIZE(X), %xmm6 @@ -267,7 +267,7 @@ sarl $3, I jle .L44 ALIGN_4 - + .L41: movss (X), %xmm4 addl INCX, X diff --git a/kernel/x86/qaxpy.S b/kernel/x86/qaxpy.S index 0497ea323..6298e4080 100644 --- a/kernel/x86/qaxpy.S +++ b/kernel/x86/qaxpy.S @@ -41,7 +41,7 @@ #define STACK 12 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_ALPHA 16 + STACK + ARGS(%esp) #define STACK_X 32 + STACK + ARGS(%esp) @@ -154,7 +154,7 @@ #ifdef HAVE_3DNOW prefetchw 24 * SIZE(Y) #endif - + addl $8 * SIZE, X addl $8 * SIZE, Y decl %eax diff --git a/kernel/x86/qdot.S b/kernel/x86/qdot.S index ce5ff29f1..21665ced9 100644 --- a/kernel/x86/qdot.S +++ b/kernel/x86/qdot.S @@ -41,7 +41,7 @@ #define STACK 12 #define ARGS 0 - + #define STACK_N 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) diff --git a/kernel/x86/qgemm_kernel_2x2.S b/kernel/x86/qgemm_kernel_2x2.S index a2852f2e1..55748b18e 100644 --- a/kernel/x86/qgemm_kernel_2x2.S +++ b/kernel/x86/qgemm_kernel_2x2.S @@ -50,7 +50,7 @@ #define PREFETCHSIZE (5 + 4 * 10) #define STACK 16 #define ARGS 16 - + #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) @@ -90,13 +90,13 @@ negl %eax movl %eax, KK #endif - + movl ARG_LDC, LDC movl ARG_B, B addl $8 * SIZE, A addl $8 * SIZE, B - + sall $BASE_SHIFT, LDC movl N, %eax @@ -109,7 +109,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl A, AO @@ -132,7 +132,7 @@ sall $BASE_SHIFT, %eax leal (AO, %eax, 2), AO leal (B, %eax, 2), BO -#endif +#endif fldz fldz @@ -152,7 +152,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -178,7 +178,7 @@ FLD -7 * SIZE(BO) fmul %st, %st(2) - + FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -196,7 +196,7 @@ FLD -5 * SIZE(BO) fmul %st, %st(2) - + FLD -5 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -216,7 +216,7 @@ FLD -3 * SIZE(BO) fmul %st, %st(2) - + FLD -3 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -234,7 +234,7 @@ FLD -1 * SIZE(BO) fmul %st, %st(2) - + FLD -1 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -270,7 +270,7 @@ FLD -7 * SIZE(BO) fmul %st, %st(2) - + FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -351,7 +351,7 @@ sall $BASE_SHIFT, %eax leal (AO, %eax, 1), AO leal ( B, %eax, 2), BO -#endif +#endif fldz fldz @@ -361,7 +361,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -500,13 +500,13 @@ .L30: movl N, %eax - testl $1, %eax + testl $1, %eax je .L999 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl A, AO @@ -528,7 +528,7 @@ sall $BASE_SHIFT, %eax leal (AO, %eax, 2), AO leal ( B, %eax, 1), BO -#endif +#endif fldz fldz @@ -544,7 +544,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -685,7 +685,7 @@ sall $BASE_SHIFT, %eax leal (AO, %eax, 1), AO leal ( B, %eax, 1), BO -#endif +#endif fldz @@ -694,7 +694,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT diff --git a/kernel/x86/qgemv_n.S b/kernel/x86/qgemv_n.S index e33bce2df..1d1ca4726 100644 --- a/kernel/x86/qgemv_n.S +++ b/kernel/x86/qgemv_n.S @@ -53,7 +53,7 @@ #define STACK 16 #define ARGS 16 - + #define PLDA_M 0 + STACK(%esp) #define XP 4 + STACK(%esp) #define MIN_N 8 + STACK(%esp) @@ -184,7 +184,7 @@ ALIGN_2 .L48: - movl A, %edx # a_offset = a + movl A, %edx # a_offset = a fldz addl $4 * SIZE, A # a += 4 fldz @@ -255,7 +255,7 @@ FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 - + FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(3) # ct2 += at1 @@ -274,7 +274,7 @@ FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 - + FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(3) # ct2 += at1 @@ -302,7 +302,7 @@ FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 - + FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(3) # ct2 += at1 diff --git a/kernel/x86/qgemv_t.S b/kernel/x86/qgemv_t.S index ff2ba80c4..f5a77fc89 100644 --- a/kernel/x86/qgemv_t.S +++ b/kernel/x86/qgemv_t.S @@ -49,7 +49,7 @@ #define STACK 16 #define ARGS 24 - + #define NLDA 0 + STACK(%esp) #define XP 4 + STACK(%esp) #define MIN_M 8 + STACK(%esp) @@ -299,7 +299,7 @@ addl $4 * SIZE, %esi #else - + #if defined(HAS_PREFETCH) prefetcht0 PRESIZE * SIZE(%ebx) prefetcht0 PRESIZE * SIZE(%ebx, %edx, 2) diff --git a/kernel/x86/qtrsm_kernel_LN_2x2.S b/kernel/x86/qtrsm_kernel_LN_2x2.S index 37c268b41..749dec447 100644 --- a/kernel/x86/qtrsm_kernel_LN_2x2.S +++ b/kernel/x86/qtrsm_kernel_LN_2x2.S @@ -50,7 +50,7 @@ #define PREFETCHSIZE (5 + 4 * 10) #define STACK 16 #define ARGS 16 - + #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) @@ -117,7 +117,7 @@ movl OFFSET, %eax negl %eax movl %eax, KK -#endif +#endif #ifdef RT movl N, %eax @@ -159,7 +159,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -186,7 +186,7 @@ leal (B, %eax, 2), BO #else movl B, BO -#endif +#endif fldz fldz @@ -409,7 +409,7 @@ leal (B, %eax, 2), BO #else movl B, BO -#endif +#endif fldz fldz @@ -446,7 +446,7 @@ FLD -7 * SIZE(BO) fmul %st, %st(2) - + FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -464,7 +464,7 @@ FLD -5 * SIZE(BO) fmul %st, %st(2) - + FLD -5 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -484,7 +484,7 @@ FLD -3 * SIZE(BO) fmul %st, %st(2) - + FLD -3 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -502,7 +502,7 @@ FLD -1 * SIZE(BO) fmul %st, %st(2) - + FLD -1 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -539,7 +539,7 @@ FLD -7 * SIZE(BO) fmul %st, %st(2) - + FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -758,7 +758,7 @@ .L30: movl N, %eax - testl $1, %eax + testl $1, %eax je .L999 #if defined(LT) || defined(RN) @@ -786,7 +786,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -813,7 +813,7 @@ leal (B, %eax, 1), BO #else movl B, BO -#endif +#endif fldz @@ -988,7 +988,7 @@ leal (B, %eax, 1), BO #else movl B, BO -#endif +#endif fldz fldz diff --git a/kernel/x86/qtrsm_kernel_LT_2x2.S b/kernel/x86/qtrsm_kernel_LT_2x2.S index 157e12d7f..10c398692 100644 --- a/kernel/x86/qtrsm_kernel_LT_2x2.S +++ b/kernel/x86/qtrsm_kernel_LT_2x2.S @@ -50,7 +50,7 @@ #define PREFETCHSIZE (5 + 4 * 10) #define STACK 16 #define ARGS 16 - + #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define AORIG 8 + STACK(%esp) @@ -115,7 +115,7 @@ movl OFFSET, %eax negl %eax movl %eax, KK -#endif +#endif #ifdef RT movl N, %eax @@ -157,7 +157,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -184,7 +184,7 @@ leal (B, %eax, 2), BO #else movl B, BO -#endif +#endif fldz fldz @@ -221,7 +221,7 @@ FLD -7 * SIZE(BO) fmul %st, %st(2) - + FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -239,7 +239,7 @@ FLD -5 * SIZE(BO) fmul %st, %st(2) - + FLD -5 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -259,7 +259,7 @@ FLD -3 * SIZE(BO) fmul %st, %st(2) - + FLD -3 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -277,7 +277,7 @@ FLD -1 * SIZE(BO) fmul %st, %st(2) - + FLD -1 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -314,7 +314,7 @@ FLD -7 * SIZE(BO) fmul %st, %st(2) - + FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -529,7 +529,7 @@ leal (B, %eax, 2), BO #else movl B, BO -#endif +#endif fldz fldz @@ -756,7 +756,7 @@ .L30: movl N, %eax - testl $1, %eax + testl $1, %eax je .L999 #if defined(LT) || defined(RN) @@ -784,7 +784,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -811,7 +811,7 @@ leal (B, %eax, 1), BO #else movl B, BO -#endif +#endif fldz fldz @@ -1044,7 +1044,7 @@ leal (B, %eax, 1), BO #else movl B, BO -#endif +#endif fldz diff --git a/kernel/x86/qtrsm_kernel_RT_2x2.S b/kernel/x86/qtrsm_kernel_RT_2x2.S index a0a4dafe3..3a000766c 100644 --- a/kernel/x86/qtrsm_kernel_RT_2x2.S +++ b/kernel/x86/qtrsm_kernel_RT_2x2.S @@ -50,7 +50,7 @@ #define PREFETCHSIZE (5 + 4 * 10) #define STACK 16 #define ARGS 16 - + #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) @@ -117,7 +117,7 @@ movl OFFSET, %eax negl %eax movl %eax, KK -#endif +#endif #ifdef RT movl N, %eax @@ -126,7 +126,7 @@ #endif movl N, %eax - testl $1, %eax + testl $1, %eax je .L30 #if defined(LT) || defined(RN) @@ -154,7 +154,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -181,7 +181,7 @@ leal (B, %eax, 1), BO #else movl B, BO -#endif +#endif fldz fldz @@ -414,7 +414,7 @@ leal (B, %eax, 1), BO #else movl B, BO -#endif +#endif fldz @@ -623,7 +623,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -650,7 +650,7 @@ leal (B, %eax, 2), BO #else movl B, BO -#endif +#endif fldz fldz @@ -687,7 +687,7 @@ FLD -7 * SIZE(BO) fmul %st, %st(2) - + FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -705,7 +705,7 @@ FLD -5 * SIZE(BO) fmul %st, %st(2) - + FLD -5 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -725,7 +725,7 @@ FLD -3 * SIZE(BO) fmul %st, %st(2) - + FLD -3 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -743,7 +743,7 @@ FLD -1 * SIZE(BO) fmul %st, %st(2) - + FLD -1 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -780,7 +780,7 @@ FLD -7 * SIZE(BO) fmul %st, %st(2) - + FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -995,7 +995,7 @@ leal (B, %eax, 2), BO #else movl B, BO -#endif +#endif fldz fldz diff --git a/kernel/x86/rot.S b/kernel/x86/rot.S index 111266a72..8448bee8c 100644 --- a/kernel/x86/rot.S +++ b/kernel/x86/rot.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define STACK 12 #define ARGS 0 - + #define STACK_N 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) @@ -110,7 +110,7 @@ sarl $2, I jle .L15 ALIGN_4 - + .L10: #ifdef PENTIUM4 PREFETCH (PREFETCH_SIZE + 0) * SIZE(X) @@ -248,7 +248,7 @@ sarl $2, I jle .L55 ALIGN_4 - + .L51: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) diff --git a/kernel/x86/rot_sse.S b/kernel/x86/rot_sse.S index af9f12f62..9495bcd2a 100644 --- a/kernel/x86/rot_sse.S +++ b/kernel/x86/rot_sse.S @@ -76,8 +76,8 @@ movl STACK_Y, Y movl STACK_INCY, INCY - leal (, INCX, SIZE), INCX - leal (, INCY, SIZE), INCY + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY movss STACK_C, C movss STACK_S, S @@ -434,7 +434,7 @@ movaps %xmm0, 12 * SIZE(X) movlps %xmm2, 12 * SIZE(Y) movhps %xmm2, 14 * SIZE(Y) - + addl $16 * SIZE, X addl $16 * SIZE, Y ALIGN_3 diff --git a/kernel/x86/rot_sse2.S b/kernel/x86/rot_sse2.S index e9c5ba1ef..83931de6c 100644 --- a/kernel/x86/rot_sse2.S +++ b/kernel/x86/rot_sse2.S @@ -76,8 +76,8 @@ movl STACK_Y, Y movl STACK_INCY, INCY - leal (, INCX, SIZE), INCX - leal (, INCY, SIZE), INCY + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY movsd STACK_C, C movsd STACK_S, S diff --git a/kernel/x86/scal_sse.S b/kernel/x86/scal_sse.S index 48edfc585..53eba7764 100644 --- a/kernel/x86/scal_sse.S +++ b/kernel/x86/scal_sse.S @@ -280,7 +280,7 @@ movaps %xmm0, %xmm4 mulps -20 * SIZE(X), %xmm4 - decl I + decl I jle .L112 ALIGN_4 @@ -353,13 +353,13 @@ movaps %xmm4, -4 * SIZE(X) #else - + movaps -32 * SIZE(X), %xmm1 movaps -28 * SIZE(X), %xmm2 movaps -24 * SIZE(X), %xmm3 movaps -20 * SIZE(X), %xmm4 - decl I + decl I jle .L112 ALIGN_4 diff --git a/kernel/x86/scal_sse2.S b/kernel/x86/scal_sse2.S index 35b79132c..a278ecbda 100644 --- a/kernel/x86/scal_sse2.S +++ b/kernel/x86/scal_sse2.S @@ -77,7 +77,7 @@ comisd %xmm0, %xmm1 jne .L100 # Alpha != ZERO jp .L100 # For Alpha = NaN - + /* Alpha == ZERO */ cmpl $SIZE, INCX jne .L50 @@ -264,7 +264,7 @@ movaps %xmm0, %xmm4 mulpd -10 * SIZE(X), %xmm4 - decl I + decl I jle .L112 ALIGN_4 @@ -342,7 +342,7 @@ movaps -12 * SIZE(X), %xmm3 movaps -10 * SIZE(X), %xmm4 - decl I + decl I jle .L112 ALIGN_4 diff --git a/kernel/x86/swap.S b/kernel/x86/swap.S index d32c1a3c8..54b00b33e 100644 --- a/kernel/x86/swap.S +++ b/kernel/x86/swap.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define N 4 + STACK + ARGS(%esp) #ifdef XDOUBLE #define X 32 + STACK + ARGS(%esp) diff --git a/kernel/x86/swap_sse.S b/kernel/x86/swap_sse.S index 39c0d2f0b..e6cd4ada7 100644 --- a/kernel/x86/swap_sse.S +++ b/kernel/x86/swap_sse.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 20 + STACK + ARGS(%esp) #define STACK_INCX 24 + STACK + ARGS(%esp) @@ -53,7 +53,7 @@ #define Y %edi #define INCX %ebx #define INCY %ecx - + #include "l1param.h" PROLOGUE @@ -80,7 +80,7 @@ subl $-32 * SIZE, X subl $-32 * SIZE, Y - + cmpl $3, M jle .L16 @@ -302,7 +302,7 @@ .L20: movaps -33 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 - + movss %xmm1, -32 * SIZE(X) PSHUFD2($0x39, %xmm1, %xmm3) movlps %xmm3, -31 * SIZE(X) @@ -778,7 +778,7 @@ .L40: movaps -35 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 - + movss %xmm1, -32 * SIZE(X) subl $3, M diff --git a/kernel/x86/swap_sse2.S b/kernel/x86/swap_sse2.S index b8808125f..9a3576c17 100644 --- a/kernel/x86/swap_sse2.S +++ b/kernel/x86/swap_sse2.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 24 + STACK + ARGS(%esp) #define STACK_INCX 28 + STACK + ARGS(%esp) @@ -96,7 +96,7 @@ .L10: subl $-16 * SIZE, X subl $-16 * SIZE, Y - + testl $SIZE, X jne .L20 diff --git a/kernel/x86/trsm_kernel_LN_2x2.S b/kernel/x86/trsm_kernel_LN_2x2.S index d1c741b09..587739ca4 100644 --- a/kernel/x86/trsm_kernel_LN_2x2.S +++ b/kernel/x86/trsm_kernel_LN_2x2.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) @@ -72,7 +72,7 @@ #else #define REP rep #endif - + #define AA %edx #define BB %ecx @@ -112,7 +112,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -121,7 +121,7 @@ #endif movl N, %eax # j = (n >> 1) # MEMORY - sarl $1, %eax + sarl $1, %eax movl %eax, J # j = (n >> 1) # MEMORY je .L8 ALIGN_4 @@ -153,7 +153,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -178,7 +178,7 @@ leal (%ebx, %eax, 2), BB #else movl %ebx, BB -#endif +#endif fldz fldz @@ -371,7 +371,7 @@ leal (%ebx, %eax, 2), BB #else movl %ebx, BB -#endif +#endif fldz fldz @@ -741,7 +741,7 @@ movl N, %eax # n # MEMORY andl $1, %eax je .End - + #if defined(LT) || defined(RN) movl A, AA #else @@ -767,7 +767,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -792,7 +792,7 @@ leal (%ebx, %eax, 1), BB #else movl %ebx, BB -#endif +#endif fldz @@ -905,7 +905,7 @@ sarl $1, %esi # m >> 1 je .L99 ALIGN_4 - + .L46: #ifdef LN movl K, %eax @@ -921,7 +921,7 @@ leal (%ebx, %eax, 1), BB #else movl %ebx, BB -#endif +#endif fldz fldz diff --git a/kernel/x86/trsm_kernel_LN_2x2_atom.S b/kernel/x86/trsm_kernel_LN_2x2_atom.S index 846a84858..7624fde8c 100644 --- a/kernel/x86/trsm_kernel_LN_2x2_atom.S +++ b/kernel/x86/trsm_kernel_LN_2x2_atom.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -83,7 +83,7 @@ movl OFFSET, %eax #ifdef RN negl %eax -#endif +#endif movl %eax, KK leal (, LDC, SIZE), LDC @@ -146,7 +146,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -175,7 +175,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm2, %xmm2 @@ -395,7 +395,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm2, %xmm2 @@ -724,7 +724,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -753,7 +753,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm2, %xmm2 @@ -922,7 +922,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movsd 0 * SIZE(BB), %xmm1 xorps %xmm0, %xmm0 diff --git a/kernel/x86/trsm_kernel_LN_2x4_penryn.S b/kernel/x86/trsm_kernel_LN_2x4_penryn.S index 16ba9a0e3..0b475afa2 100644 --- a/kernel/x86/trsm_kernel_LN_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_2x4_penryn.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -95,7 +95,7 @@ movl OFFSET, %eax #ifdef RN negl %eax -#endif +#endif movl %eax, KK leal (, LDC, SIZE), LDC @@ -161,7 +161,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -190,7 +190,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movsd -16 * SIZE(AA), %xmm0 movhps -15 * SIZE(AA), %xmm0 @@ -495,7 +495,7 @@ #endif ALIGN_4 -.L20: +.L20: movl M, %ebx sarl $1, %ebx jle .L29 @@ -521,7 +521,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif leal (CO1, LDC, 2), %eax @@ -1006,7 +1006,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -1035,7 +1035,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movsd -16 * SIZE(AA), %xmm0 movhps -15 * SIZE(AA), %xmm0 @@ -1278,7 +1278,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1611,7 +1611,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -1640,7 +1640,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movsd -16 * SIZE(AA), %xmm0 movhps -15 * SIZE(AA), %xmm0 @@ -1827,7 +1827,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 diff --git a/kernel/x86/trsm_kernel_LN_2x4_sse2.S b/kernel/x86/trsm_kernel_LN_2x4_sse2.S index b1dea62a7..803808152 100644 --- a/kernel/x86/trsm_kernel_LN_2x4_sse2.S +++ b/kernel/x86/trsm_kernel_LN_2x4_sse2.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define OLD_M 4 + STACK + ARGS(%esi) #define OLD_N 8 + STACK + ARGS(%esi) #define OLD_K 12 + STACK + ARGS(%esi) @@ -204,7 +204,7 @@ PROFCODE EMMS - + movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp @@ -256,7 +256,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -275,7 +275,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, BB @@ -291,7 +291,7 @@ leal (, %eax, SIZE), %eax leal (B, %eax, 4), B leal (BB, %eax, 8), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -307,7 +307,7 @@ sarl $1, %eax jle .L05 ALIGN_4 - + .L02: #define COPYPREFETCH 40 @@ -373,7 +373,7 @@ addl $4 * SIZE, B ALIGN_4 - + .L10: #if defined(LT) || defined(RN) movl A, AA @@ -414,7 +414,7 @@ movl KK, %eax sall $3 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -787,7 +787,7 @@ movl KK, %eax sall $3 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -824,7 +824,7 @@ andl $-8, %eax sall $4, %eax je .L15 -.L1X: +.L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ -1266,7 +1266,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, BB @@ -1282,7 +1282,7 @@ leal (, %eax, SIZE), %eax leal (B, %eax, 2), B leal (BB, %eax, 4), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -1298,7 +1298,7 @@ sarl $2, %eax jle .L35 ALIGN_4 - + .L32: #define COPYPREFETCH 40 @@ -1363,7 +1363,7 @@ decl %eax jne .L36 ALIGN_4 - + .L40: #if defined(LT) || defined(RN) movl A, AA @@ -1404,7 +1404,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -1665,7 +1665,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -1987,7 +1987,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, BB @@ -2003,7 +2003,7 @@ leal (, %eax, SIZE), %eax leal (B, %eax, 1), B leal (BB, %eax, 2), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -2019,7 +2019,7 @@ sarl $3, %eax jle .L65 ALIGN_4 - + .L62: #define COPYPREFETCH 40 @@ -2081,7 +2081,7 @@ decl %eax jne .L66 ALIGN_4 - + .L70: #if defined(LT) || defined(RN) movl A, AA @@ -2120,7 +2120,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -2326,7 +2326,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 diff --git a/kernel/x86/trsm_kernel_LN_2x4_sse3.S b/kernel/x86/trsm_kernel_LN_2x4_sse3.S index 5ab4ab3db..5b4c19ba0 100644 --- a/kernel/x86/trsm_kernel_LN_2x4_sse3.S +++ b/kernel/x86/trsm_kernel_LN_2x4_sse3.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -95,7 +95,7 @@ movl OFFSET, %eax #ifdef RN negl %eax -#endif +#endif movl %eax, KK leal (, LDC, SIZE), LDC @@ -158,7 +158,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -187,7 +187,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -534,7 +534,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -982,7 +982,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -1011,7 +1011,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1255,7 +1255,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1565,7 +1565,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -1594,7 +1594,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movsd 0 * SIZE(AA), %xmm0 movhpd 1 * SIZE(AA), %xmm0 @@ -1797,7 +1797,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 diff --git a/kernel/x86/trsm_kernel_LN_4x2_core2.S b/kernel/x86/trsm_kernel_LN_4x2_core2.S index d974fa659..94942b69f 100644 --- a/kernel/x86/trsm_kernel_LN_4x2_core2.S +++ b/kernel/x86/trsm_kernel_LN_4x2_core2.S @@ -45,7 +45,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -55,7 +55,7 @@ #define STACK_C 32 + STACK + ARGS(%esi) #define STACK_LDC 36 + STACK + ARGS(%esi) #define STACK_OFFT 40 + STACK + ARGS(%esi) - + #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) @@ -141,7 +141,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -154,14 +154,14 @@ movl %eax, J jle .L100 ALIGN_2 - + .L01: /* Copying to Sub Buffer */ #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal 16 * SIZE + BUFFER, BB @@ -177,7 +177,7 @@ leal (, %eax, SIZE), %eax leal (B, %eax, 2), B leal (BB, %eax, 4), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -193,7 +193,7 @@ sarl $2, %eax jle .L03 ALIGN_2 - + .L02: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 @@ -243,7 +243,7 @@ decl %eax jne .L04 ALIGN_4 - + .L05: #if defined(LT) || defined(RN) movl A, AA @@ -285,7 +285,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movsd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -305,7 +305,7 @@ sarl $3, %eax je .L52 -.L51: +.L51: mulsd %xmm0, %xmm1 mulsd -14 * SIZE(BB), %xmm0 addsd %xmm1, %xmm4 @@ -517,7 +517,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -537,7 +537,7 @@ sarl $3, %eax je .L32 -.L31: +.L31: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BB), %xmm0 addpd %xmm1, %xmm4 @@ -789,7 +789,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1205,7 +1205,7 @@ decl %ebx # i -- jg .L10 - ALIGN_2 + ALIGN_2 .L99: #ifdef LN @@ -1238,13 +1238,13 @@ testl $1, %eax jle .L999 ALIGN_2 - + .L101: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal 16 * SIZE + BUFFER, BB @@ -1260,7 +1260,7 @@ leal (, %eax, SIZE), %eax leal (B, %eax, 1), B leal (BB, %eax, 2), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -1276,7 +1276,7 @@ sarl $3, %eax jle .L103 ALIGN_4 - + .L102: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 @@ -1324,7 +1324,7 @@ decl %eax jne .L104 ALIGN_4 - + .L105: #if defined(LT) || defined(RN) movl A, AA @@ -1364,7 +1364,7 @@ movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movsd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1382,7 +1382,7 @@ sarl $3, %eax je .L152 -.L151: +.L151: mulsd %xmm0, %xmm1 movsd -15 * SIZE(AA), %xmm0 addsd %xmm1, %xmm4 @@ -1549,7 +1549,7 @@ movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1567,7 +1567,7 @@ sarl $3, %eax je .L132 -.L131: +.L131: mulpd %xmm0, %xmm1 movapd -14 * SIZE(AA), %xmm0 addpd %xmm1, %xmm4 @@ -1713,7 +1713,7 @@ movddup %xmm0, %xmm1 unpckhpd %xmm0, %xmm0 - + movapd %xmm1, -16 * SIZE(BB) movapd %xmm0, -14 * SIZE(BB) #else @@ -1773,7 +1773,7 @@ movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1793,7 +1793,7 @@ sarl $3, %eax je .L112 -.L111: +.L111: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AA), %xmm1 addpd %xmm0, %xmm4 @@ -2059,7 +2059,7 @@ BRANCH decl %ebx # i -- jg .L110 - ALIGN_2 + ALIGN_2 .L159: #ifdef LN diff --git a/kernel/x86/trsm_kernel_LN_4x2_sse2.S b/kernel/x86/trsm_kernel_LN_4x2_sse2.S index a1fb8a199..12625cce4 100644 --- a/kernel/x86/trsm_kernel_LN_4x2_sse2.S +++ b/kernel/x86/trsm_kernel_LN_4x2_sse2.S @@ -45,7 +45,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -55,7 +55,7 @@ #define STACK_C 32 + STACK + ARGS(%esi) #define STACK_LDC 36 + STACK + ARGS(%esi) #define STACK_OFFT 40 + STACK + ARGS(%esi) - + #define ALPHA 0(%esp) #define K 16(%esp) #define N 20(%esp) @@ -257,7 +257,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -270,14 +270,14 @@ movl %eax, J jle .L100 ALIGN_2 - + .L01: /* Copying to Sub Buffer */ #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, %ecx @@ -293,7 +293,7 @@ leal (, %eax, SIZE), %eax leal (B, %eax, 2), B leal (BB, %eax, 4), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -309,7 +309,7 @@ sarl $2, %eax jle .L03 ALIGN_2 - + .L02: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 @@ -373,7 +373,7 @@ decl %eax jne .L04 ALIGN_4 - + .L05: #if defined(LT) || defined(RN) movl A, AA @@ -415,7 +415,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movsd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 @@ -435,7 +435,7 @@ sarl $3, %eax je .L52 -.L51: +.L51: mulsd %xmm0, %xmm2 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 @@ -648,7 +648,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 @@ -668,7 +668,7 @@ sarl $3, %eax je .L32 -.L31: +.L31: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 @@ -938,7 +938,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 @@ -956,7 +956,7 @@ prefetcht2 4 * SIZE(%esi) prefetcht2 4 * SIZE(%esi, LDC) #endif - + #if defined(LT) || defined(RN) movl KK, %eax #else @@ -969,7 +969,7 @@ NOBRANCH je .L12 sall $3, %eax - + .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) @@ -1062,7 +1062,7 @@ subl $64 * 8, %eax BRANCH jg .L1X - + .L11: leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB @@ -1071,7 +1071,7 @@ sarl $3, %eax je .L12 -.L11: +.L11: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) @@ -1117,7 +1117,7 @@ addl $4 * SIZE, BB # boffset1 += 8 subl $1, %eax jg .L13 - ALIGN_4 + ALIGN_4 .L14: #if defined(LN) || defined(RT) @@ -1382,7 +1382,7 @@ decl %ebx # i -- jg .L10 - ALIGN_2 + ALIGN_2 .L99: #ifdef LN @@ -1415,14 +1415,14 @@ testl $1, %eax jle .L999 ALIGN_2 - + .L101: /* Copying to Sub Buffer */ #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, %ecx @@ -1438,7 +1438,7 @@ leal (, %eax, SIZE), %eax leal (B, %eax, 1), B leal (BB, %eax, 2), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -1454,7 +1454,7 @@ sarl $3, %eax jle .L103 ALIGN_4 - + .L102: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 @@ -1516,7 +1516,7 @@ decl %eax jne .L104 ALIGN_4 - + .L105: #if defined(LT) || defined(RN) movl A, AA @@ -1571,7 +1571,7 @@ movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif #if defined(LT) || defined(RN) movl KK, %eax @@ -1582,7 +1582,7 @@ sarl $3, %eax je .L152 -.L151: +.L151: mulsd %xmm0, %xmm2 movsd 1 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 @@ -1752,7 +1752,7 @@ movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif #if defined(LT) || defined(RN) movl KK, %eax @@ -1763,7 +1763,7 @@ sarl $3, %eax je .L132 -.L131: +.L131: mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 @@ -1965,7 +1965,7 @@ movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 @@ -1985,7 +1985,7 @@ sarl $3, %eax je .L112 -.L111: +.L111: mulpd %xmm2, %xmm0 mulpd 2 * SIZE(AA), %xmm2 addpd %xmm0, %xmm4 @@ -2252,7 +2252,7 @@ BRANCH decl %ebx # i -- jg .L110 - ALIGN_2 + ALIGN_2 .L159: #ifdef LN diff --git a/kernel/x86/trsm_kernel_LN_4x4_penryn.S b/kernel/x86/trsm_kernel_LN_4x4_penryn.S index 03f8e3d79..e98854f34 100644 --- a/kernel/x86/trsm_kernel_LN_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_4x4_penryn.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -100,7 +100,7 @@ movl OFFSET, %eax #ifdef RN negl %eax -#endif +#endif movl %eax, KK leal (, LDC, SIZE), LDC @@ -165,7 +165,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -193,7 +193,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif pxor %xmm4, %xmm4 movsd -32 * SIZE(AA), %xmm0 @@ -499,7 +499,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif pxor %xmm4, %xmm4 movaps -32 * SIZE(AA), %xmm0 @@ -880,7 +880,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif leal (CO1, LDC, 2), %eax @@ -1451,7 +1451,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -1479,7 +1479,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif pxor %xmm4, %xmm4 movsd -32 * SIZE(AA), %xmm0 @@ -1711,7 +1711,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm3, %xmm3 @@ -1978,7 +1978,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 @@ -2382,7 +2382,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -2410,7 +2410,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB -#endif +#endif pxor %xmm4, %xmm4 movsd -32 * SIZE(AA), %xmm0 @@ -2575,7 +2575,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movsd -32 * SIZE(AA), %xmm0 pxor %xmm3, %xmm3 @@ -2806,7 +2806,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 diff --git a/kernel/x86/trsm_kernel_LN_4x4_sse.S b/kernel/x86/trsm_kernel_LN_4x4_sse.S index 5259e11df..95bfb8e3b 100644 --- a/kernel/x86/trsm_kernel_LN_4x4_sse.S +++ b/kernel/x86/trsm_kernel_LN_4x4_sse.S @@ -40,7 +40,7 @@ #include "common.h" #define STACK 16 - + #define OLD_M 4 + STACK(%esi) #define OLD_N 8 + STACK(%esi) #define OLD_K 12 + STACK(%esi) @@ -268,7 +268,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -286,7 +286,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, %ecx @@ -302,7 +302,7 @@ sall $2 + BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -318,7 +318,7 @@ sarl $1, %eax jle .L05 ALIGN_4 - + .L02: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 @@ -373,7 +373,7 @@ addl $4 * SIZE, B ALIGN_4 - + .L10: #if defined(LT) || defined(RN) movl A, AA @@ -413,7 +413,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif movss 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 @@ -803,7 +803,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif #ifdef movsd xorps %xmm0, %xmm0 @@ -1257,7 +1257,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif movaps 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 @@ -1693,7 +1693,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, %ecx @@ -1709,7 +1709,7 @@ sall $1 + BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -1725,7 +1725,7 @@ sarl $2, %eax jle .L45 ALIGN_4 - + .L42: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 @@ -1784,7 +1784,7 @@ decl %eax jne .L46 ALIGN_4 - + .L50: #if defined(LT) || defined(RN) movl A, AA @@ -1824,7 +1824,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -2080,7 +2080,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -2402,7 +2402,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -2802,7 +2802,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, %ecx @@ -2818,7 +2818,7 @@ sall $BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -2834,7 +2834,7 @@ sarl $3, %eax jle .L85 ALIGN_4 - + .L82: movsd 0 * SIZE(B), %xmm3 movhps 2 * SIZE(B), %xmm3 @@ -2890,7 +2890,7 @@ decl %eax jne .L86 ALIGN_4 - + .L90: #if defined(LT) || defined(RN) movl A, AA @@ -2928,7 +2928,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -3118,7 +3118,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -3363,7 +3363,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 diff --git a/kernel/x86/trsm_kernel_LN_8x2_sse.S b/kernel/x86/trsm_kernel_LN_8x2_sse.S index 16a2c2f5b..12b09e142 100644 --- a/kernel/x86/trsm_kernel_LN_8x2_sse.S +++ b/kernel/x86/trsm_kernel_LN_8x2_sse.S @@ -45,7 +45,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -153,7 +153,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -173,13 +173,13 @@ movl %eax, J jle .L100 ALIGN_2 - + .L01: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, BB @@ -195,7 +195,7 @@ sall $1 + BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -211,7 +211,7 @@ sarl $2, %eax jle .L03 ALIGN_4 - + .L02: movsd 0 * SIZE(B), %xmm3 movhps 2 * SIZE(B), %xmm3 @@ -337,7 +337,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif movss 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 @@ -576,7 +576,7 @@ sall $BASE_SHIFT, %eax addl %eax, AORIG #endif - ALIGN_2 + ALIGN_2 .L30: testl $2, M @@ -601,7 +601,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 @@ -801,7 +801,7 @@ #ifdef RN movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm0 movss 1 * SIZE(B), %xmm6 @@ -813,14 +813,14 @@ movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm2 #endif #ifdef RT movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm2 movss 2 * SIZE(B), %xmm6 @@ -833,7 +833,7 @@ movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm0 #endif @@ -911,7 +911,7 @@ sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif - ALIGN_2 + ALIGN_2 .L50: testl $4, M @@ -936,7 +936,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 @@ -1186,7 +1186,7 @@ #ifdef RN movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm0 movss 1 * SIZE(B), %xmm6 @@ -1198,14 +1198,14 @@ movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm2 #endif #ifdef RT movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm2 movss 2 * SIZE(B), %xmm6 @@ -1218,7 +1218,7 @@ movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm0 #endif @@ -1323,7 +1323,7 @@ sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif - ALIGN_2 + ALIGN_2 .L70: movl M, %ebx @@ -1351,7 +1351,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 @@ -1874,7 +1874,7 @@ #ifdef RN movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm0 mulps %xmm6, %xmm1 @@ -1890,7 +1890,7 @@ movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm2 mulps %xmm6, %xmm3 #endif @@ -1898,7 +1898,7 @@ #ifdef RT movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm2 mulps %xmm6, %xmm3 @@ -1914,7 +1914,7 @@ movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm0 mulps %xmm6, %xmm1 #endif @@ -2079,7 +2079,7 @@ decl %ebx # i -- jg .L10 - ALIGN_2 + ALIGN_2 .L99: #ifdef LN @@ -2110,12 +2110,12 @@ .L100: testl $1, N jle .L999 - + #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, BB @@ -2131,7 +2131,7 @@ sall $BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -2147,7 +2147,7 @@ sarl $3, %eax jle .L103 ALIGN_4 - + .L102: movsd 0 * SIZE(B), %xmm3 movhps 2 * SIZE(B), %xmm3 @@ -2262,7 +2262,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif movss 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 @@ -2447,7 +2447,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 @@ -2667,7 +2667,7 @@ sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif - ALIGN_2 + ALIGN_2 .L150: testl $4, M @@ -2692,7 +2692,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 @@ -2996,7 +2996,7 @@ sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif - ALIGN_2 + ALIGN_2 .L170: movl M, %ebx @@ -3024,7 +3024,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 @@ -3462,7 +3462,7 @@ #if defined(RN) || defined(RT) movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm0 mulps %xmm6, %xmm1 #endif @@ -3470,7 +3470,7 @@ #if defined(LN) || defined(LT) shufps $0x88, %xmm3, %xmm2 shufps $0x88, %xmm7, %xmm5 - + movlps %xmm2, 0 * SIZE(B) movhps %xmm2, 2 * SIZE(B) movlps %xmm5, 4 * SIZE(B) @@ -3570,7 +3570,7 @@ decl %ebx # i -- jg .L110 - ALIGN_2 + ALIGN_2 .L179: #ifdef LN diff --git a/kernel/x86/trsm_kernel_LT_1x4.S b/kernel/x86/trsm_kernel_LT_1x4.S index 5670746ec..5210f8575 100644 --- a/kernel/x86/trsm_kernel_LT_1x4.S +++ b/kernel/x86/trsm_kernel_LT_1x4.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 32 - + #define J 0 + STACK(%esp) #define I 4 + STACK(%esp) #define KK 8 + STACK(%esp) @@ -111,7 +111,7 @@ movl OFFSET, %eax negl %eax movl %eax, KK -#endif +#endif #ifdef RT movl N, %eax @@ -167,7 +167,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -186,7 +186,7 @@ jle .L13 ALIGN_4 -.L12: +.L12: movl -16 * SIZE(B), %esi movl -8 * SIZE(B), %esi movl 0 * SIZE(B), %esi @@ -220,7 +220,7 @@ leal (B_ORIG, %eax, 4), B #else movl B_ORIG, B -#endif +#endif leal (%edi, LDC, 2), %eax @@ -679,7 +679,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -698,7 +698,7 @@ jle .L23 ALIGN_4 -.L22: +.L22: movl -16 * SIZE(B), %esi movl -8 * SIZE(B), %esi movl 0 * SIZE(B), %esi @@ -728,7 +728,7 @@ leal (B_ORIG, %eax, 2), B #else movl B_ORIG, B -#endif +#endif fldz fldz @@ -1022,7 +1022,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -1041,7 +1041,7 @@ jle .L33 ALIGN_4 -.L32: +.L32: movl -16 * SIZE(B), %esi movl -8 * SIZE(B), %esi movl 0 * SIZE(B), %esi @@ -1071,7 +1071,7 @@ leal (B_ORIG, %eax, 1), B #else movl B_ORIG, B -#endif +#endif fldz fldz diff --git a/kernel/x86/trsm_kernel_LT_2x2.S b/kernel/x86/trsm_kernel_LT_2x2.S index d21909d66..ff29a3b2c 100644 --- a/kernel/x86/trsm_kernel_LT_2x2.S +++ b/kernel/x86/trsm_kernel_LT_2x2.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) @@ -72,7 +72,7 @@ #else #define REP rep #endif - + #define AA %edx #define BB %ecx @@ -114,7 +114,7 @@ movl OFFSET, %eax negl %eax movl %eax, KK -#endif +#endif #ifdef RT movl N, %eax @@ -123,7 +123,7 @@ #endif movl N, %eax # j = (n >> 1) # MEMORY - sarl $1, %eax + sarl $1, %eax movl %eax, J # j = (n >> 1) # MEMORY je .L8 ALIGN_4 @@ -155,7 +155,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -182,7 +182,7 @@ leal (%ebx, %eax, 2), BB #else movl %ebx, BB -#endif +#endif fldz fldz @@ -535,7 +535,7 @@ leal (%ebx, %eax, 2), BB #else movl %ebx, BB -#endif +#endif fldz fldz @@ -733,7 +733,7 @@ movl N, %eax # n # MEMORY andl $1, %eax je .End - + #if defined(LT) || defined(RN) movl A, AA #else @@ -759,7 +759,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -770,7 +770,7 @@ sarl $1, %esi # m >> 1 je .L36 ALIGN_4 - + .L46: #ifdef LN movl K, %eax @@ -786,7 +786,7 @@ leal (%ebx, %eax, 1), BB #else movl %ebx, BB -#endif +#endif fldz fldz @@ -981,7 +981,7 @@ leal (%ebx, %eax, 1), BB #else movl %ebx, BB -#endif +#endif fldz diff --git a/kernel/x86/trsm_kernel_LT_2x2_atom.S b/kernel/x86/trsm_kernel_LT_2x2_atom.S index 383500531..139e41291 100644 --- a/kernel/x86/trsm_kernel_LT_2x2_atom.S +++ b/kernel/x86/trsm_kernel_LT_2x2_atom.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -83,7 +83,7 @@ movl OFFSET, %eax #ifdef RN negl %eax -#endif +#endif movl %eax, KK leal (, LDC, SIZE), LDC @@ -146,7 +146,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -178,7 +178,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm2, %xmm2 @@ -478,7 +478,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm2, %xmm2 @@ -724,7 +724,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -756,7 +756,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movsd 0 * SIZE(BB), %xmm1 xorps %xmm0, %xmm0 @@ -971,7 +971,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm2, %xmm2 diff --git a/kernel/x86/trsm_kernel_LT_2x4_penryn.S b/kernel/x86/trsm_kernel_LT_2x4_penryn.S index 65a6cf091..086852cfc 100644 --- a/kernel/x86/trsm_kernel_LT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_2x4_penryn.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -95,7 +95,7 @@ movl OFFSET, %eax #ifdef RN negl %eax -#endif +#endif movl %eax, KK leal (, LDC, SIZE), LDC @@ -161,7 +161,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -193,7 +193,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif leal (CO1, LDC, 2), %eax @@ -647,7 +647,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1005,7 +1005,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -1037,7 +1037,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1344,7 +1344,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1608,7 +1608,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -1640,7 +1640,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1881,7 +1881,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 diff --git a/kernel/x86/trsm_kernel_LT_2x4_sse2.S b/kernel/x86/trsm_kernel_LT_2x4_sse2.S index ba03221c2..01ff86c91 100644 --- a/kernel/x86/trsm_kernel_LT_2x4_sse2.S +++ b/kernel/x86/trsm_kernel_LT_2x4_sse2.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define OLD_M 4 + STACK + ARGS(%esi) #define OLD_N 8 + STACK + ARGS(%esi) #define OLD_K 12 + STACK + ARGS(%esi) @@ -256,7 +256,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -275,7 +275,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, BB @@ -291,7 +291,7 @@ leal (, %eax, SIZE), %eax leal (B, %eax, 4), B leal (BB, %eax, 8), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -307,7 +307,7 @@ sarl $1, %eax jle .L05 ALIGN_4 - + .L02: #define COPYPREFETCH 40 @@ -373,7 +373,7 @@ addl $4 * SIZE, B ALIGN_4 - + .L10: #if defined(LT) || defined(RN) movl A, AA @@ -417,7 +417,7 @@ movl KK, %eax sall $3 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -454,7 +454,7 @@ andl $-8, %eax sall $4, %eax je .L15 -.L1X: +.L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ -885,7 +885,7 @@ movl KK, %eax sall $3 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -1266,7 +1266,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, BB @@ -1282,7 +1282,7 @@ leal (, %eax, SIZE), %eax leal (B, %eax, 2), B leal (BB, %eax, 4), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -1298,7 +1298,7 @@ sarl $2, %eax jle .L35 ALIGN_4 - + .L32: #define COPYPREFETCH 40 @@ -1363,7 +1363,7 @@ decl %eax jne .L36 ALIGN_4 - + .L40: #if defined(LT) || defined(RN) movl A, AA @@ -1407,7 +1407,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -1721,7 +1721,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -1987,7 +1987,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, BB @@ -2003,7 +2003,7 @@ leal (, %eax, SIZE), %eax leal (B, %eax, 1), B leal (BB, %eax, 2), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -2019,7 +2019,7 @@ sarl $3, %eax jle .L65 ALIGN_4 - + .L62: #define COPYPREFETCH 40 @@ -2081,7 +2081,7 @@ decl %eax jne .L66 ALIGN_4 - + .L70: #if defined(LT) || defined(RN) movl A, AA @@ -2123,7 +2123,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -2369,7 +2369,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -2573,7 +2573,7 @@ .L999: movl OLD_STACK, %esp EMMS - + popl %ebx popl %esi popl %edi diff --git a/kernel/x86/trsm_kernel_LT_2x4_sse3.S b/kernel/x86/trsm_kernel_LT_2x4_sse3.S index 487f05922..b27616683 100644 --- a/kernel/x86/trsm_kernel_LT_2x4_sse3.S +++ b/kernel/x86/trsm_kernel_LT_2x4_sse3.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -95,7 +95,7 @@ movl OFFSET, %eax #ifdef RN negl %eax -#endif +#endif movl %eax, KK leal (, LDC, SIZE), LDC @@ -158,7 +158,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -190,7 +190,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -607,7 +607,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -982,7 +982,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -1014,7 +1014,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1298,7 +1298,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1565,7 +1565,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -1597,7 +1597,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1823,7 +1823,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movsd 0 * SIZE(AA), %xmm0 movhpd 1 * SIZE(AA), %xmm0 diff --git a/kernel/x86/trsm_kernel_LT_4x2_core2.S b/kernel/x86/trsm_kernel_LT_4x2_core2.S index dba627f00..1c08745d5 100644 --- a/kernel/x86/trsm_kernel_LT_4x2_core2.S +++ b/kernel/x86/trsm_kernel_LT_4x2_core2.S @@ -45,7 +45,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -55,7 +55,7 @@ #define STACK_C 32 + STACK + ARGS(%esi) #define STACK_LDC 36 + STACK + ARGS(%esi) #define STACK_OFFT 40 + STACK + ARGS(%esi) - + #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) @@ -141,7 +141,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -154,14 +154,14 @@ movl %eax, J jle .L100 ALIGN_2 - + .L01: /* Copying to Sub Buffer */ #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal 16 * SIZE + BUFFER, BB @@ -177,7 +177,7 @@ leal (, %eax, SIZE), %eax leal (B, %eax, 2), B leal (BB, %eax, 4), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -193,7 +193,7 @@ sarl $2, %eax jle .L03 ALIGN_2 - + .L02: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 @@ -243,7 +243,7 @@ decl %eax jne .L04 ALIGN_4 - + .L05: #if defined(LT) || defined(RN) movl A, AA @@ -287,7 +287,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -703,7 +703,7 @@ decl %ebx # i -- jg .L10 - ALIGN_2 + ALIGN_2 .L30: movl M, %ebx @@ -729,7 +729,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -749,7 +749,7 @@ sarl $3, %eax je .L32 -.L31: +.L31: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BB), %xmm0 addpd %xmm1, %xmm4 @@ -999,7 +999,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movsd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1019,7 +1019,7 @@ sarl $3, %eax je .L52 -.L51: +.L51: mulsd %xmm0, %xmm1 mulsd -14 * SIZE(BB), %xmm0 addsd %xmm1, %xmm4 @@ -1238,13 +1238,13 @@ testl $1, %eax jle .L999 ALIGN_2 - + .L101: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal 16 * SIZE + BUFFER, BB @@ -1260,7 +1260,7 @@ leal (, %eax, SIZE), %eax leal (B, %eax, 1), B leal (BB, %eax, 2), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -1276,7 +1276,7 @@ sarl $3, %eax jle .L103 ALIGN_4 - + .L102: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 @@ -1324,7 +1324,7 @@ decl %eax jne .L104 ALIGN_4 - + .L105: #if defined(LT) || defined(RN) movl A, AA @@ -1366,7 +1366,7 @@ movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1386,7 +1386,7 @@ sarl $3, %eax je .L112 -.L111: +.L111: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AA), %xmm1 addpd %xmm0, %xmm4 @@ -1652,7 +1652,7 @@ BRANCH decl %ebx # i -- jg .L110 - ALIGN_2 + ALIGN_2 .L130: movl M, %ebx @@ -1678,7 +1678,7 @@ movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1696,7 +1696,7 @@ sarl $3, %eax je .L132 -.L131: +.L131: mulpd %xmm0, %xmm1 movapd -14 * SIZE(AA), %xmm0 addpd %xmm1, %xmm4 @@ -1842,7 +1842,7 @@ movddup %xmm0, %xmm1 unpckhpd %xmm0, %xmm0 - + movapd %xmm1, -16 * SIZE(BB) movapd %xmm0, -14 * SIZE(BB) #else @@ -1900,7 +1900,7 @@ movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movsd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1918,7 +1918,7 @@ sarl $3, %eax je .L152 -.L151: +.L151: mulsd %xmm0, %xmm1 movsd -15 * SIZE(AA), %xmm0 addsd %xmm1, %xmm4 diff --git a/kernel/x86/trsm_kernel_LT_4x2_sse2.S b/kernel/x86/trsm_kernel_LT_4x2_sse2.S index 626d75a9b..dd21b3efb 100644 --- a/kernel/x86/trsm_kernel_LT_4x2_sse2.S +++ b/kernel/x86/trsm_kernel_LT_4x2_sse2.S @@ -45,7 +45,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -55,7 +55,7 @@ #define STACK_C 32 + STACK + ARGS(%esi) #define STACK_LDC 36 + STACK + ARGS(%esi) #define STACK_OFFT 40 + STACK + ARGS(%esi) - + #define ALPHA 0(%esp) #define K 16(%esp) #define N 20(%esp) @@ -256,7 +256,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -269,14 +269,14 @@ movl %eax, J jle .L100 ALIGN_2 - + .L01: /* Copying to Sub Buffer */ #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, %ecx @@ -292,7 +292,7 @@ leal (, %eax, SIZE), %eax leal (B, %eax, 2), B leal (BB, %eax, 4), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -308,7 +308,7 @@ sarl $2, %eax jle .L03 ALIGN_2 - + .L02: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 @@ -372,7 +372,7 @@ decl %eax jne .L04 ALIGN_4 - + .L05: #if defined(LT) || defined(RN) movl A, AA @@ -416,7 +416,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 @@ -442,7 +442,7 @@ NOBRANCH je .L12 sall $3, %eax - + .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) @@ -535,7 +535,7 @@ subl $64 * 8, %eax BRANCH jg .L1X - + .L11: leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB @@ -544,7 +544,7 @@ sarl $3, %eax je .L12 -.L11: +.L11: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) @@ -589,7 +589,7 @@ addl $4 * SIZE, BB # boffset1 += 8 subl $1, %eax jg .L13 - ALIGN_4 + ALIGN_4 .L14: #if defined(LN) || defined(RT) @@ -854,7 +854,7 @@ decl %ebx # i -- jg .L10 - ALIGN_2 + ALIGN_2 .L30: movl M, %ebx @@ -880,7 +880,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 @@ -900,7 +900,7 @@ sarl $3, %eax je .L32 -.L31: +.L31: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 @@ -1168,7 +1168,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movsd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 @@ -1188,7 +1188,7 @@ sarl $3, %eax je .L52 -.L51: +.L51: mulsd %xmm0, %xmm2 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 @@ -1408,14 +1408,14 @@ testl $1, %eax jle .L999 ALIGN_2 - + .L101: /* Copying to Sub Buffer */ #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, %ecx @@ -1431,7 +1431,7 @@ leal (, %eax, SIZE), %eax leal (B, %eax, 1), B leal (BB, %eax, 2), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -1447,7 +1447,7 @@ sarl $3, %eax jle .L103 ALIGN_4 - + .L102: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 @@ -1509,7 +1509,7 @@ decl %eax jne .L104 ALIGN_4 - + .L105: #if defined(LT) || defined(RN) movl A, AA @@ -1551,7 +1551,7 @@ movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 @@ -1571,7 +1571,7 @@ sarl $3, %eax je .L112 -.L111: +.L111: mulpd %xmm2, %xmm0 mulpd 2 * SIZE(AA), %xmm2 addpd %xmm0, %xmm4 @@ -1838,7 +1838,7 @@ BRANCH decl %ebx # i -- jg .L110 - ALIGN_2 + ALIGN_2 .L130: movl M, %ebx @@ -1873,7 +1873,7 @@ movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif #if defined(LT) || defined(RN) movl KK, %eax @@ -1884,7 +1884,7 @@ sarl $3, %eax je .L132 -.L131: +.L131: mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 @@ -2093,7 +2093,7 @@ movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif #if defined(LT) || defined(RN) movl KK, %eax @@ -2104,7 +2104,7 @@ sarl $3, %eax je .L152 -.L151: +.L151: mulsd %xmm0, %xmm2 movsd 1 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 diff --git a/kernel/x86/trsm_kernel_LT_4x4_penryn.S b/kernel/x86/trsm_kernel_LT_4x4_penryn.S index d27880b8d..2dd8ad08b 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_4x4_penryn.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -100,7 +100,7 @@ movl OFFSET, %eax #ifdef RN negl %eax -#endif +#endif movl %eax, KK leal (, LDC, SIZE), LDC @@ -165,7 +165,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -197,7 +197,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif leal (CO1, LDC, 2), %eax @@ -737,7 +737,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif pxor %xmm4, %xmm4 movaps -32 * SIZE(AA), %xmm0 @@ -1114,7 +1114,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif pxor %xmm4, %xmm4 movsd -32 * SIZE(AA), %xmm0 @@ -1451,7 +1451,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -1483,7 +1483,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 @@ -1861,7 +1861,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm3, %xmm3 @@ -2124,7 +2124,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif pxor %xmm4, %xmm4 movsd -32 * SIZE(AA), %xmm0 @@ -2382,7 +2382,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -2414,7 +2414,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 @@ -2728,7 +2728,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movsd -32 * SIZE(AA), %xmm0 pxor %xmm3, %xmm3 @@ -2955,7 +2955,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB -#endif +#endif pxor %xmm4, %xmm4 movsd -32 * SIZE(AA), %xmm0 diff --git a/kernel/x86/trsm_kernel_LT_4x4_sse.S b/kernel/x86/trsm_kernel_LT_4x4_sse.S index 4f7f33035..d54dcf26f 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_sse.S +++ b/kernel/x86/trsm_kernel_LT_4x4_sse.S @@ -40,7 +40,7 @@ #include "common.h" #define STACK 16 - + #define OLD_M 4 + STACK(%esi) #define OLD_N 8 + STACK(%esi) #define OLD_K 12 + STACK(%esi) @@ -268,7 +268,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -286,7 +286,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, %ecx @@ -302,7 +302,7 @@ sall $2 + BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -318,7 +318,7 @@ sarl $1, %eax jle .L05 ALIGN_4 - + .L02: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 @@ -373,7 +373,7 @@ addl $4 * SIZE, B ALIGN_4 - + .L10: #if defined(LT) || defined(RN) movl A, AA @@ -417,7 +417,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif movaps 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 @@ -842,7 +842,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif #ifdef movsd xorps %xmm0, %xmm0 @@ -1292,7 +1292,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif movss 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 @@ -1693,7 +1693,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, %ecx @@ -1709,7 +1709,7 @@ sall $1 + BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -1725,7 +1725,7 @@ sarl $2, %eax jle .L45 ALIGN_4 - + .L42: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 @@ -1784,7 +1784,7 @@ decl %eax jne .L46 ALIGN_4 - + .L50: #if defined(LT) || defined(RN) movl A, AA @@ -1828,7 +1828,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -2220,7 +2220,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -2538,7 +2538,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -2801,7 +2801,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, %ecx @@ -2817,7 +2817,7 @@ sall $BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -2833,7 +2833,7 @@ sarl $3, %eax jle .L85 ALIGN_4 - + .L82: movsd 0 * SIZE(B), %xmm3 movhps 2 * SIZE(B), %xmm3 @@ -2889,7 +2889,7 @@ decl %eax jne .L86 ALIGN_4 - + .L90: #if defined(LT) || defined(RN) movl A, AA @@ -2931,7 +2931,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -3250,7 +3250,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -3491,7 +3491,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 diff --git a/kernel/x86/trsm_kernel_LT_8x2_sse.S b/kernel/x86/trsm_kernel_LT_8x2_sse.S index 5d596980f..b184f78bf 100644 --- a/kernel/x86/trsm_kernel_LT_8x2_sse.S +++ b/kernel/x86/trsm_kernel_LT_8x2_sse.S @@ -45,7 +45,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -153,7 +153,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -173,13 +173,13 @@ movl %eax, J jle .L100 ALIGN_2 - + .L01: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, BB @@ -195,7 +195,7 @@ sall $1 + BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -211,7 +211,7 @@ sarl $2, %eax jle .L03 ALIGN_4 - + .L02: movsd 0 * SIZE(B), %xmm3 movhps 2 * SIZE(B), %xmm3 @@ -340,7 +340,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 @@ -863,7 +863,7 @@ #ifdef RN movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm0 mulps %xmm6, %xmm1 @@ -879,7 +879,7 @@ movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm2 mulps %xmm6, %xmm3 #endif @@ -887,7 +887,7 @@ #ifdef RT movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm2 mulps %xmm6, %xmm3 @@ -903,7 +903,7 @@ movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm0 mulps %xmm6, %xmm1 #endif @@ -1068,7 +1068,7 @@ decl %ebx # i -- jg .L10 - ALIGN_2 + ALIGN_2 .L30: testl $4, M @@ -1093,7 +1093,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 @@ -1343,7 +1343,7 @@ #ifdef RN movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm0 movss 1 * SIZE(B), %xmm6 @@ -1355,14 +1355,14 @@ movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm2 #endif #ifdef RT movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm2 movss 2 * SIZE(B), %xmm6 @@ -1375,7 +1375,7 @@ movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm0 #endif @@ -1480,7 +1480,7 @@ sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif - ALIGN_2 + ALIGN_2 .L50: testl $2, M @@ -1505,7 +1505,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 @@ -1705,7 +1705,7 @@ #ifdef RN movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm0 movss 1 * SIZE(B), %xmm6 @@ -1717,14 +1717,14 @@ movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm2 #endif #ifdef RT movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm2 movss 2 * SIZE(B), %xmm6 @@ -1737,7 +1737,7 @@ movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm0 #endif @@ -1815,7 +1815,7 @@ sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif - ALIGN_2 + ALIGN_2 .L70: testl $1, M @@ -1840,7 +1840,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif movss 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 @@ -2079,7 +2079,7 @@ sall $BASE_SHIFT, %eax addl %eax, AORIG #endif - ALIGN_2 + ALIGN_2 .L99: #ifdef LN @@ -2110,12 +2110,12 @@ .L100: testl $1, N jle .L999 - + #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, BB @@ -2131,7 +2131,7 @@ sall $BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -2147,7 +2147,7 @@ sarl $3, %eax jle .L103 ALIGN_4 - + .L102: movsd 0 * SIZE(B), %xmm3 movhps 2 * SIZE(B), %xmm3 @@ -2266,7 +2266,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 @@ -2704,7 +2704,7 @@ #if defined(RN) || defined(RT) movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm0 mulps %xmm6, %xmm1 #endif @@ -2712,7 +2712,7 @@ #if defined(LN) || defined(LT) shufps $0x88, %xmm3, %xmm2 shufps $0x88, %xmm7, %xmm5 - + movlps %xmm2, 0 * SIZE(B) movhps %xmm2, 2 * SIZE(B) movlps %xmm5, 4 * SIZE(B) @@ -2812,7 +2812,7 @@ decl %ebx # i -- jg .L110 - ALIGN_2 + ALIGN_2 .L130: testl $4, M @@ -2837,7 +2837,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 @@ -3141,7 +3141,7 @@ sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif - ALIGN_2 + ALIGN_2 .L150: testl $2, M @@ -3166,7 +3166,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 @@ -3386,7 +3386,7 @@ sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif - ALIGN_2 + ALIGN_2 .L170: testl $1, M @@ -3410,7 +3410,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif movss 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 diff --git a/kernel/x86/trsm_kernel_RT_1x4.S b/kernel/x86/trsm_kernel_RT_1x4.S index b7f17e259..09cb00c3d 100644 --- a/kernel/x86/trsm_kernel_RT_1x4.S +++ b/kernel/x86/trsm_kernel_RT_1x4.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 32 - + #define J 0 + STACK(%esp) #define I 4 + STACK(%esp) #define KK 8 + STACK(%esp) @@ -111,7 +111,7 @@ movl OFFSET, %eax negl %eax movl %eax, KK -#endif +#endif #ifdef RT movl N, %eax @@ -165,7 +165,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -184,7 +184,7 @@ jle .L33 ALIGN_4 -.L32: +.L32: movl -16 * SIZE(B), %esi movl -8 * SIZE(B), %esi movl 0 * SIZE(B), %esi @@ -214,7 +214,7 @@ leal (B_ORIG, %eax, 1), B #else movl B_ORIG, B -#endif +#endif fldz fldz @@ -414,7 +414,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -433,7 +433,7 @@ jle .L23 ALIGN_4 -.L22: +.L22: movl -16 * SIZE(B), %esi movl -8 * SIZE(B), %esi movl 0 * SIZE(B), %esi @@ -463,7 +463,7 @@ leal (B_ORIG, %eax, 2), B #else movl B_ORIG, B -#endif +#endif fldz fldz @@ -759,7 +759,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -778,7 +778,7 @@ jle .L13 ALIGN_4 -.L12: +.L12: movl -16 * SIZE(B), %esi movl -8 * SIZE(B), %esi movl 0 * SIZE(B), %esi @@ -812,7 +812,7 @@ leal (B_ORIG, %eax, 4), B #else movl B_ORIG, B -#endif +#endif leal (%edi, LDC, 2), %eax diff --git a/kernel/x86/trsm_kernel_RT_2x2.S b/kernel/x86/trsm_kernel_RT_2x2.S index 860344616..8288d8371 100644 --- a/kernel/x86/trsm_kernel_RT_2x2.S +++ b/kernel/x86/trsm_kernel_RT_2x2.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) @@ -72,7 +72,7 @@ #else #define REP rep #endif - + #define AA %edx #define BB %ecx @@ -112,7 +112,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -123,7 +123,7 @@ movl N, %eax # n # MEMORY andl $1, %eax je .L8 - + #if defined(LT) || defined(RN) movl A, AA #else @@ -149,7 +149,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -160,7 +160,7 @@ sarl $1, %esi # m >> 1 je .L36 ALIGN_4 - + .L46: #ifdef LN movl K, %eax @@ -176,7 +176,7 @@ leal (%ebx, %eax, 1), BB #else movl %ebx, BB -#endif +#endif fldz fldz @@ -371,7 +371,7 @@ leal (%ebx, %eax, 1), BB #else movl %ebx, BB -#endif +#endif fldz @@ -485,7 +485,7 @@ .L8: movl N, %eax # j = (n >> 1) # MEMORY - sarl $1, %eax + sarl $1, %eax movl %eax, J # j = (n >> 1) # MEMORY je .End ALIGN_4 @@ -517,7 +517,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -544,7 +544,7 @@ leal (%ebx, %eax, 2), BB #else movl %ebx, BB -#endif +#endif fldz fldz @@ -897,7 +897,7 @@ leal (%ebx, %eax, 2), BB #else movl %ebx, BB -#endif +#endif fldz fldz diff --git a/kernel/x86/trsm_kernel_RT_2x2_atom.S b/kernel/x86/trsm_kernel_RT_2x2_atom.S index 97af198f9..b3eaf5693 100644 --- a/kernel/x86/trsm_kernel_RT_2x2_atom.S +++ b/kernel/x86/trsm_kernel_RT_2x2_atom.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -83,7 +83,7 @@ movl OFFSET, %eax #ifdef RN negl %eax -#endif +#endif movl %eax, KK leal (, LDC, SIZE), LDC @@ -140,7 +140,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -172,7 +172,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movsd 0 * SIZE(BB), %xmm1 xorps %xmm0, %xmm0 @@ -387,7 +387,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm2, %xmm2 @@ -584,7 +584,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -616,7 +616,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm2, %xmm2 @@ -916,7 +916,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm2, %xmm2 diff --git a/kernel/x86/trsm_kernel_RT_2x4_penryn.S b/kernel/x86/trsm_kernel_RT_2x4_penryn.S index ff8231e16..154276f6a 100644 --- a/kernel/x86/trsm_kernel_RT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_2x4_penryn.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -95,7 +95,7 @@ movl OFFSET, %eax #ifdef RN negl %eax -#endif +#endif movl %eax, KK leal (, LDC, SIZE), LDC @@ -155,7 +155,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -187,7 +187,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movsd -16 * SIZE(AA), %xmm0 movhps -15 * SIZE(AA), %xmm0 @@ -430,7 +430,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movsd -16 * SIZE(AA), %xmm0 movhps -15 * SIZE(AA), %xmm0 @@ -610,7 +610,7 @@ #endif ALIGN_4 -.L30: +.L30: testl $2, N je .L60 @@ -641,7 +641,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -673,7 +673,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -980,7 +980,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1250,7 +1250,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -1282,7 +1282,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif leal (CO1, LDC, 2), %eax @@ -1736,7 +1736,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 diff --git a/kernel/x86/trsm_kernel_RT_2x4_sse2.S b/kernel/x86/trsm_kernel_RT_2x4_sse2.S index b6d9ca4de..c43a0f18b 100644 --- a/kernel/x86/trsm_kernel_RT_2x4_sse2.S +++ b/kernel/x86/trsm_kernel_RT_2x4_sse2.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define OLD_M 4 + STACK + ARGS(%esi) #define OLD_N 8 + STACK + ARGS(%esi) #define OLD_K 12 + STACK + ARGS(%esi) @@ -256,7 +256,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -271,7 +271,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, BB @@ -287,7 +287,7 @@ leal (, %eax, SIZE), %eax leal (B, %eax, 1), B leal (BB, %eax, 2), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -303,7 +303,7 @@ sarl $3, %eax jle .L65 ALIGN_4 - + .L62: #define COPYPREFETCH 40 @@ -365,7 +365,7 @@ decl %eax jne .L66 ALIGN_4 - + .L70: #if defined(LT) || defined(RN) movl A, AA @@ -407,7 +407,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -653,7 +653,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -863,7 +863,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, BB @@ -879,7 +879,7 @@ leal (, %eax, SIZE), %eax leal (B, %eax, 2), B leal (BB, %eax, 4), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -895,7 +895,7 @@ sarl $2, %eax jle .L35 ALIGN_4 - + .L32: #define COPYPREFETCH 40 @@ -960,7 +960,7 @@ decl %eax jne .L36 ALIGN_4 - + .L40: #if defined(LT) || defined(RN) movl A, AA @@ -1004,7 +1004,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -1318,7 +1318,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -1588,7 +1588,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, BB @@ -1604,7 +1604,7 @@ leal (, %eax, SIZE), %eax leal (B, %eax, 4), B leal (BB, %eax, 8), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -1620,7 +1620,7 @@ sarl $1, %eax jle .L05 ALIGN_4 - + .L02: #define COPYPREFETCH 40 @@ -1686,7 +1686,7 @@ addl $4 * SIZE, B ALIGN_4 - + .L10: #if defined(LT) || defined(RN) movl A, AA @@ -1730,7 +1730,7 @@ movl KK, %eax sall $3 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -1767,7 +1767,7 @@ andl $-8, %eax sall $4, %eax je .L15 -.L1X: +.L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ -2199,7 +2199,7 @@ movl KK, %eax sall $3 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -2576,7 +2576,7 @@ .L999: movl OLD_STACK, %esp EMMS - + popl %ebx popl %esi popl %edi diff --git a/kernel/x86/trsm_kernel_RT_2x4_sse3.S b/kernel/x86/trsm_kernel_RT_2x4_sse3.S index 6be1d8643..792c32729 100644 --- a/kernel/x86/trsm_kernel_RT_2x4_sse3.S +++ b/kernel/x86/trsm_kernel_RT_2x4_sse3.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -95,7 +95,7 @@ movl OFFSET, %eax #ifdef RN negl %eax -#endif +#endif movl %eax, KK leal (, LDC, SIZE), LDC @@ -152,7 +152,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -184,7 +184,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -410,7 +410,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movsd 0 * SIZE(AA), %xmm0 movhpd 1 * SIZE(AA), %xmm0 @@ -636,7 +636,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -668,7 +668,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -952,7 +952,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1225,7 +1225,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -1257,7 +1257,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1674,7 +1674,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 diff --git a/kernel/x86/trsm_kernel_RT_4x2_core2.S b/kernel/x86/trsm_kernel_RT_4x2_core2.S index 866eddf36..781876b7e 100644 --- a/kernel/x86/trsm_kernel_RT_4x2_core2.S +++ b/kernel/x86/trsm_kernel_RT_4x2_core2.S @@ -45,7 +45,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -55,7 +55,7 @@ #define STACK_C 32 + STACK + ARGS(%esi) #define STACK_LDC 36 + STACK + ARGS(%esi) #define STACK_OFFT 40 + STACK + ARGS(%esi) - + #define K 16(%esp) #define N 20(%esp) #define M 24(%esp) @@ -141,7 +141,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -153,13 +153,13 @@ testl $1, %eax jle .L100 ALIGN_2 - + .L101: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal 16 * SIZE + BUFFER, BB @@ -175,7 +175,7 @@ leal (, %eax, SIZE), %eax leal (B, %eax, 1), B leal (BB, %eax, 2), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -191,7 +191,7 @@ sarl $3, %eax jle .L103 ALIGN_4 - + .L102: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 @@ -239,7 +239,7 @@ decl %eax jne .L104 ALIGN_4 - + .L105: #if defined(LT) || defined(RN) movl A, AA @@ -281,7 +281,7 @@ movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -301,7 +301,7 @@ sarl $3, %eax je .L112 -.L111: +.L111: mulpd %xmm1, %xmm0 mulpd -14 * SIZE(AA), %xmm1 addpd %xmm0, %xmm4 @@ -567,7 +567,7 @@ BRANCH decl %ebx # i -- jg .L110 - ALIGN_2 + ALIGN_2 .L130: movl M, %ebx @@ -593,7 +593,7 @@ movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -611,7 +611,7 @@ sarl $3, %eax je .L132 -.L131: +.L131: mulpd %xmm0, %xmm1 movapd -14 * SIZE(AA), %xmm0 addpd %xmm1, %xmm4 @@ -757,7 +757,7 @@ movddup %xmm0, %xmm1 unpckhpd %xmm0, %xmm0 - + movapd %xmm1, -16 * SIZE(BB) movapd %xmm0, -14 * SIZE(BB) #else @@ -815,7 +815,7 @@ movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movsd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -833,7 +833,7 @@ sarl $3, %eax je .L152 -.L151: +.L151: mulsd %xmm0, %xmm1 movsd -15 * SIZE(AA), %xmm0 addsd %xmm1, %xmm4 @@ -1005,14 +1005,14 @@ movl %eax, J jle .L999 ALIGN_2 - + .L01: /* Copying to Sub Buffer */ #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal 16 * SIZE + BUFFER, BB @@ -1028,7 +1028,7 @@ leal (, %eax, SIZE), %eax leal (B, %eax, 2), B leal (BB, %eax, 4), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -1044,7 +1044,7 @@ sarl $2, %eax jle .L03 ALIGN_2 - + .L02: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 @@ -1094,7 +1094,7 @@ decl %eax jne .L04 ALIGN_4 - + .L05: #if defined(LT) || defined(RN) movl A, AA @@ -1138,7 +1138,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1554,7 +1554,7 @@ decl %ebx # i -- jg .L10 - ALIGN_2 + ALIGN_2 .L30: movl M, %ebx @@ -1580,7 +1580,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1600,7 +1600,7 @@ sarl $3, %eax je .L32 -.L31: +.L31: mulpd %xmm0, %xmm1 mulpd -14 * SIZE(BB), %xmm0 addpd %xmm1, %xmm4 @@ -1850,7 +1850,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movsd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1870,7 +1870,7 @@ sarl $3, %eax je .L52 -.L51: +.L51: mulsd %xmm0, %xmm1 mulsd -14 * SIZE(BB), %xmm0 addsd %xmm1, %xmm4 diff --git a/kernel/x86/trsm_kernel_RT_4x2_sse2.S b/kernel/x86/trsm_kernel_RT_4x2_sse2.S index 68b52ba52..6c3b3427b 100644 --- a/kernel/x86/trsm_kernel_RT_4x2_sse2.S +++ b/kernel/x86/trsm_kernel_RT_4x2_sse2.S @@ -45,7 +45,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -55,7 +55,7 @@ #define STACK_C 32 + STACK + ARGS(%esi) #define STACK_LDC 36 + STACK + ARGS(%esi) #define STACK_OFFT 40 + STACK + ARGS(%esi) - + #define ALPHA 0(%esp) #define K 16(%esp) #define N 20(%esp) @@ -216,7 +216,7 @@ addl $STACK_OFFSET, %esp STACK_TOUCHING - + movd STACK_M, %mm0 movl STACK_N, %eax movd STACK_K, %mm1 @@ -257,7 +257,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -269,14 +269,14 @@ testl $1, %eax jle .L100 ALIGN_2 - + .L101: /* Copying to Sub Buffer */ #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, %ecx @@ -292,7 +292,7 @@ leal (, %eax, SIZE), %eax leal (B, %eax, 1), B leal (BB, %eax, 2), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -308,7 +308,7 @@ sarl $3, %eax jle .L103 ALIGN_4 - + .L102: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 @@ -370,7 +370,7 @@ decl %eax jne .L104 ALIGN_4 - + .L105: #if defined(LT) || defined(RN) movl A, AA @@ -412,7 +412,7 @@ movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 @@ -432,7 +432,7 @@ sarl $3, %eax je .L112 -.L111: +.L111: mulpd %xmm2, %xmm0 mulpd 2 * SIZE(AA), %xmm2 addpd %xmm0, %xmm4 @@ -699,7 +699,7 @@ BRANCH decl %ebx # i -- jg .L110 - ALIGN_2 + ALIGN_2 .L130: movl M, %ebx @@ -734,7 +734,7 @@ movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif #if defined(LT) || defined(RN) movl KK, %eax @@ -745,7 +745,7 @@ sarl $3, %eax je .L132 -.L131: +.L131: mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 @@ -954,7 +954,7 @@ movl KK, %eax sall $0 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif #if defined(LT) || defined(RN) movl KK, %eax @@ -965,7 +965,7 @@ sarl $3, %eax je .L152 -.L151: +.L151: mulsd %xmm0, %xmm2 movsd 1 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 @@ -1131,14 +1131,14 @@ movl %eax, J jle .L999 ALIGN_2 - + .L01: /* Copying to Sub Buffer */ #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, %ecx @@ -1154,7 +1154,7 @@ leal (, %eax, SIZE), %eax leal (B, %eax, 2), B leal (BB, %eax, 4), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -1170,7 +1170,7 @@ sarl $2, %eax jle .L03 ALIGN_2 - + .L02: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 @@ -1234,7 +1234,7 @@ decl %eax jne .L04 ALIGN_4 - + .L05: #if defined(LT) || defined(RN) movl A, AA @@ -1278,7 +1278,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 @@ -1304,7 +1304,7 @@ NOBRANCH je .L12 sall $3, %eax - + .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) @@ -1406,7 +1406,7 @@ sarl $3, %eax je .L12 -.L11: +.L11: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) @@ -1452,7 +1452,7 @@ addl $4 * SIZE, BB # boffset1 += 8 subl $1, %eax jg .L13 - ALIGN_4 + ALIGN_4 .L14: #if defined(LN) || defined(RT) @@ -1717,7 +1717,7 @@ decl %ebx # i -- jg .L10 - ALIGN_2 + ALIGN_2 .L30: movl M, %ebx @@ -1743,7 +1743,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 @@ -1763,7 +1763,7 @@ sarl $3, %eax je .L32 -.L31: +.L31: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 @@ -2031,7 +2031,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movsd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 @@ -2051,7 +2051,7 @@ sarl $3, %eax je .L52 -.L51: +.L51: mulsd %xmm0, %xmm2 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 diff --git a/kernel/x86/trsm_kernel_RT_4x4_penryn.S b/kernel/x86/trsm_kernel_RT_4x4_penryn.S index 857866552..acdcd6e22 100644 --- a/kernel/x86/trsm_kernel_RT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_4x4_penryn.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -100,7 +100,7 @@ movl OFFSET, %eax #ifdef RN negl %eax -#endif +#endif movl %eax, KK leal (, LDC, SIZE), LDC @@ -160,7 +160,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -192,7 +192,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 @@ -506,7 +506,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movsd -32 * SIZE(AA), %xmm0 pxor %xmm3, %xmm3 @@ -733,7 +733,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax addl %eax, BB -#endif +#endif pxor %xmm4, %xmm4 movsd -32 * SIZE(AA), %xmm0 @@ -925,7 +925,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -957,7 +957,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 @@ -1335,7 +1335,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm3, %xmm3 @@ -1598,7 +1598,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif pxor %xmm4, %xmm4 movsd -32 * SIZE(AA), %xmm0 @@ -1861,7 +1861,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -1893,7 +1893,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif leal (CO1, LDC, 2), %eax @@ -2433,7 +2433,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif pxor %xmm4, %xmm4 movaps -32 * SIZE(AA), %xmm0 @@ -2810,7 +2810,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax addl %eax, BB -#endif +#endif pxor %xmm4, %xmm4 movsd -32 * SIZE(AA), %xmm0 diff --git a/kernel/x86/trsm_kernel_RT_4x4_sse.S b/kernel/x86/trsm_kernel_RT_4x4_sse.S index 40afac547..743516e5e 100644 --- a/kernel/x86/trsm_kernel_RT_4x4_sse.S +++ b/kernel/x86/trsm_kernel_RT_4x4_sse.S @@ -40,7 +40,7 @@ #include "common.h" #define STACK 16 - + #define OLD_M 4 + STACK(%esi) #define OLD_N 8 + STACK(%esi) #define OLD_K 12 + STACK(%esi) @@ -268,7 +268,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -283,7 +283,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, %ecx @@ -299,7 +299,7 @@ sall $BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -315,7 +315,7 @@ sarl $3, %eax jle .L85 ALIGN_4 - + .L82: movsd 0 * SIZE(B), %xmm3 movhps 2 * SIZE(B), %xmm3 @@ -371,7 +371,7 @@ decl %eax jne .L86 ALIGN_4 - + .L90: #if defined(LT) || defined(RN) movl A, AA @@ -413,7 +413,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -728,7 +728,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -969,7 +969,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -1165,7 +1165,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, %ecx @@ -1181,7 +1181,7 @@ sall $1 + BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -1197,7 +1197,7 @@ sarl $2, %eax jle .L45 ALIGN_4 - + .L42: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 @@ -1253,7 +1253,7 @@ decl %eax jne .L46 ALIGN_4 - + .L50: #if defined(LT) || defined(RN) movl A, AA @@ -1297,7 +1297,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -1689,7 +1689,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -2007,7 +2007,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -2273,7 +2273,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, %ecx @@ -2289,7 +2289,7 @@ sall $2 + BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -2305,7 +2305,7 @@ sarl $1, %eax jle .L05 ALIGN_4 - + .L02: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 @@ -2360,7 +2360,7 @@ addl $4 * SIZE, B ALIGN_4 - + .L10: #if defined(LT) || defined(RN) movl A, AA @@ -2404,7 +2404,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif movaps 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 @@ -2829,7 +2829,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif #ifdef movsd xorps %xmm0, %xmm0 @@ -3279,7 +3279,7 @@ movl KK, %eax sall $2 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif movss 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 diff --git a/kernel/x86/trsm_kernel_RT_8x2_sse.S b/kernel/x86/trsm_kernel_RT_8x2_sse.S index 6bc1d21dc..cea034ebf 100644 --- a/kernel/x86/trsm_kernel_RT_8x2_sse.S +++ b/kernel/x86/trsm_kernel_RT_8x2_sse.S @@ -45,7 +45,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -153,7 +153,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -170,12 +170,12 @@ testl $1, N jle .L100 - + #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, BB @@ -191,7 +191,7 @@ sall $BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -207,7 +207,7 @@ sarl $3, %eax jle .L103 ALIGN_4 - + .L102: movsd 0 * SIZE(B), %xmm3 movhps 2 * SIZE(B), %xmm3 @@ -326,7 +326,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 @@ -764,7 +764,7 @@ #if defined(RN) || defined(RT) movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm0 mulps %xmm6, %xmm1 #endif @@ -772,7 +772,7 @@ #if defined(LN) || defined(LT) shufps $0x88, %xmm3, %xmm2 shufps $0x88, %xmm7, %xmm5 - + movlps %xmm2, 0 * SIZE(B) movhps %xmm2, 2 * SIZE(B) movlps %xmm5, 4 * SIZE(B) @@ -872,7 +872,7 @@ decl %ebx # i -- jg .L110 - ALIGN_2 + ALIGN_2 .L130: testl $4, M @@ -897,7 +897,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 @@ -1201,7 +1201,7 @@ sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif - ALIGN_2 + ALIGN_2 .L150: testl $2, M @@ -1226,7 +1226,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 @@ -1446,7 +1446,7 @@ sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif - ALIGN_2 + ALIGN_2 .L170: testl $1, M @@ -1470,7 +1470,7 @@ movl KK, %eax sall $BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif movss 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 @@ -1658,13 +1658,13 @@ movl %eax, J jle .L999 ALIGN_2 - + .L01: #ifdef LN movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, BB @@ -1680,7 +1680,7 @@ sall $1 + BASE_SHIFT, %eax leal (B, %eax, 1), B leal (BB, %eax, 4), BB -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -1696,7 +1696,7 @@ sarl $2, %eax jle .L03 ALIGN_4 - + .L02: movsd 0 * SIZE(B), %xmm3 movhps 2 * SIZE(B), %xmm3 @@ -1825,7 +1825,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 @@ -2348,7 +2348,7 @@ #ifdef RN movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm0 mulps %xmm6, %xmm1 @@ -2364,7 +2364,7 @@ movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm2 mulps %xmm6, %xmm3 #endif @@ -2372,7 +2372,7 @@ #ifdef RT movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm2 mulps %xmm6, %xmm3 @@ -2388,7 +2388,7 @@ movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm0 mulps %xmm6, %xmm1 #endif @@ -2553,7 +2553,7 @@ decl %ebx # i -- jg .L10 - ALIGN_2 + ALIGN_2 .L30: testl $4, M @@ -2578,7 +2578,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 @@ -2828,7 +2828,7 @@ #ifdef RN movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm0 movss 1 * SIZE(B), %xmm6 @@ -2840,14 +2840,14 @@ movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm2 #endif #ifdef RT movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm2 movss 2 * SIZE(B), %xmm6 @@ -2860,7 +2860,7 @@ movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm0 #endif @@ -2965,7 +2965,7 @@ sall $2 + BASE_SHIFT, %eax addl %eax, AORIG #endif - ALIGN_2 + ALIGN_2 .L50: testl $2, M @@ -2990,7 +2990,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 @@ -3190,7 +3190,7 @@ #ifdef RN movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm0 movss 1 * SIZE(B), %xmm6 @@ -3202,14 +3202,14 @@ movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm2 #endif #ifdef RT movss 3 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm2 movss 2 * SIZE(B), %xmm6 @@ -3222,7 +3222,7 @@ movss 0 * SIZE(B), %xmm6 shufps $0x00, %xmm6, %xmm6 - + mulps %xmm6, %xmm0 #endif @@ -3300,7 +3300,7 @@ sall $1 + BASE_SHIFT, %eax addl %eax, AORIG #endif - ALIGN_2 + ALIGN_2 .L70: testl $1, M @@ -3325,7 +3325,7 @@ movl KK, %eax sall $1 + BASE_SHIFT, %eax leal (BB, %eax, 4), BB -#endif +#endif movss 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 @@ -3567,7 +3567,7 @@ sall $BASE_SHIFT, %eax addl %eax, AORIG #endif - ALIGN_2 + ALIGN_2 .L99: #ifdef LN diff --git a/kernel/x86/xaxpy.S b/kernel/x86/xaxpy.S index 554aa0c34..99eadab18 100644 --- a/kernel/x86/xaxpy.S +++ b/kernel/x86/xaxpy.S @@ -41,7 +41,7 @@ #define STACK 12 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_ALPHA_R 16 + STACK + ARGS(%esp) #define STACK_ALPHA_I 32 + STACK + ARGS(%esp) @@ -87,7 +87,7 @@ sall $ZBASE_SHIFT, INCX sall $ZBASE_SHIFT, INCY - + testl M, M jle .L40 diff --git a/kernel/x86/xdot.S b/kernel/x86/xdot.S index 929763271..9f772060d 100644 --- a/kernel/x86/xdot.S +++ b/kernel/x86/xdot.S @@ -41,7 +41,7 @@ #define STACK 12 #define ARGS 0 - + #if defined(F_INTERFACE) && defined(RETURN_BY_STACK) #define RESULT 4 + STACK + ARGS(%esp) #define STACK_N 8 + STACK + ARGS(%esp) diff --git a/kernel/x86/xgemm3m_kernel_2x2.S b/kernel/x86/xgemm3m_kernel_2x2.S index b844875f1..c53825d04 100644 --- a/kernel/x86/xgemm3m_kernel_2x2.S +++ b/kernel/x86/xgemm3m_kernel_2x2.S @@ -50,7 +50,7 @@ #define PREFETCHSIZE (5 + 4 * 10) #define STACK 16 #define ARGS 16 - + #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) @@ -90,13 +90,13 @@ negl %eax movl %eax, KK #endif - + movl ARG_LDC, LDC movl ARG_B, B addl $8 * SIZE, A addl $8 * SIZE, B - + sall $ZBASE_SHIFT, LDC movl N, %eax @@ -109,7 +109,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl A, AO @@ -132,7 +132,7 @@ sall $BASE_SHIFT, %eax leal (AO, %eax, 2), AO leal (B, %eax, 2), BO -#endif +#endif fldz fldz @@ -152,7 +152,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -178,7 +178,7 @@ FLD -7 * SIZE(BO) fmul %st, %st(2) - + FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -196,7 +196,7 @@ FLD -5 * SIZE(BO) fmul %st, %st(2) - + FLD -5 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -216,7 +216,7 @@ FLD -3 * SIZE(BO) fmul %st, %st(2) - + FLD -3 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -234,7 +234,7 @@ FLD -1 * SIZE(BO) fmul %st, %st(2) - + FLD -1 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -270,7 +270,7 @@ FLD -7 * SIZE(BO) fmul %st, %st(2) - + FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -359,7 +359,7 @@ sall $BASE_SHIFT, %eax leal (AO, %eax, 1), AO leal ( B, %eax, 2), BO -#endif +#endif fldz fldz @@ -369,7 +369,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -502,13 +502,13 @@ .L30: movl N, %eax - testl $1, %eax + testl $1, %eax je .L999 #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl A, AO @@ -530,7 +530,7 @@ sall $BASE_SHIFT, %eax leal (AO, %eax, 2), AO leal ( B, %eax, 1), BO -#endif +#endif fldz fldz @@ -546,7 +546,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -683,7 +683,7 @@ sall $BASE_SHIFT, %eax leal (AO, %eax, 1), AO leal ( B, %eax, 1), BO -#endif +#endif fldz @@ -692,7 +692,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT diff --git a/kernel/x86/xgemm_kernel_1x1.S b/kernel/x86/xgemm_kernel_1x1.S index b401bd206..1e2c4a159 100644 --- a/kernel/x86/xgemm_kernel_1x1.S +++ b/kernel/x86/xgemm_kernel_1x1.S @@ -50,7 +50,7 @@ #define PREFETCHSIZE (5 + 4 * 10) #define STACK 16 #define ARGS 16 - + #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) @@ -113,18 +113,18 @@ negl %eax movl %eax, KK #endif - + movl ARG_LDC, LDC movl ARG_B, B addl $8 * SIZE, A addl $8 * SIZE, B - + sall $ZBASE_SHIFT, LDC cmpl $0, M jle .L999 - + movl N, %eax movl %eax, J testl %eax, %eax @@ -135,7 +135,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl A, AO @@ -155,7 +155,7 @@ sall $ZBASE_SHIFT, %eax leal (AO, %eax, 1), AO leal (B, %eax, 1), BO -#endif +#endif fldz fldz @@ -173,7 +173,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -199,7 +199,7 @@ FLD -7 * SIZE(BO) fmul %st, %st(2) - + FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -217,7 +217,7 @@ FLD -5 * SIZE(BO) fmul %st, %st(2) - + FLD -5 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -237,7 +237,7 @@ FLD -3 * SIZE(BO) fmul %st, %st(2) - + FLD -3 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -255,7 +255,7 @@ FLD -1 * SIZE(BO) fmul %st, %st(2) - + FLD -1 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -291,7 +291,7 @@ FLD -7 * SIZE(BO) fmul %st, %st(2) - + FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -336,7 +336,7 @@ FST 1 * SIZE(CO) FST 0 * SIZE(CO) #endif - + #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movl K, %eax diff --git a/kernel/x86/xgemv_n.S b/kernel/x86/xgemv_n.S index 32447ba7e..1a96e8384 100644 --- a/kernel/x86/xgemv_n.S +++ b/kernel/x86/xgemv_n.S @@ -53,7 +53,7 @@ #define STACK 16 #define ARGS 16 - + #define PLDA_M 0 + STACK(%esp) #define XP 4 + STACK(%esp) #define MIN_N 8 + STACK(%esp) diff --git a/kernel/x86/xgemv_t.S b/kernel/x86/xgemv_t.S index 1397a10f2..a9c8dbc2b 100644 --- a/kernel/x86/xgemv_t.S +++ b/kernel/x86/xgemv_t.S @@ -49,7 +49,7 @@ #define STACK 16 #define ARGS 24 - + #define NLDA 0 + STACK(%esp) #define XP 4 + STACK(%esp) #define MIN_M 8 + STACK(%esp) diff --git a/kernel/x86/xtrsm_kernel_LT_1x1.S b/kernel/x86/xtrsm_kernel_LT_1x1.S index e05266f7c..2dcad5640 100644 --- a/kernel/x86/xtrsm_kernel_LT_1x1.S +++ b/kernel/x86/xtrsm_kernel_LT_1x1.S @@ -50,7 +50,7 @@ #define PREFETCHSIZE (5 + 4 * 10) #define STACK 16 #define ARGS 16 - + #define J 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) @@ -111,7 +111,7 @@ addl $8 * SIZE, A addl $8 * SIZE, B - + #ifdef LN movl M, %eax sall $ZBASE_SHIFT, %eax @@ -135,7 +135,7 @@ movl OFFSET, %eax negl %eax movl %eax, KK -#endif +#endif #ifdef RT movl N, %eax @@ -145,7 +145,7 @@ cmpl $0, M jle .L999 - + movl N, %eax movl %eax, J testl %eax, %eax @@ -178,7 +178,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -203,7 +203,7 @@ leal (B, %eax, 1), BO #else movl B, BO -#endif +#endif fldz fldz @@ -238,7 +238,7 @@ FLD -7 * SIZE(BO) fmul %st, %st(2) - + FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -256,7 +256,7 @@ FLD -5 * SIZE(BO) fmul %st, %st(2) - + FLD -5 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -276,7 +276,7 @@ FLD -3 * SIZE(BO) fmul %st, %st(2) - + FLD -3 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -294,7 +294,7 @@ FLD -1 * SIZE(BO) fmul %st, %st(2) - + FLD -1 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -331,7 +331,7 @@ FLD -7 * SIZE(BO) fmul %st, %st(2) - + FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -350,7 +350,7 @@ .L18: faddp %st, %st(3) faddp %st, %st(1) - + fxch %st(1) #if defined(LN) || defined(RT) @@ -430,7 +430,7 @@ FST 0 * SIZE(CO) FST 1 * SIZE(CO) - + #ifndef LN addl $2 * SIZE, CO #endif diff --git a/kernel/x86/zamax.S b/kernel/x86/zamax.S index 3056c1e62..8af882341 100644 --- a/kernel/x86/zamax.S +++ b/kernel/x86/zamax.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define STACK 8 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) @@ -91,9 +91,9 @@ fstp %st(0) FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs faddp %st, %st(1) addl INCX, X decl M @@ -106,43 +106,43 @@ sarl $2, I jle .L20 ALIGN_4 - + .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) FLD 2 * SIZE(X) - fabs + fabs FLD 3 * SIZE(X) - fabs + fabs faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) FLD 4 * SIZE(X) - fabs + fabs FLD 5 * SIZE(X) - fabs + fabs faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) fstp %st(1) FLD 6 * SIZE(X) - fabs + fabs FLD 7 * SIZE(X) - fabs + fabs faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) @@ -163,9 +163,9 @@ .L21: FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) @@ -182,12 +182,12 @@ sarl $2, I jle .L60 ALIGN_4 - + .L50: FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs addl INCX, X faddp %st, %st(1) fcomi %st(1), %st @@ -196,9 +196,9 @@ FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs addl INCX, X faddp %st, %st(1) fcomi %st(1), %st @@ -207,9 +207,9 @@ FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs addl INCX, X faddp %st, %st(1) fcomi %st(1), %st @@ -218,9 +218,9 @@ FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs addl INCX, X faddp %st, %st(1) fcomi %st(1), %st @@ -240,9 +240,9 @@ .L61: FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs faddp %st, %st(1) fcomi %st(1), %st FMOV %st(1), %st(0) diff --git a/kernel/x86/zamax_sse.S b/kernel/x86/zamax_sse.S index 60dd25b87..49e1c9c4b 100644 --- a/kernel/x86/zamax_sse.S +++ b/kernel/x86/zamax_sse.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) @@ -54,12 +54,12 @@ #define MM %ebp #define XX %edi #define TEMP %ebx - + #ifdef USE_MIN #define maxps minps #define maxss minss #endif - + #ifndef HAVE_SSE2 #define pxor xorps #define movsd movlps @@ -124,7 +124,7 @@ sarl $3, I jle .L35 ALIGN_4 - + .L31: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) @@ -216,7 +216,7 @@ maxss %xmm1, %xmm0 maxss %xmm3, %xmm0 addl $4 * SIZE, XX - ALIGN_3 + ALIGN_3 .L37: testl $1, MM @@ -247,7 +247,7 @@ sarl $3, I jle .L75 ALIGN_4 - + .L71: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) @@ -325,7 +325,7 @@ #endif addps %xmm3, %xmm1 maxps %xmm1, %xmm0 - ALIGN_3 + ALIGN_3 .L76: testl $2, MM @@ -349,7 +349,7 @@ maxss %xmm1, %xmm0 maxss %xmm3, %xmm0 ALIGN_3 - + .L77: testl $1, MM je .L80 diff --git a/kernel/x86/zamax_sse2.S b/kernel/x86/zamax_sse2.S index 50adffbec..83f5cb831 100644 --- a/kernel/x86/zamax_sse2.S +++ b/kernel/x86/zamax_sse2.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) @@ -54,7 +54,7 @@ #define MM %ebp #define XX %edi #define TEMP %ebx - + #ifdef USE_MIN #define maxpd minpd #define maxsd minsd @@ -204,7 +204,7 @@ maxpd %xmm1, %xmm0 addl $4 * SIZE, XX - ALIGN_3 + ALIGN_3 .L27: testl $1, MM @@ -230,7 +230,7 @@ sarl $3, I jle .L65 ALIGN_4 - + .L61: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) @@ -339,7 +339,7 @@ andpd %xmm7, %xmm2 addpd %xmm2, %xmm1 maxpd %xmm1, %xmm0 - ALIGN_3 + ALIGN_3 .L67: testl $1, MM diff --git a/kernel/x86/zasum.S b/kernel/x86/zasum.S index 84b8f60cf..136120561 100644 --- a/kernel/x86/zasum.S +++ b/kernel/x86/zasum.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define STACK 8 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) @@ -49,7 +49,7 @@ #define M %edx #define X %ecx #define INCX %esi - + #define I %eax #include "l1param.h" @@ -92,7 +92,7 @@ sarl $2, I jle .L20 ALIGN_4 - + .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -157,7 +157,7 @@ sarl $2, I jle .L60 ALIGN_4 - + .L50: FLD 0 * SIZE(X) fabs diff --git a/kernel/x86/zasum_sse.S b/kernel/x86/zasum_sse.S index ff8230c51..dee096bc4 100644 --- a/kernel/x86/zasum_sse.S +++ b/kernel/x86/zasum_sse.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define STACK 8 #define ARGS 0 @@ -52,7 +52,7 @@ #define INCX %ebx #include "l1param.h" - + PROLOGUE PROFCODE @@ -79,7 +79,7 @@ movss STACK_M, %xmm3 shufps $0, %xmm3, %xmm3 #endif - + sall $ZBASE_SHIFT, INCX cmpl $2 * SIZE, INCX @@ -124,7 +124,7 @@ decl I jle .L12 ALIGN_3 - + .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -277,7 +277,7 @@ sarl $2, I jle .L105 ALIGN_4 - + .L101: movsd (X), %xmm4 addl INCX, X @@ -322,18 +322,18 @@ #ifndef HAVE_SSE3 movhlps %xmm0, %xmm1 addps %xmm1, %xmm0 - + movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 addss %xmm1, %xmm0 #else haddps %xmm0, %xmm0 haddps %xmm0, %xmm0 -#endif +#endif movss %xmm0, STACK_M flds STACK_M - + popl %ebx popl %esi ret diff --git a/kernel/x86/zasum_sse2.S b/kernel/x86/zasum_sse2.S index b7dbc1512..0c7349199 100644 --- a/kernel/x86/zasum_sse2.S +++ b/kernel/x86/zasum_sse2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define STACK 8 #define ARGS 0 @@ -111,7 +111,7 @@ decl I jle .L11 ALIGN_4 - + .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -224,7 +224,7 @@ addpd %xmm5, %xmm1 addl $4 * SIZE, X - ALIGN_3 + ALIGN_3 .L22: testl $2, M @@ -234,7 +234,7 @@ andps %xmm3, %xmm4 addpd %xmm4, %xmm0 addl $2 * SIZE, X - + .L23: testl $1, M je .L999 @@ -253,7 +253,7 @@ sarl $2, I jle .L60 ALIGN_4 - + .L50: movsd 0 * SIZE(X), %xmm4 movhps 1 * SIZE(X), %xmm4 diff --git a/kernel/x86/zaxpy.S b/kernel/x86/zaxpy.S index 0894f5dc6..b79ad795f 100644 --- a/kernel/x86/zaxpy.S +++ b/kernel/x86/zaxpy.S @@ -41,7 +41,7 @@ #define STACK 12 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #ifdef DOUBLE #define STACK_ALPHA_R 16 + STACK + ARGS(%esp) diff --git a/kernel/x86/zaxpy_sse.S b/kernel/x86/zaxpy_sse.S index 9c94cec44..3f67a0f72 100644 --- a/kernel/x86/zaxpy_sse.S +++ b/kernel/x86/zaxpy_sse.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_ALPHA_R 16 + STACK + ARGS(%esp) #define STACK_ALPHA_I 20 + STACK + ARGS(%esp) @@ -97,7 +97,7 @@ movss STACK_M, %xmm5 shufps $0x11, %xmm5, %xmm5 #endif - + shufps $0, ALPHA_R, ALPHA_R shufps $0, ALPHA_I, ALPHA_I @@ -3125,7 +3125,7 @@ addps %xmm1, %xmm4 movsd %xmm4, (Y) - + decl %eax jg .L201 diff --git a/kernel/x86/zaxpy_sse2.S b/kernel/x86/zaxpy_sse2.S index 9c2caa7e8..db6001ccd 100644 --- a/kernel/x86/zaxpy_sse2.S +++ b/kernel/x86/zaxpy_sse2.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_ALPHA_R 16 + STACK + ARGS(%esp) #define STACK_ALPHA_I 24 + STACK + ARGS(%esp) @@ -113,10 +113,10 @@ #endif #ifndef CONJ - shufps $0x0c, %xmm5, %xmm5 + shufps $0x0c, %xmm5, %xmm5 xorpd %xmm5, ALPHA_I #else - shufps $0xc0, %xmm5, %xmm5 + shufps $0xc0, %xmm5, %xmm5 xorpd %xmm5, ALPHA_R #endif @@ -1518,7 +1518,7 @@ movlpd %xmm4, 0 * SIZE(YY) movhpd %xmm4, 1 * SIZE(YY) - + decl %eax jg .L58 ALIGN_3 diff --git a/kernel/x86/zcopy.S b/kernel/x86/zcopy.S index 153853ea0..248baf48e 100644 --- a/kernel/x86/zcopy.S +++ b/kernel/x86/zcopy.S @@ -41,13 +41,13 @@ #define STACK 12 #define ARGS 0 - + #define M 4 + STACK + ARGS(%esp) #define X 8 + STACK + ARGS(%esp) #define INCX 12 + STACK + ARGS(%esp) #define Y 16 + STACK + ARGS(%esp) #define INCY 20 + STACK + ARGS(%esp) - + PROLOGUE pushl %edi @@ -71,7 +71,7 @@ sall $ZBASE_SHIFT, %esi sall $ZBASE_SHIFT, %edi - + cmpl $2 * SIZE, %esi # if incx != 1 jne .L100 cmpl $2 * SIZE, %edi # if incy != 1 @@ -84,14 +84,14 @@ .L11: #if defined(DOUBLE) || defined(XDOUBLE) - FLD 7 * SIZE(%ecx) - FLD 6 * SIZE(%ecx) - FLD 5 * SIZE(%ecx) - FLD 4 * SIZE(%ecx) - FLD 3 * SIZE(%ecx) - FLD 2 * SIZE(%ecx) - FLD 1 * SIZE(%ecx) - FLD 0 * SIZE(%ecx) + FLD 7 * SIZE(%ecx) + FLD 6 * SIZE(%ecx) + FLD 5 * SIZE(%ecx) + FLD 4 * SIZE(%ecx) + FLD 3 * SIZE(%ecx) + FLD 2 * SIZE(%ecx) + FLD 1 * SIZE(%ecx) + FLD 0 * SIZE(%ecx) FST 0 * SIZE(%edx) FST 1 * SIZE(%edx) @@ -102,10 +102,10 @@ FST 6 * SIZE(%edx) FST 7 * SIZE(%edx) #else - fldl 6 * SIZE(%ecx) - fldl 4 * SIZE(%ecx) - fldl 2 * SIZE(%ecx) - fldl 0 * SIZE(%ecx) + fldl 6 * SIZE(%ecx) + fldl 4 * SIZE(%ecx) + fldl 2 * SIZE(%ecx) + fldl 0 * SIZE(%ecx) fstpl 0 * SIZE(%edx) fstpl 2 * SIZE(%edx) diff --git a/kernel/x86/zcopy_sse.S b/kernel/x86/zcopy_sse.S index 83930057a..23e740e1a 100644 --- a/kernel/x86/zcopy_sse.S +++ b/kernel/x86/zcopy_sse.S @@ -41,7 +41,7 @@ #define STACK 12 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) @@ -89,7 +89,7 @@ subl $-32 * SIZE, X subl $-32 * SIZE, Y addl M, M - + testl $SIZE, Y je .L05 diff --git a/kernel/x86/zcopy_sse2.S b/kernel/x86/zcopy_sse2.S index f936a34a9..c31726f0d 100644 --- a/kernel/x86/zcopy_sse2.S +++ b/kernel/x86/zcopy_sse2.S @@ -41,7 +41,7 @@ #define STACK 12 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) diff --git a/kernel/x86/zdot.S b/kernel/x86/zdot.S index 9d8866ad0..5a2a758c6 100644 --- a/kernel/x86/zdot.S +++ b/kernel/x86/zdot.S @@ -41,7 +41,7 @@ #define STACK 12 #define ARGS 0 - + #if defined(DOUBLE) || defined(XDOUBLE) #define RESULT 4 + STACK + ARGS(%esp) #define STACK_N 8 + STACK + ARGS(%esp) @@ -100,9 +100,9 @@ addl INCY, INCY fldz - leal (, INCX, SIZE), INCX + leal (, INCX, SIZE), INCX fldz - leal (, INCY, SIZE), INCY + leal (, INCY, SIZE), INCY fldz cmpl $2 * SIZE, INCX diff --git a/kernel/x86/zdot_amd.S b/kernel/x86/zdot_amd.S index 97a1e721d..0a74c4766 100644 --- a/kernel/x86/zdot_amd.S +++ b/kernel/x86/zdot_amd.S @@ -41,7 +41,7 @@ #define STACK 12 #define ARGS 0 - + #if !defined(DOUBLE) && !defined(XDOUBLE) #define RESULT 4 + STACK + ARGS(%esp) #define STACK_N 8 + STACK + ARGS(%esp) @@ -98,8 +98,8 @@ addl INCX, INCX addl INCY, INCY - leal (, INCX, SIZE), INCX - leal (, INCY, SIZE), INCY + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY cmpl $2 * SIZE, INCX jne .L14 diff --git a/kernel/x86/zdot_sse.S b/kernel/x86/zdot_sse.S index cc229643b..117574ea8 100644 --- a/kernel/x86/zdot_sse.S +++ b/kernel/x86/zdot_sse.S @@ -41,7 +41,7 @@ #define STACK 12 #define ARGS 0 - + #define STACK_N 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) diff --git a/kernel/x86/zdot_sse2.S b/kernel/x86/zdot_sse2.S index 61e1bfc27..d799e5d50 100644 --- a/kernel/x86/zdot_sse2.S +++ b/kernel/x86/zdot_sse2.S @@ -41,7 +41,7 @@ #define STACK 12 #define ARGS 0 - + #define RESULT 4 + STACK + ARGS(%esp) #define STACK_N 8 + STACK + ARGS(%esp) #define STACK_X 12 + STACK + ARGS(%esp) @@ -119,7 +119,7 @@ PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -12 * SIZE(Y), %xmm6 @@ -127,7 +127,7 @@ movaps -12 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -10 * SIZE(Y), %xmm7 @@ -139,7 +139,7 @@ PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -8 * SIZE(Y), %xmm6 @@ -147,7 +147,7 @@ movaps -8 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -6 * SIZE(Y), %xmm7 @@ -159,7 +159,7 @@ PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -4 * SIZE(Y), %xmm6 @@ -167,7 +167,7 @@ movaps -4 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -2 * SIZE(Y), %xmm7 @@ -179,7 +179,7 @@ PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps 0 * SIZE(Y), %xmm6 @@ -187,7 +187,7 @@ movaps 0 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps 2 * SIZE(Y), %xmm7 @@ -203,7 +203,7 @@ ALIGN_3 .L12: - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -12 * SIZE(Y), %xmm6 @@ -211,7 +211,7 @@ movaps -12 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -10 * SIZE(Y), %xmm7 @@ -219,7 +219,7 @@ movaps -10 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -8 * SIZE(Y), %xmm6 @@ -227,7 +227,7 @@ movaps -8 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -6 * SIZE(Y), %xmm7 @@ -235,7 +235,7 @@ movaps -6 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -4 * SIZE(Y), %xmm6 @@ -243,7 +243,7 @@ movaps -4 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -2 * SIZE(Y), %xmm7 @@ -251,13 +251,13 @@ movaps -2 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 mulpd %xmm5, %xmm3 @@ -276,7 +276,7 @@ movaps -14 * SIZE(X), %xmm5 movaps -14 * SIZE(Y), %xmm7 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -12 * SIZE(Y), %xmm6 @@ -284,7 +284,7 @@ movaps -12 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -10 * SIZE(Y), %xmm7 @@ -292,13 +292,13 @@ movaps -10 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 mulpd %xmm5, %xmm3 @@ -317,13 +317,13 @@ movaps -14 * SIZE(X), %xmm5 movaps -14 * SIZE(Y), %xmm7 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 mulpd %xmm5, %xmm3 @@ -340,7 +340,7 @@ movaps -16 * SIZE(X), %xmm4 movaps -16 * SIZE(Y), %xmm6 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 @@ -370,7 +370,7 @@ PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -12 * SIZE(Y), %xmm6 @@ -379,7 +379,7 @@ movhps -11 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -10 * SIZE(Y), %xmm7 @@ -392,7 +392,7 @@ PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -8 * SIZE(Y), %xmm6 @@ -401,7 +401,7 @@ movhps -7 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -6 * SIZE(Y), %xmm7 @@ -414,7 +414,7 @@ PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -4 * SIZE(Y), %xmm6 @@ -423,7 +423,7 @@ movhps -3 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -2 * SIZE(Y), %xmm7 @@ -436,7 +436,7 @@ PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps 0 * SIZE(Y), %xmm6 @@ -445,7 +445,7 @@ movhps 1 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps 2 * SIZE(Y), %xmm7 @@ -462,7 +462,7 @@ ALIGN_3 .L22: - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -12 * SIZE(Y), %xmm6 @@ -471,7 +471,7 @@ movhps -11 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -10 * SIZE(Y), %xmm7 @@ -480,7 +480,7 @@ movhps -9 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -8 * SIZE(Y), %xmm6 @@ -489,7 +489,7 @@ movhps -7 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -6 * SIZE(Y), %xmm7 @@ -498,7 +498,7 @@ movhps -5 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -4 * SIZE(Y), %xmm6 @@ -507,7 +507,7 @@ movhps -3 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -2 * SIZE(Y), %xmm7 @@ -516,13 +516,13 @@ movhps -1 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 mulpd %xmm5, %xmm3 @@ -544,7 +544,7 @@ movhps -13 * SIZE(X), %xmm5 movaps -14 * SIZE(Y), %xmm7 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -12 * SIZE(Y), %xmm6 @@ -553,7 +553,7 @@ movhps -11 * SIZE(X), %xmm4 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -10 * SIZE(Y), %xmm7 @@ -562,13 +562,13 @@ movhps -9 * SIZE(X), %xmm5 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 mulpd %xmm5, %xmm3 @@ -586,7 +586,7 @@ movhps -15 * SIZE(X), %xmm4 movaps -16 * SIZE(Y), %xmm6 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 @@ -596,7 +596,7 @@ movhps -13 * SIZE(X), %xmm5 movaps -14 * SIZE(Y), %xmm7 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 mulpd %xmm5, %xmm3 @@ -614,7 +614,7 @@ movhps -15 * SIZE(X), %xmm4 movaps -16 * SIZE(Y), %xmm6 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 @@ -647,7 +647,7 @@ PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -12 * SIZE(X), %xmm6 @@ -656,7 +656,7 @@ movhps -11 * SIZE(Y), %xmm4 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -10 * SIZE(X), %xmm7 @@ -668,7 +668,7 @@ #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -8 * SIZE(X), %xmm6 @@ -677,7 +677,7 @@ movhps -7 * SIZE(Y), %xmm4 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -6 * SIZE(X), %xmm7 @@ -690,7 +690,7 @@ PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -4 * SIZE(X), %xmm6 @@ -699,7 +699,7 @@ movhps -3 * SIZE(Y), %xmm4 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -2 * SIZE(X), %xmm7 @@ -712,7 +712,7 @@ PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps 0 * SIZE(X), %xmm6 @@ -721,7 +721,7 @@ movhps 1 * SIZE(Y), %xmm4 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps 2 * SIZE(X), %xmm7 @@ -738,7 +738,7 @@ ALIGN_3 .L32: - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -12 * SIZE(X), %xmm6 @@ -747,7 +747,7 @@ movhps -11 * SIZE(Y), %xmm4 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -10 * SIZE(X), %xmm7 @@ -756,7 +756,7 @@ movhps -9 * SIZE(Y), %xmm5 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -8 * SIZE(X), %xmm6 @@ -765,7 +765,7 @@ movhps -7 * SIZE(Y), %xmm4 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -6 * SIZE(X), %xmm7 @@ -774,7 +774,7 @@ movhps -5 * SIZE(Y), %xmm5 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -4 * SIZE(X), %xmm6 @@ -783,7 +783,7 @@ movhps -3 * SIZE(Y), %xmm4 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -2 * SIZE(X), %xmm7 @@ -792,13 +792,13 @@ movhps -1 * SIZE(Y), %xmm5 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 mulpd %xmm5, %xmm3 @@ -820,7 +820,7 @@ movhps -13 * SIZE(Y), %xmm5 movaps -14 * SIZE(X), %xmm7 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 movaps -12 * SIZE(X), %xmm6 @@ -829,7 +829,7 @@ movhps -11 * SIZE(Y), %xmm4 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 movaps -10 * SIZE(X), %xmm7 @@ -838,13 +838,13 @@ movhps -9 * SIZE(Y), %xmm5 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 mulpd %xmm5, %xmm3 @@ -862,7 +862,7 @@ movhps -15 * SIZE(Y), %xmm4 movaps -16 * SIZE(X), %xmm6 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 @@ -872,7 +872,7 @@ movhps -13 * SIZE(Y), %xmm5 movaps -14 * SIZE(X), %xmm7 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 mulpd %xmm5, %xmm3 @@ -893,7 +893,7 @@ movhps -15 * SIZE(Y), %xmm4 movaps -16 * SIZE(X), %xmm6 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 @@ -939,7 +939,7 @@ #endif movsd %xmm7, %xmm6 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 @@ -949,7 +949,7 @@ addpd %xmm3, %xmm1 movsd %xmm6, %xmm7 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 movsd %xmm4, %xmm5 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 @@ -959,7 +959,7 @@ addpd %xmm3, %xmm1 movsd %xmm7, %xmm6 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 @@ -969,7 +969,7 @@ addpd %xmm3, %xmm1 movsd %xmm6, %xmm7 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 movsd %xmm4, %xmm5 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 @@ -979,7 +979,7 @@ addpd %xmm3, %xmm1 movsd %xmm7, %xmm6 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 @@ -989,7 +989,7 @@ addpd %xmm3, %xmm1 movsd %xmm6, %xmm7 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 movsd %xmm4, %xmm5 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 @@ -999,7 +999,7 @@ addpd %xmm3, %xmm1 movsd %xmm7, %xmm6 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 @@ -1009,7 +1009,7 @@ addpd %xmm3, %xmm1 movsd %xmm6, %xmm7 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 movsd %xmm4, %xmm5 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 @@ -1027,7 +1027,7 @@ .L42: movsd %xmm7, %xmm6 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 @@ -1037,7 +1037,7 @@ addpd %xmm3, %xmm1 movsd %xmm6, %xmm7 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 movsd %xmm4, %xmm5 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 @@ -1047,7 +1047,7 @@ addpd %xmm3, %xmm1 movsd %xmm7, %xmm6 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 @@ -1057,7 +1057,7 @@ addpd %xmm3, %xmm1 movsd %xmm6, %xmm7 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 movsd %xmm4, %xmm5 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 @@ -1067,7 +1067,7 @@ addpd %xmm3, %xmm1 movsd %xmm7, %xmm6 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 @@ -1077,7 +1077,7 @@ addpd %xmm3, %xmm1 movsd %xmm6, %xmm7 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 movsd %xmm4, %xmm5 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 @@ -1087,7 +1087,7 @@ addpd %xmm3, %xmm1 movsd %xmm7, %xmm6 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 @@ -1097,7 +1097,7 @@ addpd %xmm3, %xmm1 movsd %xmm6, %xmm7 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 movsd %xmm4, %xmm5 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 @@ -1116,7 +1116,7 @@ movaps -16 * SIZE(Y), %xmm7 movsd %xmm7, %xmm6 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 @@ -1126,7 +1126,7 @@ addpd %xmm3, %xmm1 movsd %xmm6, %xmm7 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 movsd %xmm4, %xmm5 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 @@ -1136,7 +1136,7 @@ addpd %xmm3, %xmm1 movsd %xmm7, %xmm6 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 @@ -1146,7 +1146,7 @@ addpd %xmm3, %xmm1 movsd %xmm6, %xmm7 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 movsd %xmm4, %xmm5 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 @@ -1165,7 +1165,7 @@ movaps -16 * SIZE(Y), %xmm7 movsd %xmm7, %xmm6 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 @@ -1175,7 +1175,7 @@ addpd %xmm3, %xmm1 movsd %xmm6, %xmm7 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 movsd %xmm4, %xmm5 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 @@ -1193,7 +1193,7 @@ movlpd -16 * SIZE(X), %xmm4 movlpd -16 * SIZE(Y), %xmm6 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 @@ -1205,7 +1205,7 @@ SHUFPD_1 %xmm1, %xmm1 SHUFPD_1 %xmm2, %xmm2 SHUFPD_1 %xmm3, %xmm3 - jmp .L98 + jmp .L98 ALIGN_3 .L50: @@ -1232,7 +1232,7 @@ ALIGN_3 .L53: - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 MOVLPS 0 * SIZE(Y), %xmm6 @@ -1244,7 +1244,7 @@ addl INCX, X addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 MOVLPS 0 * SIZE(Y), %xmm7 @@ -1256,7 +1256,7 @@ addl INCX, X addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 MOVLPS 0 * SIZE(Y), %xmm6 @@ -1268,7 +1268,7 @@ addl INCX, X addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 MOVLPS 0 * SIZE(Y), %xmm7 @@ -1280,7 +1280,7 @@ addl INCX, X addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 MOVLPS 0 * SIZE(Y), %xmm6 @@ -1292,7 +1292,7 @@ addl INCX, X addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 MOVLPS 0 * SIZE(Y), %xmm7 @@ -1304,7 +1304,7 @@ addl INCX, X addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 MOVLPS 0 * SIZE(Y), %xmm6 @@ -1316,7 +1316,7 @@ addl INCX, X addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 MOVLPS 0 * SIZE(Y), %xmm7 @@ -1333,7 +1333,7 @@ ALIGN_3 .L54: - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 MOVLPS 0 * SIZE(Y), %xmm6 @@ -1345,7 +1345,7 @@ addl INCX, X addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 MOVLPS 0 * SIZE(Y), %xmm7 @@ -1357,7 +1357,7 @@ addl INCX, X addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 MOVLPS 0 * SIZE(Y), %xmm6 @@ -1369,7 +1369,7 @@ addl INCX, X addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 MOVLPS 0 * SIZE(Y), %xmm7 @@ -1381,7 +1381,7 @@ addl INCX, X addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 MOVLPS 0 * SIZE(Y), %xmm6 @@ -1393,7 +1393,7 @@ addl INCX, X addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 MOVLPS 0 * SIZE(Y), %xmm7 @@ -1405,13 +1405,13 @@ addl INCX, X addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 mulpd %xmm5, %xmm3 @@ -1436,7 +1436,7 @@ movhps 1 * SIZE(Y), %xmm7 addl INCY, Y - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 MOVLPS 0 * SIZE(Y), %xmm6 @@ -1448,7 +1448,7 @@ addl INCX, X addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 MOVLPS 0 * SIZE(Y), %xmm7 @@ -1460,13 +1460,13 @@ addl INCX, X addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 addpd %xmm3, %xmm1 - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 mulpd %xmm5, %xmm3 @@ -1484,7 +1484,7 @@ movhps 1 * SIZE(Y), %xmm6 addl INCY, Y - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 @@ -1497,7 +1497,7 @@ movhps 1 * SIZE(Y), %xmm7 addl INCY, Y - pshufd $0x4e, %xmm7, %xmm3 + pshufd $0x4e, %xmm7, %xmm3 mulpd %xmm5, %xmm7 addpd %xmm7, %xmm0 mulpd %xmm5, %xmm3 @@ -1513,7 +1513,7 @@ MOVLPS 0 * SIZE(Y), %xmm6 movhps 1 * SIZE(Y), %xmm6 - pshufd $0x4e, %xmm6, %xmm3 + pshufd $0x4e, %xmm6, %xmm3 mulpd %xmm4, %xmm6 addpd %xmm6, %xmm0 mulpd %xmm4, %xmm3 diff --git a/kernel/x86/zgemm3m_kernel_1x4_athlon.S b/kernel/x86/zgemm3m_kernel_1x4_athlon.S index c57a8cb7a..4d84e5058 100644 --- a/kernel/x86/zgemm3m_kernel_1x4_athlon.S +++ b/kernel/x86/zgemm3m_kernel_1x4_athlon.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define J 0 + STACK(%esp) #define I 4 + STACK(%esp) #define KK 8 + STACK(%esp) @@ -132,7 +132,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl STACK_A, A movl C, %edi @@ -152,7 +152,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -166,7 +166,7 @@ jle .L13 ALIGN_4 -.L12: +.L12: movl -16 * SIZE(B), %esi movl -8 * SIZE(B), %esi movl 0 * SIZE(B), %esi @@ -227,7 +227,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -533,7 +533,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl STACK_A, A movl C, %edi @@ -553,7 +553,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -567,7 +567,7 @@ jle .L23 ALIGN_4 -.L22: +.L22: movl -16 * SIZE(B), %esi movl -8 * SIZE(B), %esi movl 0 * SIZE(B), %esi @@ -610,7 +610,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -774,7 +774,7 @@ #endif leal (, LDC, 2), %eax - addl %eax, C + addl %eax, C movl B, B_ORIG ALIGN_4 @@ -788,7 +788,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl STACK_A, A movl C, %edi @@ -808,7 +808,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -822,7 +822,7 @@ jle .L33 ALIGN_4 -.L32: +.L32: movl -16 * SIZE(B), %esi movl -8 * SIZE(B), %esi movl 0 * SIZE(B), %esi @@ -861,7 +861,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -964,7 +964,7 @@ addl $1, KK #endif - addl LDC, C + addl LDC, C movl B, B_ORIG ALIGN_4 diff --git a/kernel/x86/zgemm3m_kernel_2x2_atom.S b/kernel/x86/zgemm3m_kernel_2x2_atom.S index ee918bfc0..51e948e15 100644 --- a/kernel/x86/zgemm3m_kernel_2x2_atom.S +++ b/kernel/x86/zgemm3m_kernel_2x2_atom.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -84,7 +84,7 @@ movl OFFSET, %eax #ifndef LEFT negl %eax -#endif +#endif movl %eax, KK #endif @@ -100,7 +100,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax sall $BASE_SHIFT + 1, %eax @@ -129,7 +129,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), BB -#endif +#endif movl BX, %eax prefetcht0 0 * SIZE(%eax) @@ -151,7 +151,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -325,7 +325,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), BB -#endif +#endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm2, %xmm2 @@ -338,7 +338,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -466,7 +466,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl C, CO1 addl LDC, C @@ -489,7 +489,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), BB -#endif +#endif movsd 0 * SIZE(BB), %xmm1 xorps %xmm0, %xmm0 @@ -503,7 +503,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -629,7 +629,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), BB -#endif +#endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 @@ -641,7 +641,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT diff --git a/kernel/x86/zgemm3m_kernel_2x2_coppermine.S b/kernel/x86/zgemm3m_kernel_2x2_coppermine.S index 674829f80..291dfa61c 100644 --- a/kernel/x86/zgemm3m_kernel_2x2_coppermine.S +++ b/kernel/x86/zgemm3m_kernel_2x2_coppermine.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define J 0 + STACK(%esp) #define BX 4 + STACK(%esp) #define KK 8 + STACK(%esp) @@ -73,7 +73,7 @@ #else #define REP rep #endif - + PROLOGUE subl $ARGS, %esp # Generate Stack Frame @@ -90,14 +90,14 @@ negl %eax movl %eax, KK #endif - + movl N, %eax # j = (n >> 1) # MEMORY movl LDC, %ebp # ldc # MEMORY movl B, %ebx sall $ZBASE_SHIFT, %ebp - sarl $1, %eax + sarl $1, %eax leal 0(%ecx) , %ecx # NOP movl %eax, J # j = (n >> 1) # MEMORY @@ -109,7 +109,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl %ebx, BX @@ -130,7 +130,7 @@ leal (, %eax, SIZE), %eax leal (%edx, %eax, 2), %edx leal (%ebx, %eax, 2), %ecx -#endif +#endif #ifdef HAVE_SSE movl BX, %eax @@ -167,7 +167,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -402,7 +402,7 @@ leal (, %eax, SIZE), %eax leal (%edx, %eax, 1), %edx leal (%ebx, %eax, 2), %ecx -#endif +#endif fldz fldz @@ -413,7 +413,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -522,11 +522,11 @@ movl N, %eax # n # MEMORY andl $1, %eax je .End - + #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl C, %edi # c # MEMORY movl A, %edx # a # MEMORY @@ -535,7 +535,7 @@ sarl $1, %esi # m >> 1 je .L36 ALIGN_4 - + .L46: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ @@ -546,14 +546,14 @@ leal (, %eax, SIZE), %eax leal (%edx, %eax, 2), %edx leal (%ebx, %eax, 1), %ecx -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -666,14 +666,14 @@ leal (, %eax, SIZE), %eax leal (%edx, %eax, 1), %edx leal (%ebx, %eax, 1), %ecx -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT diff --git a/kernel/x86/zgemm3m_kernel_2x4_barcelona.S b/kernel/x86/zgemm3m_kernel_2x4_barcelona.S index 7822094e4..98d82ed6e 100644 --- a/kernel/x86/zgemm3m_kernel_2x4_barcelona.S +++ b/kernel/x86/zgemm3m_kernel_2x4_barcelona.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -202,7 +202,7 @@ #ifndef LEFT negl %eax -#endif +#endif movl %eax, KK #endif @@ -222,7 +222,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif leal GEMM_DEFAULT_Q * GEMM_DEFAULT_UNROLL_N * SIZE(B), %eax movl %eax, BX @@ -246,7 +246,7 @@ leal (, %eax, SIZE), %eax leal (AO, %eax, 2), AO leal (B, %eax, 4), BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm1 @@ -274,7 +274,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -517,7 +517,7 @@ leal (, %eax, SIZE), %eax leal (AO, %eax, 1), AO leal (B, %eax, 4), BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm1 @@ -533,7 +533,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -695,7 +695,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl C, CO # coffset = c movl A, AO # aoffset = a @@ -716,7 +716,7 @@ leal (, %eax, SIZE), %eax leal (AO, %eax, 2), AO leal (B, %eax, 2), BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 pxor %xmm4, %xmm4 @@ -731,7 +731,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -898,7 +898,7 @@ leal (, %eax, SIZE), %eax leal (AO, %eax, 1), AO leal (B, %eax, 2), BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 pxor %xmm4, %xmm4 @@ -911,7 +911,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1030,7 +1030,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl C, CO # coffset = c movl A, AO # aoffset = a @@ -1051,7 +1051,7 @@ leal (, %eax, SIZE), %eax leal (AO, %eax, 2), AO leal (B, %eax, 1), BO -#endif +#endif movddup -16 * SIZE(BO), %xmm0 pxor %xmm4, %xmm4 @@ -1066,7 +1066,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1186,7 +1186,7 @@ leal (, %eax, SIZE), %eax leal (AO, %eax, 1), AO leal (B, %eax, 1), BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 pxor %xmm4, %xmm4 @@ -1199,7 +1199,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT diff --git a/kernel/x86/zgemm3m_kernel_2x4_opteron.S b/kernel/x86/zgemm3m_kernel_2x4_opteron.S index 8e93a28e8..30d8090ae 100644 --- a/kernel/x86/zgemm3m_kernel_2x4_opteron.S +++ b/kernel/x86/zgemm3m_kernel_2x4_opteron.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define OLD_M 4 + STACK + ARGS(%esi) #define OLD_N 8 + STACK + ARGS(%esi) #define OLD_K 12 + STACK + ARGS(%esi) @@ -237,7 +237,7 @@ movss %xmm4, KK #ifndef LEFT negl KK -#endif +#endif #endif sall $ZBASE_SHIFT, LDC @@ -251,7 +251,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif /* Copying to Sub Buffer */ movl K, %eax @@ -259,7 +259,7 @@ sarl $1, %eax jle .L05 ALIGN_4 - + .L02: #define COPYPREFETCH 40 @@ -320,7 +320,7 @@ addl $4 * SIZE, %edi ALIGN_4 - + .L10: movl %edi, BX @@ -343,7 +343,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 8), BB -#endif +#endif movl BX, %eax @@ -374,7 +374,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -389,7 +389,7 @@ andl $-8, %eax sall $4, %eax je .L15 -.L1X: +.L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ -630,7 +630,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -648,7 +648,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -862,7 +862,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif /* Copying to Sub Buffer */ movl K, %eax @@ -870,7 +870,7 @@ sarl $2, %eax jle .L35 ALIGN_4 - + .L32: #ifdef PENTIUM4 #ifdef HAVE_SSE3 @@ -1002,7 +1002,7 @@ decl %eax jne .L36 ALIGN_4 - + .L40: movl C, %esi # coffset = c movl A, AA # aoffset = a @@ -1023,7 +1023,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -1050,7 +1050,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1220,7 +1220,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -1239,7 +1239,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1384,14 +1384,14 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax leal BUFFER, %ecx sarl $3, %eax jle .L65 ALIGN_4 - + .L62: #ifdef PENTIUM4 #ifdef HAVE_SSE3 @@ -1512,7 +1512,7 @@ decl %eax jne .L66 ALIGN_4 - + .L70: movl C, %esi # coffset = c movl A, AA # aoffset = a @@ -1533,7 +1533,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -1558,7 +1558,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1679,7 +1679,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -1698,7 +1698,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1793,7 +1793,7 @@ .L999: movl OLD_STACK, %esp EMMS - + popl %ebx popl %esi popl %edi diff --git a/kernel/x86/zgemm3m_kernel_2x4_penryn.S b/kernel/x86/zgemm3m_kernel_2x4_penryn.S index 392064987..f3e94a62d 100644 --- a/kernel/x86/zgemm3m_kernel_2x4_penryn.S +++ b/kernel/x86/zgemm3m_kernel_2x4_penryn.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -103,7 +103,7 @@ movl OFFSET, %eax #ifndef LEFT negl %eax -#endif +#endif movl %eax, KK #endif @@ -122,7 +122,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax sall $BASE_SHIFT + 2, %eax @@ -176,7 +176,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -481,7 +481,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -653,7 +653,7 @@ movlps %xmm1, 0 * SIZE(%eax, LDC) movhps %xmm1, 1 * SIZE(%eax, LDC) ALIGN_4 - + .L29: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK @@ -675,7 +675,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl C, C1 movl A, AA @@ -713,7 +713,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -917,7 +917,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1041,7 +1041,7 @@ movlps %xmm1, 0 * SIZE(C1, LDC) movhps %xmm1, 1 * SIZE(C1, LDC) ALIGN_4 - + .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK @@ -1061,7 +1061,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl C, C1 movl A, AA @@ -1096,7 +1096,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1246,7 +1246,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1331,7 +1331,7 @@ movlps %xmm0, 0 * SIZE(C1) movhps %xmm0, 1 * SIZE(C1) ALIGN_4 - + .L999: popl %ebx popl %esi diff --git a/kernel/x86/zgemm3m_kernel_2x4_prescott.S b/kernel/x86/zgemm3m_kernel_2x4_prescott.S index a32e0ae94..f7cd2ae44 100644 --- a/kernel/x86/zgemm3m_kernel_2x4_prescott.S +++ b/kernel/x86/zgemm3m_kernel_2x4_prescott.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -207,7 +207,7 @@ movl OFFSET, %eax #ifndef LEFT negl %eax -#endif +#endif movl %eax, KK #endif @@ -223,7 +223,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax sall $BASE_SHIFT + 2, %eax @@ -249,7 +249,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 4), BB -#endif +#endif movl BX, %eax prefetcht2 0 * SIZE(%eax) @@ -278,7 +278,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -293,7 +293,7 @@ andl $-8, %eax sall $4, %eax je .L15 - + .L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) @@ -715,7 +715,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 4), BB -#endif +#endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -731,7 +731,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -944,7 +944,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl M, %ebx sarl $1, %ebx # i = (m >> 2) @@ -962,7 +962,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), BB -#endif +#endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -988,7 +988,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1155,7 +1155,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 2), BB -#endif +#endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1171,7 +1171,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1314,7 +1314,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl M, %ebx sarl $1, %ebx # i = (m >> 2) @@ -1332,7 +1332,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 1), BB -#endif +#endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1352,7 +1352,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1469,7 +1469,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 1), AA leal (B, %eax, 1), BB -#endif +#endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1485,7 +1485,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT diff --git a/kernel/x86/zgemm3m_kernel_4x2_core2.S b/kernel/x86/zgemm3m_kernel_4x2_core2.S index 0c01de87e..00f440982 100644 --- a/kernel/x86/zgemm3m_kernel_4x2_core2.S +++ b/kernel/x86/zgemm3m_kernel_4x2_core2.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define OLD_M 4 + STACK + ARGS(%esi) #define OLD_N 8 + STACK + ARGS(%esi) #define OLD_K 12 + STACK + ARGS(%esi) @@ -130,13 +130,13 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax sarl $2, %eax jle .L05 ALIGN_4 - + .L02: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 @@ -183,7 +183,7 @@ decl %eax jne .L06 ALIGN_4 - + .L10: movl B, BX @@ -229,7 +229,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -517,7 +517,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax addl $2, %eax @@ -685,7 +685,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -828,13 +828,13 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax sarl $3, %eax jle .L45 ALIGN_4 - + .L42: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 @@ -876,7 +876,7 @@ decl %eax jne .L46 ALIGN_4 - + .L50: movl C, C1 movl A, AA @@ -914,7 +914,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1086,7 +1086,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1219,7 +1219,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax addl $1, %eax @@ -1316,7 +1316,7 @@ .L999: movl OLD_STACK, %esp - + EMMS popl %ebx diff --git a/kernel/x86/zgemm3m_kernel_4x2_northwood.S b/kernel/x86/zgemm3m_kernel_4x2_northwood.S index fb7d63954..883a874c6 100644 --- a/kernel/x86/zgemm3m_kernel_4x2_northwood.S +++ b/kernel/x86/zgemm3m_kernel_4x2_northwood.S @@ -47,7 +47,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -58,7 +58,7 @@ #define STACK_C 40 + STACK + ARGS(%esi) #define STACK_LDC 44 + STACK + ARGS(%esi) #define STACK_OFFT 48 + STACK + ARGS(%esi) - + #define ALPHA 0(%esp) #define K 16(%esp) #define N 20(%esp) @@ -243,7 +243,7 @@ movd %mm4, KK #ifndef LEFT negl KK -#endif +#endif #endif sall $ZBASE_SHIFT, LDC @@ -252,12 +252,12 @@ movl %eax, J jle .L100 ALIGN_2 - + .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif /* Copying to Sub Buffer */ leal BUFFER, %ecx @@ -265,7 +265,7 @@ sarl $2, %eax jle .L03 ALIGN_2 - + .L02: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 @@ -326,7 +326,7 @@ BRANCH jne .L04 ALIGN_4 - + .L05: movl B, BX @@ -370,7 +370,7 @@ movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 -#endif +#endif prefetchnta 3 * SIZE(%esi) prefetchnta 3 * SIZE(%esi, LDC) @@ -385,7 +385,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -403,7 +403,7 @@ je .L12 sall $3, %eax .align 8 - + .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) @@ -496,7 +496,7 @@ subl $64 * 8, %eax BRANCH jg .L1X - + .L11: leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB @@ -505,7 +505,7 @@ sarl $3, %eax je .L12 -.L11: +.L11: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) @@ -550,7 +550,7 @@ addl $4 * SIZE, BB # boffset1 += 8 subl $1, %eax jg .L13 - ALIGN_4 + ALIGN_4 .L14: movsd 0 * SIZE(%esi), %xmm0 @@ -668,14 +668,14 @@ pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax addl $2, %eax @@ -684,7 +684,7 @@ sarl $3, %eax je .L32 -.L31: +.L31: mulpd %xmm0, %xmm2 mulpd 2 * SIZE(BB), %xmm0 addpd %xmm2, %xmm4 @@ -846,14 +846,14 @@ pxor %xmm6, %xmm6 movsd 4 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -866,7 +866,7 @@ sarl $3, %eax je .L52 -.L51: +.L51: mulsd %xmm0, %xmm2 mulsd 2 * SIZE(BB), %xmm0 addsd %xmm2, %xmm4 @@ -988,12 +988,12 @@ testl $1, %eax jle .L999 ALIGN_2 - + .L101: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif /* Copying to Sub Buffer */ leal BUFFER, %ecx @@ -1002,7 +1002,7 @@ sarl $3, %eax jle .L103 ALIGN_4 - + .L102: movsd 0 * SIZE(B), %xmm0 movsd 1 * SIZE(B), %xmm1 @@ -1059,7 +1059,7 @@ decl %eax jne .L104 ALIGN_4 - + .L105: movl C, %esi # coffset = c movl A, %edx # aoffset = a @@ -1099,7 +1099,7 @@ pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL @@ -1107,7 +1107,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1120,7 +1120,7 @@ sarl $3, %eax je .L112 -.L111: +.L111: mulpd %xmm2, %xmm0 mulpd 2 * SIZE(AA), %xmm2 addpd %xmm0, %xmm4 @@ -1244,7 +1244,7 @@ BRANCH decl %ebx # i -- jg .L110 - ALIGN_2 + ALIGN_2 .L130: movl M, %ebx @@ -1281,7 +1281,7 @@ pxor %xmm6, %xmm6 movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL @@ -1289,7 +1289,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1302,7 +1302,7 @@ sarl $3, %eax je .L132 -.L131: +.L131: mulpd %xmm0, %xmm2 movapd 2 * SIZE(AA), %xmm0 addpd %xmm2, %xmm4 @@ -1420,14 +1420,14 @@ pxor %xmm6, %xmm6 movapd 4 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax addl $1, %eax @@ -1436,7 +1436,7 @@ sarl $3, %eax je .L152 -.L151: +.L151: mulsd %xmm0, %xmm2 movsd 1 * SIZE(AA), %xmm0 addsd %xmm2, %xmm4 diff --git a/kernel/x86/zgemm3m_kernel_4x4_barcelona.S b/kernel/x86/zgemm3m_kernel_4x4_barcelona.S index 623f0beec..fcdc334a8 100644 --- a/kernel/x86/zgemm3m_kernel_4x4_barcelona.S +++ b/kernel/x86/zgemm3m_kernel_4x4_barcelona.S @@ -40,7 +40,7 @@ #include "common.h" #define STACK 16 - + #define OLD_M 4 + STACK(%esi) #define OLD_N 8 + STACK(%esi) #define OLD_K 12 + STACK(%esi) @@ -207,7 +207,7 @@ andl $-1024, %esp # align stack STACK_TOUCHING - + movl OLD_N, %eax movl OLD_K, %ecx movl OLD_A, %edx @@ -235,7 +235,7 @@ movss %xmm4, KK #ifndef LEFT negl KK -#endif +#endif #endif sall $ZBASE_SHIFT, LDC @@ -247,7 +247,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif /* Copying to Sub Buffer */ leal BUFFER, %ecx @@ -256,7 +256,7 @@ sarl $1, %eax jle .L05 ALIGN_4 - + .L02: prefetch (RPREFETCHSIZE + 0) * SIZE(%edi) @@ -312,7 +312,7 @@ addl $4 * SIZE, %edi ALIGN_4 - + .L10: movl C, %esi # coffset = c movl A, %edx # aoffset = a @@ -333,7 +333,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 8), BB -#endif +#endif movaps 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -356,7 +356,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -370,7 +370,7 @@ andl $-8, %eax sall $4, %eax je .L15 -.L1X: +.L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) @@ -592,7 +592,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB -#endif +#endif movsd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -608,7 +608,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -837,7 +837,7 @@ leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB leal (BB, %eax, 8), BB -#endif +#endif movss 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -853,7 +853,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1052,14 +1052,14 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax leal BUFFER, %ecx sarl $2, %eax jle .L45 ALIGN_4 - + .L42: prefetch (RPREFETCHSIZE + 0) * SIZE(%edi) @@ -1116,7 +1116,7 @@ decl %eax jne .L46 ALIGN_4 - + .L50: movl C, %esi # coffset = c movl A, %edx # aoffset = a @@ -1137,7 +1137,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -1157,7 +1157,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1325,7 +1325,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -1344,7 +1344,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1501,7 +1501,7 @@ leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -1520,7 +1520,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1667,7 +1667,7 @@ addl $2, KK #endif leal (, LDC, 2), %eax - addl %eax, C + addl %eax, C ALIGN_4 .L80: @@ -1677,7 +1677,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif leal BUFFER, %ecx @@ -1685,7 +1685,7 @@ sarl $3, %eax jle .L85 ALIGN_4 - + .L82: prefetch (RPREFETCHSIZE + 0) * SIZE(%edi) @@ -1738,7 +1738,7 @@ decl %eax jne .L86 ALIGN_4 - + .L90: movl C, %esi # coffset = c movl A, %edx # aoffset = a @@ -1759,7 +1759,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -1778,7 +1778,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1901,7 +1901,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -1920,7 +1920,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -2034,7 +2034,7 @@ leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -2053,7 +2053,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT diff --git a/kernel/x86/zgemm3m_kernel_4x4_opteron.S b/kernel/x86/zgemm3m_kernel_4x4_opteron.S index 511fc8b05..70c3dd892 100644 --- a/kernel/x86/zgemm3m_kernel_4x4_opteron.S +++ b/kernel/x86/zgemm3m_kernel_4x4_opteron.S @@ -40,7 +40,7 @@ #include "common.h" #define STACK 16 - + #define OLD_M 4 + STACK(%esi) #define OLD_N 8 + STACK(%esi) #define OLD_K 12 + STACK(%esi) @@ -204,7 +204,7 @@ addps %xmm1, %xmm7; \ movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; #endif - + #ifdef PENTIUM4 #define KERNEL1(address) \ mulps %xmm0, %xmm2; \ @@ -330,7 +330,7 @@ PROFCODE EMMS - + movl %esp, %esi # save old stack subl $128 + LOCAL_BUFFER_SIZE, %esp movl OLD_M, %ebx @@ -371,7 +371,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif /* Copying to Sub Buffer */ leal BUFFER, %ecx @@ -380,7 +380,7 @@ sarl $1, %eax jle .L05 ALIGN_4 - + .L02: #ifdef HAVE_SSE2 movss 0 * SIZE(%edi), %xmm0 @@ -516,7 +516,7 @@ #endif addl $4 * SIZE, %edi ALIGN_4 - + .L10: movl %edi, BX @@ -539,7 +539,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 8), BB -#endif +#endif movl BX, %eax @@ -599,7 +599,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -614,7 +614,7 @@ andl $-8, %eax sall $4, %eax je .L15 -.L1X: +.L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) @@ -858,7 +858,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB -#endif +#endif movsd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -874,7 +874,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1103,7 +1103,7 @@ leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB leal (BB, %eax, 8), BB -#endif +#endif movss 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1119,7 +1119,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1321,14 +1321,14 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax leal BUFFER, %ecx sarl $2, %eax jle .L45 ALIGN_4 - + .L42: prefetchnta 80 * SIZE(%edi) @@ -1453,7 +1453,7 @@ decl %eax jne .L46 ALIGN_4 - + .L50: movl C, %esi # coffset = c movl A, %edx # aoffset = a @@ -1474,7 +1474,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -1499,7 +1499,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1667,7 +1667,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -1686,7 +1686,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1843,7 +1843,7 @@ leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -1862,7 +1862,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1981,7 +1981,7 @@ addl $2, KK #endif leal (, LDC, 2), %eax - addl %eax, C + addl %eax, C ALIGN_4 .L80: @@ -1991,14 +1991,14 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax leal BUFFER, %ecx sarl $3, %eax jle .L85 ALIGN_4 - + .L82: prefetchnta 80 * SIZE(%edi) @@ -2112,7 +2112,7 @@ decl %eax jne .L86 ALIGN_4 - + .L90: movl C, %esi # coffset = c movl A, %edx # aoffset = a @@ -2133,7 +2133,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -2156,7 +2156,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -2279,7 +2279,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -2298,7 +2298,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -2411,7 +2411,7 @@ leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -2430,7 +2430,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT diff --git a/kernel/x86/zgemm3m_kernel_4x4_penryn.S b/kernel/x86/zgemm3m_kernel_4x4_penryn.S index 802298cf2..df38500bb 100644 --- a/kernel/x86/zgemm3m_kernel_4x4_penryn.S +++ b/kernel/x86/zgemm3m_kernel_4x4_penryn.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -86,7 +86,7 @@ movl OFFSET, %eax #ifndef LEFT negl %eax -#endif +#endif movl %eax, KK #endif @@ -105,7 +105,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax sall $BASE_SHIFT + 2, %eax @@ -160,7 +160,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -445,7 +445,7 @@ movhps %xmm0, 2 * SIZE(%eax, LDC) movlps %xmm1, 4 * SIZE(%eax, LDC) movhps %xmm1, 6 * SIZE(%eax, LDC) - + addl $8 * SIZE, C1 decl I jg .L11 @@ -481,7 +481,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -693,7 +693,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -836,7 +836,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl C, C1 movl A, AA @@ -877,7 +877,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1079,7 +1079,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1232,7 +1232,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1359,7 +1359,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl C, C1 movl A, AA @@ -1396,7 +1396,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1549,7 +1549,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1687,7 +1687,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT diff --git a/kernel/x86/zgemm3m_kernel_4x4_prescott.S b/kernel/x86/zgemm3m_kernel_4x4_prescott.S index 3d602e3e4..bdb19e12f 100644 --- a/kernel/x86/zgemm3m_kernel_4x4_prescott.S +++ b/kernel/x86/zgemm3m_kernel_4x4_prescott.S @@ -40,7 +40,7 @@ #include "common.h" #define STACK 16 - + #define OLD_M 4 + STACK(%esi) #define OLD_N 8 + STACK(%esi) #define OLD_K 12 + STACK(%esi) @@ -248,7 +248,7 @@ movss %xmm4, KK #ifndef LEFT negl KK -#endif +#endif #endif sall $ZBASE_SHIFT, LDC @@ -261,7 +261,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif /* Copying to Sub Buffer */ leal BUFFER, %ecx @@ -270,7 +270,7 @@ sarl $2, %eax jle .L05 ALIGN_4 - + .L02: movddup 0 * SIZE(%edi), %xmm0 movddup 2 * SIZE(%edi), %xmm1 @@ -318,7 +318,7 @@ decl %eax jne .L06 ALIGN_4 - + .L10: movl C, %esi # coffset = c movl A, %edx # aoffset = a @@ -339,7 +339,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB -#endif +#endif movaps 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -362,7 +362,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -377,7 +377,7 @@ andl $-8, %eax sall $4, %eax je .L15 -.L1X: +.L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) @@ -703,7 +703,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB -#endif +#endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -717,7 +717,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -900,7 +900,7 @@ leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB -#endif +#endif movss 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -914,7 +914,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1054,14 +1054,14 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax leal BUFFER, %ecx sarl $3, %eax jle .L45 ALIGN_4 - + .L42: movddup 0 * SIZE(%edi), %xmm0 movddup 2 * SIZE(%edi), %xmm1 @@ -1106,7 +1106,7 @@ decl %eax jne .L46 ALIGN_4 - + .L50: movl C, %esi # coffset = c movl A, %edx # aoffset = a @@ -1127,7 +1127,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB -#endif +#endif movaps 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1146,7 +1146,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1311,7 +1311,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB -#endif +#endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1327,7 +1327,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1455,7 +1455,7 @@ leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB -#endif +#endif movss 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1471,7 +1471,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1578,7 +1578,7 @@ addl $2, KK #endif leal (, LDC, 2), %eax - addl %eax, C + addl %eax, C ALIGN_4 .L80: @@ -1588,14 +1588,14 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax leal BUFFER, %ecx sarl $3, %eax jle .L85 ALIGN_4 - + .L82: movss 0 * SIZE(%edi), %xmm0 movss 1 * SIZE(%edi), %xmm1 @@ -1649,7 +1649,7 @@ decl %eax jne .L86 ALIGN_4 - + .L90: movl C, %esi # coffset = c movl A, %edx # aoffset = a @@ -1670,7 +1670,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 1), BB -#endif +#endif movaps 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1690,7 +1690,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1811,7 +1811,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 1), BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -1830,7 +1830,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1942,7 +1942,7 @@ leal (, %eax, 4), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB -#endif +#endif movss 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1958,7 +1958,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT diff --git a/kernel/x86/zgemm3m_kernel_8x2_core2.S b/kernel/x86/zgemm3m_kernel_8x2_core2.S index 9a28c8ec3..d387dd14b 100644 --- a/kernel/x86/zgemm3m_kernel_8x2_core2.S +++ b/kernel/x86/zgemm3m_kernel_8x2_core2.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -83,7 +83,7 @@ #else #define MOVSD movsd #endif - + PROLOGUE pushl %ebp @@ -140,13 +140,13 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax sarl $2, %eax jle .L05 ALIGN_4 - + .L02: prefetcht0 (PREFETCH_R + 0) * SIZE(B) movss -32 * SIZE(B), %xmm0 @@ -205,7 +205,7 @@ decl %eax jne .L06 ALIGN_4 - + .L10: movl C, C1 movl A, AA @@ -242,7 +242,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -523,7 +523,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -697,7 +697,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -853,7 +853,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -989,13 +989,13 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax sarl $3, %eax jle .L55 ALIGN_4 - + .L52: movss -32 * SIZE(B), %xmm0 movss -31 * SIZE(B), %xmm1 @@ -1047,7 +1047,7 @@ decl %eax jne .L56 ALIGN_4 - + .L60: movl C, C1 movl A, AA @@ -1085,7 +1085,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1259,7 +1259,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1393,7 +1393,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1518,7 +1518,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1616,7 +1616,7 @@ .L999: movl OLD_STACK, %esp - + EMMS popl %ebx diff --git a/kernel/x86/zgemm3m_kernel_8x2_sse.S b/kernel/x86/zgemm3m_kernel_8x2_sse.S index ea66dc1ae..24ec02739 100644 --- a/kernel/x86/zgemm3m_kernel_8x2_sse.S +++ b/kernel/x86/zgemm3m_kernel_8x2_sse.S @@ -45,7 +45,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -249,7 +249,7 @@ movd %mm4, KK #ifndef LEFT negl KK -#endif +#endif #endif sall $ZBASE_SHIFT, LDC @@ -258,12 +258,12 @@ movl %eax, J jle .L100 ALIGN_2 - + .L01: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif /* Copying to Sub Buffer */ movl K, %eax @@ -271,7 +271,7 @@ sarl $2, %eax jle .L03 ALIGN_4 - + .L02: movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 @@ -301,7 +301,7 @@ movaps %xmm7, 28 * SIZE(%ecx) prefetcht0 104 * SIZE(B) - + addl $ 8 * SIZE, B addl $32 * SIZE, %ecx decl %eax @@ -369,7 +369,7 @@ XORPS %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 -#endif +#endif prefetchnta 7 * SIZE(%esi) prefetchnta 7 * SIZE(%esi, %ebp) @@ -379,7 +379,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -393,7 +393,7 @@ NOBRANCH je .L12 sall $3, %eax - + .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) @@ -520,7 +520,7 @@ XORPS %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 -#endif +#endif prefetchnta 8 * SIZE(%esi) prefetchnta 8 * SIZE(%esi, %ebp) @@ -530,7 +530,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -838,7 +838,7 @@ BRANCH decl %ebx # i -- jg .L10 - ALIGN_2 + ALIGN_2 .L30: movl M, %ebx @@ -877,14 +877,14 @@ XORPS %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -983,14 +983,14 @@ XORPS %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1139,7 +1139,7 @@ movhps %xmm1, 6 * SIZE(%esi, LDC) addl $8 * SIZE, %esi - ALIGN_2 + ALIGN_2 .L50: testl $2, %ebx @@ -1175,14 +1175,14 @@ XORPS %xmm6, %xmm6 MOVSD 8 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1289,14 +1289,14 @@ XORPS %xmm6, %xmm6 MOVSD 8 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1436,7 +1436,7 @@ movhps %xmm0, 2 * SIZE(%esi, LDC) addl $4 * SIZE, %esi - ALIGN_2 + ALIGN_2 .L70: testl $1, %ebx @@ -1471,14 +1471,14 @@ XORPS %xmm6, %xmm6 movss 4 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1577,14 +1577,14 @@ XORPS %xmm6, %xmm6 movss 4 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1709,7 +1709,7 @@ addps %xmm2, %xmm0 movlps %xmm0, 0 * SIZE(%esi, LDC) - ALIGN_2 + ALIGN_2 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) @@ -1728,12 +1728,12 @@ testl $1, %eax jle .L999 ALIGN_2 - + .L101: #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif /* Copying to Sub Buffer */ movl K, %eax @@ -1741,10 +1741,10 @@ sarl $3, %eax jle .L103 ALIGN_4 - + .L102: prefetchnta 96 * SIZE(B) - + movss 0 * SIZE(B), %xmm0 movss 1 * SIZE(B), %xmm1 movss 2 * SIZE(B), %xmm2 @@ -1836,14 +1836,14 @@ XORPS %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1943,14 +1943,14 @@ XORPS %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -2102,7 +2102,7 @@ BRANCH decl %ebx # i -- jg .L110 - ALIGN_2 + ALIGN_2 .L130: movl M, %ebx @@ -2141,14 +2141,14 @@ XORPS %xmm6, %xmm6 movaps 16 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -2225,14 +2225,14 @@ XORPS %xmm6, %xmm6 movaps 8 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -2336,7 +2336,7 @@ movhps %xmm1, 6 * SIZE(%esi) addl $8 * SIZE, %esi - ALIGN_2 + ALIGN_2 .L150: testl $2, %ebx @@ -2371,14 +2371,14 @@ XORPS %xmm6, %xmm6 MOVSD 8 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -2461,14 +2461,14 @@ XORPS %xmm6, %xmm6 MOVSD 8 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -2567,7 +2567,7 @@ movhps %xmm0, 2 * SIZE(%esi) addl $4 * SIZE, %esi - ALIGN_2 + ALIGN_2 .L170: testl $1, %ebx @@ -2602,14 +2602,14 @@ XORPS %xmm6, %xmm6 movss 4 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -2686,14 +2686,14 @@ XORPS %xmm6, %xmm6 movss 4 * SIZE(AA), %xmm1 XORPS %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -2785,7 +2785,7 @@ addps %xmm2, %xmm0 movlps %xmm0, 0 * SIZE(%esi) - ALIGN_2 + ALIGN_2 .L999: movl OLD_STACK, %esp diff --git a/kernel/x86/zgemm_beta.S b/kernel/x86/zgemm_beta.S index c36e7c508..a66b45c2e 100644 --- a/kernel/x86/zgemm_beta.S +++ b/kernel/x86/zgemm_beta.S @@ -41,7 +41,7 @@ #define STACK 12 #define ARGS 0 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #ifdef DOUBLE diff --git a/kernel/x86/zgemm_kernel_1x1.S b/kernel/x86/zgemm_kernel_1x1.S index 117b245e2..4df46ddb1 100644 --- a/kernel/x86/zgemm_kernel_1x1.S +++ b/kernel/x86/zgemm_kernel_1x1.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define BX 0 + STACK(%esp) #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) @@ -105,7 +105,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl %ebx, BX @@ -125,7 +125,7 @@ leal (, %eax, SIZE), %eax leal (A, %eax, 2), A leal (B, %eax, 2), B -#endif +#endif #ifdef HAVE_SSE movl BX, %eax @@ -169,7 +169,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT diff --git a/kernel/x86/zgemm_kernel_1x1_atom.S b/kernel/x86/zgemm_kernel_1x1_atom.S index 5d276b943..1441c652c 100644 --- a/kernel/x86/zgemm_kernel_1x1_atom.S +++ b/kernel/x86/zgemm_kernel_1x1_atom.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -57,7 +57,7 @@ #define BX 4 + STACK(%esp) #define KK 8 + STACK(%esp) #define KKK 12 + STACK(%esp) - + #define PREFETCH prefetcht0 #define PREFETCHSIZE 84 @@ -107,7 +107,7 @@ movl OFFSET, %eax #ifndef LEFT negl %eax -#endif +#endif movl %eax, KK #endif @@ -127,7 +127,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl B, BX @@ -150,7 +150,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), BB -#endif +#endif movl BX, %eax prefetcht0 0 * SIZE(%eax) @@ -171,7 +171,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -327,7 +327,7 @@ addl $2 * SIZE, CO1 decl %ebx jg .L10 - ALIGN_4 + ALIGN_4 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) diff --git a/kernel/x86/zgemm_kernel_1x2.S b/kernel/x86/zgemm_kernel_1x2.S index 0f9806974..0d7e99354 100644 --- a/kernel/x86/zgemm_kernel_1x2.S +++ b/kernel/x86/zgemm_kernel_1x2.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define J 0 + STACK(%esp) #define I 4 + STACK(%esp) #define KK 8 + STACK(%esp) @@ -331,7 +331,7 @@ movl STACK_LDC, LDC sall $ZBASE_SHIFT, LDC - + subl $(AOFFSET - 16 * SIZE), STACK_A subl $(BOFFSET - 16 * SIZE), STACK_B @@ -346,7 +346,7 @@ movl K, %eax testl %eax, %eax jle .L999 - + movl N, %eax sarl $1, %eax movl %eax, J @@ -357,7 +357,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl STACK_A, A movl STACK_B, B @@ -411,7 +411,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -510,7 +510,7 @@ FMUL ALPHA_R fxch %st(1) FMUL ALPHA_I - faddp %st, %st(1) + faddp %st, %st(1) #ifndef TRMMKERNEL FADD 1 * SIZE(%edi) @@ -531,7 +531,7 @@ FMUL ALPHA_R fxch %st(1) FMUL ALPHA_I - faddp %st, %st(1) + faddp %st, %st(1) #ifndef TRMMKERNEL FADD 1 * SIZE(%edi,LDC) @@ -580,7 +580,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl STACK_A, A movl STACK_B, B @@ -617,7 +617,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -765,7 +765,7 @@ FMUL ALPHA_R fxch %st(1) FMUL ALPHA_I - faddp %st, %st(1) + faddp %st, %st(1) #ifndef TRMMKERNEL FADD 1 * SIZE(%edi) diff --git a/kernel/x86/zgemm_kernel_1x2_3dnow.S b/kernel/x86/zgemm_kernel_1x2_3dnow.S index 3699bb25d..f312a9b66 100644 --- a/kernel/x86/zgemm_kernel_1x2_3dnow.S +++ b/kernel/x86/zgemm_kernel_1x2_3dnow.S @@ -99,7 +99,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla andl $-1024, %esp # align stack STACK_TOUCHING - + movl OLD_N, %eax movl OLD_K, %ecx movl OLD_A, %edx @@ -172,7 +172,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax sarl $2, %eax @@ -307,7 +307,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -619,7 +619,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla addl $2 * SIZE, %esi decl %ebx jg .L11 - ALIGN_4 + ALIGN_4 .L19: #if defined(TRMMKERNEL) && !defined(LEFT) @@ -708,7 +708,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl C, %esi # coffset = c movl A, AA # aoffset = a @@ -744,7 +744,7 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -943,11 +943,11 @@ https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=fla addl $2 * SIZE, %esi # coffset += 4 decl %ebx # i -- jg .L31 - ALIGN_4 + ALIGN_4 .L999: EMMS - + movl OLD_STACK, %esp popl %ebx popl %esi diff --git a/kernel/x86/zgemm_kernel_1x2_barcelona.S b/kernel/x86/zgemm_kernel_1x2_barcelona.S index f71b095ad..41b65946f 100644 --- a/kernel/x86/zgemm_kernel_1x2_barcelona.S +++ b/kernel/x86/zgemm_kernel_1x2_barcelona.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -219,7 +219,7 @@ #ifndef LEFT negl %eax -#endif +#endif movl %eax, KK #endif @@ -239,7 +239,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif leal GEMM_DEFAULT_Q * GEMM_DEFAULT_UNROLL_N * SIZE(B), %eax movl %eax, BX @@ -262,7 +262,7 @@ leal (, %eax, SIZE), %eax leal (AO, %eax, 2), AO leal (B, %eax, 4), BO -#endif +#endif movl BX, %eax @@ -287,7 +287,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -496,7 +496,7 @@ addl $2 * SIZE, CO # coffset += 4 decl I # i -- jg .L10 - ALIGN_4 + ALIGN_4 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) @@ -521,7 +521,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl C, CO movl A, AO @@ -542,7 +542,7 @@ leal (, %eax, SIZE), %eax leal (AO, %eax, 2), AO leal (B, %eax, 2), BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 pxor %xmm4, %xmm4 @@ -558,7 +558,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -713,7 +713,7 @@ addl $2 * SIZE, CO # coffset += 4 decl I # i -- jg .L110 - ALIGN_4 + ALIGN_4 .L500: popl %ebx diff --git a/kernel/x86/zgemm_kernel_1x2_penryn.S b/kernel/x86/zgemm_kernel_1x2_penryn.S index 70b38dc79..adbadefb5 100644 --- a/kernel/x86/zgemm_kernel_1x2_penryn.S +++ b/kernel/x86/zgemm_kernel_1x2_penryn.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -125,13 +125,13 @@ movl OFFSET, %eax #ifndef LEFT negl %eax -#endif +#endif movl %eax, KK #endif movl M, %ebx testl %ebx, %ebx - jle .L999 + jle .L999 subl $-16 * SIZE, A subl $-16 * SIZE, B @@ -148,7 +148,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl B, BX @@ -169,7 +169,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB -#endif +#endif movl BX, %eax PREFETCHB -16 * SIZE(%eax) @@ -193,7 +193,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -420,7 +420,7 @@ addsubpd %xmm5, %xmm4 addsubpd %xmm7, %xmm6 - + #if! defined(TRMMKERNEL) && !defined(BETAZERO) addpd %xmm0, %xmm4 addpd %xmm1, %xmm6 @@ -468,7 +468,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl C, C1 # coffset = c movl A, AA # aoffset = a @@ -487,7 +487,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB -#endif +#endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 @@ -505,7 +505,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -663,7 +663,7 @@ mulpd %xmm2, %xmm4 mulpd %xmm3, %xmm5 addsubpd %xmm5, %xmm4 - + #if! defined(TRMMKERNEL) && !defined(BETAZERO) addpd %xmm0, %xmm4 #endif diff --git a/kernel/x86/zgemm_kernel_1x2_sse2.S b/kernel/x86/zgemm_kernel_1x2_sse2.S index 63fc30a5b..e621e4acb 100644 --- a/kernel/x86/zgemm_kernel_1x2_sse2.S +++ b/kernel/x86/zgemm_kernel_1x2_sse2.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -242,7 +242,7 @@ movlpd %xmm0, 0 + ALPHA_R movlpd %xmm0, 8 + ALPHA_R - + movlpd %xmm1, 8 + ALPHA_I xorpd %xmm7, %xmm1 movlpd %xmm1, 0 + ALPHA_I @@ -258,7 +258,7 @@ movss %xmm4, KK #ifndef LEFT negl KK -#endif +#endif #endif sall $ZBASE_SHIFT, LDC @@ -272,7 +272,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif leal BUFFER, %ecx @@ -362,7 +362,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 8), BB -#endif +#endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -381,7 +381,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -396,7 +396,7 @@ andl $-8, %eax sall $4, %eax je .L15 -.L1X: +.L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ -576,7 +576,7 @@ pshufd $0x4e, %xmm4, %xmm5 pshufd $0x4e, %xmm6, %xmm7 - + mulpd %xmm2, %xmm4 mulpd %xmm3, %xmm5 mulpd %xmm2, %xmm6 @@ -611,7 +611,7 @@ addl $2 * SIZE, %esi # coffset += 4 decl %ebx # i -- jg .L10 - ALIGN_4 + ALIGN_4 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) @@ -634,7 +634,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif leal BUFFER, %ecx movapd POSINV, %xmm7 @@ -705,7 +705,7 @@ movl C, %esi # coffset = c movl A, AA # aoffset = a movl M, %ebx - testl %ebx, %ebx + testl %ebx, %ebx jle .L500 ALIGN_4 @@ -721,7 +721,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -738,7 +738,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -864,7 +864,7 @@ #endif pshufd $0x4e, %xmm4, %xmm5 - + mulpd %xmm2, %xmm4 mulpd %xmm3, %xmm5 @@ -893,13 +893,13 @@ addl $2 * SIZE, %esi # coffset += 4 decl %ebx # i -- jg .L110 - ALIGN_4 + ALIGN_4 .L500: movl OLD_STACK, %esp EMMS - + popl %ebx popl %esi popl %edi diff --git a/kernel/x86/zgemm_kernel_1x2_sse3.S b/kernel/x86/zgemm_kernel_1x2_sse3.S index 70e640097..774cb0ff6 100644 --- a/kernel/x86/zgemm_kernel_1x2_sse3.S +++ b/kernel/x86/zgemm_kernel_1x2_sse3.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -57,7 +57,7 @@ #define BX 4 + STACK(%esp) #define KK 8 + STACK(%esp) #define KKK 12 + STACK(%esp) - + #ifdef PENTIUM4 #define PREFETCH_R (8 * 4) #define PREFETCH prefetcht1 @@ -222,7 +222,7 @@ movl OFFSET, %eax #ifndef LEFT negl %eax -#endif +#endif movl %eax, KK #endif @@ -238,7 +238,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl B, BX @@ -261,7 +261,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 4), BB -#endif +#endif movl BX, %eax @@ -289,7 +289,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -304,7 +304,7 @@ andl $-8, %eax sall $4, %eax je .L12 - + .L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) @@ -578,7 +578,7 @@ addsubpd %xmm5, %xmm4 addsubpd %xmm7, %xmm6 - + #ifndef TRMMKERNEL movsd 0 * SIZE(%esi), %xmm0 movhpd 1 * SIZE(%esi), %xmm0 @@ -610,7 +610,7 @@ addl $2 * SIZE, %esi # coffset += 4 decl %ebx # i -- jg .L10 - ALIGN_4 + ALIGN_4 .L99: #if defined(TRMMKERNEL) && !defined(LEFT) @@ -635,10 +635,10 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl M, %ebx - testl %ebx, %ebx + testl %ebx, %ebx jle .L500 ALIGN_4 @@ -653,7 +653,7 @@ L110: leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (B, %eax, 2), BB -#endif +#endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -673,7 +673,7 @@ L110: #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -843,7 +843,7 @@ L114: addl $2 * SIZE, %esi # coffset += 4 decl %ebx # i -- jg L110 - ALIGN_4 + ALIGN_4 .L500: popl %ebx diff --git a/kernel/x86/zgemm_kernel_2x1_core2.S b/kernel/x86/zgemm_kernel_2x1_core2.S index 3ed53425f..8b3e9f386 100644 --- a/kernel/x86/zgemm_kernel_2x1_core2.S +++ b/kernel/x86/zgemm_kernel_2x1_core2.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -145,7 +145,7 @@ movd %mm4, KK #ifndef LEFT negl KK -#endif +#endif #endif subl $-16 * SIZE, A @@ -163,7 +163,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax sarl $2, %eax @@ -191,7 +191,7 @@ movapd %xmm5, -6 * SIZE(BB) movapd %xmm6, -4 * SIZE(BB) movapd %xmm7, -2 * SIZE(BB) - + addl $ 8 * SIZE, B subl $-16 * SIZE, BB decl %eax @@ -242,7 +242,7 @@ leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB /* because it's doubled */ -#endif +#endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -264,7 +264,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -481,7 +481,7 @@ addsubpd %xmm5, %xmm4 addsubpd %xmm7, %xmm6 - + #if! defined(TRMMKERNEL) && !defined(BETAZERO) addpd %xmm0, %xmm4 addpd %xmm1, %xmm6 @@ -527,7 +527,7 @@ leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB /* because it's doubled */ -#endif +#endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -543,7 +543,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax addl $1, %eax @@ -662,7 +662,7 @@ mulpd %xmm3, %xmm5 addsubpd %xmm5, %xmm4 - + #if! defined(TRMMKERNEL) && !defined(BETAZERO) addpd %xmm0, %xmm4 #endif diff --git a/kernel/x86/zgemm_kernel_2x1_sse2.S b/kernel/x86/zgemm_kernel_2x1_sse2.S index 3ef96d143..54c205bed 100644 --- a/kernel/x86/zgemm_kernel_2x1_sse2.S +++ b/kernel/x86/zgemm_kernel_2x1_sse2.S @@ -47,7 +47,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -239,7 +239,7 @@ movsd %xmm0, 0 + ALPHA_R movsd %xmm0, 8 + ALPHA_R - + movsd %xmm1, 8 + ALPHA_I xorpd %xmm7, %xmm1 movsd %xmm1, 0 + ALPHA_I @@ -264,7 +264,7 @@ movd %mm4, KK #ifndef LEFT negl KK -#endif +#endif #endif sall $ZBASE_SHIFT, LDC @@ -277,7 +277,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif leal BUFFER, BB movapd POSINV, %xmm7 @@ -299,7 +299,7 @@ unpcklpd %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ - defined(TN) || defined(TT) || defined(TR) || defined(TC) + defined(TN) || defined(TT) || defined(TR) || defined(TC) xorpd %xmm7, %xmm1 xorpd %xmm7, %xmm3 #else @@ -323,7 +323,7 @@ unpcklpd %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ - defined(TN) || defined(TT) || defined(TR) || defined(TC) + defined(TN) || defined(TT) || defined(TR) || defined(TC) xorpd %xmm7, %xmm1 xorpd %xmm7, %xmm3 #else @@ -337,7 +337,7 @@ movapd %xmm3, 14 * SIZE(BB) prefetcht0 104 * SIZE(B) - + addl $ 8 * SIZE, B addl $16 * SIZE, BB decl %eax @@ -359,7 +359,7 @@ unpcklpd %xmm1, %xmm1 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ - defined(TN) || defined(TT) || defined(TR) || defined(TC) + defined(TN) || defined(TT) || defined(TR) || defined(TC) xorpd %xmm7, %xmm1 #else xorpd %xmm7, %xmm0 @@ -422,7 +422,7 @@ movapd 8 * SIZE(AA), %xmm1 pxor %xmm7, %xmm7 -#endif +#endif prefetchnta 3 * SIZE(%esi) @@ -431,7 +431,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -445,7 +445,7 @@ NOBRANCH je .L12 sall $3, %eax - + .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) @@ -542,7 +542,7 @@ .L11: leal (BB, %eax, 4), BB leal (AA, %eax, 4), AA - + .L12: #ifndef TRMMKERNEL movl K, %eax @@ -670,7 +670,7 @@ movapd 8 * SIZE(BB), %xmm2 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL @@ -678,7 +678,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax addl $1, %eax diff --git a/kernel/x86/zgemm_kernel_2x2_barcelona.S b/kernel/x86/zgemm_kernel_2x2_barcelona.S index 2ad68935c..21f7469f8 100644 --- a/kernel/x86/zgemm_kernel_2x2_barcelona.S +++ b/kernel/x86/zgemm_kernel_2x2_barcelona.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -270,7 +270,7 @@ movss %xmm4, KK #ifndef LEFT negl KK -#endif +#endif #endif sall $ZBASE_SHIFT, LDC @@ -283,7 +283,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif /* Copying to Sub Buffer */ leal BUFFER, %ecx @@ -306,7 +306,7 @@ pshufd $0xff, %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ - defined(TN) || defined(TT) || defined(TR) || defined(TC) + defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm1 xorps %xmm7, %xmm3 #else @@ -329,7 +329,7 @@ pshufd $0xff, %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ - defined(TN) || defined(TT) || defined(TR) || defined(TC) + defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm1 xorps %xmm7, %xmm3 #else @@ -367,7 +367,7 @@ pshufd $0xff, %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ - defined(TN) || defined(TT) || defined(TR) || defined(TC) + defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm1 xorps %xmm7, %xmm3 #else @@ -403,7 +403,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 8), BB -#endif +#endif movaps 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -422,7 +422,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -436,7 +436,7 @@ andl $-8, %eax sall $4, %eax je .L15 -.L1X: +.L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) @@ -526,7 +526,7 @@ leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB ALIGN_4 - + .L15: #ifndef TRMMKERNEL movl K, %eax @@ -641,7 +641,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -658,7 +658,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -776,7 +776,7 @@ decl %eax jne .L41 ALIGN_4 - + .L42: #ifndef TRMMKERNEL movl K, %eax @@ -887,7 +887,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif /* Copying to Sub Buffer */ leal BUFFER, %ecx @@ -909,7 +909,7 @@ pshufd $0xff, %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ - defined(TN) || defined(TT) || defined(TR) || defined(TC) + defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm1 xorps %xmm7, %xmm3 #else @@ -932,7 +932,7 @@ pshufd $0xff, %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ - defined(TN) || defined(TT) || defined(TR) || defined(TC) + defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm1 xorps %xmm7, %xmm3 #else @@ -967,7 +967,7 @@ pshufd $0x55, %xmm3, %xmm1 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ - defined(TN) || defined(TT) || defined(TR) || defined(TC) + defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm1 #else xorps %xmm7, %xmm0 @@ -1002,7 +1002,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -1021,7 +1021,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1098,7 +1098,7 @@ decl %eax jne .L111 ALIGN_4 - + .L112: #ifndef TRMMKERNEL movl K, %eax @@ -1196,7 +1196,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB -#endif +#endif movsd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1212,7 +1212,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1289,7 +1289,7 @@ decl %eax jne .L141 ALIGN_4 - + .L142: #ifndef TRMMKERNEL movl K, %eax diff --git a/kernel/x86/zgemm_kernel_2x2_penryn.S b/kernel/x86/zgemm_kernel_2x2_penryn.S index 715eb4d4f..f50117b6a 100644 --- a/kernel/x86/zgemm_kernel_2x2_penryn.S +++ b/kernel/x86/zgemm_kernel_2x2_penryn.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -125,7 +125,7 @@ movl OFFSET, %eax #ifndef LEFT negl %eax -#endif +#endif movl %eax, KK #endif @@ -144,7 +144,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl B, BX @@ -168,7 +168,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB -#endif +#endif movl BX, %eax PREFETCHB -32 * SIZE(%eax) @@ -192,7 +192,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -331,11 +331,11 @@ movaps -32 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -32 * SIZE(AA), %xmm0 - + decl %eax jne .L12 ALIGN_4 - + .L15: #ifndef TRMMKERNEL movl K, %eax @@ -449,7 +449,7 @@ decl %ebx jg .L10 ALIGN_4 - + .L20: movl M, %ebx testl $1, %ebx @@ -466,7 +466,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB -#endif +#endif movsd -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 @@ -483,7 +483,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -630,11 +630,11 @@ subl $-16 * SIZE, AA subl $-32 * SIZE, BB - + decl %eax jne .L22 ALIGN_4 - + .L25: #ifndef TRMMKERNEL movl K, %eax @@ -757,7 +757,7 @@ addl $2, KK #endif movl BB, B - + leal (, LDC, 2), %eax addl %eax, C @@ -773,7 +773,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl C, C1 movl A, AA @@ -795,7 +795,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 2), BB -#endif +#endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 @@ -813,7 +813,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -901,11 +901,11 @@ subl $-32 * SIZE, AA subl $-16 * SIZE, BB - + decl %eax jne .L32 ALIGN_4 - + .L35: #ifndef TRMMKERNEL movl K, %eax @@ -997,7 +997,7 @@ decl %ebx jg .L31 ALIGN_4 - + .L40: movl M, %ebx testl $1, %ebx @@ -1014,7 +1014,7 @@ leal (, %eax, SIZE), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB -#endif +#endif movsd -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 @@ -1031,7 +1031,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1122,11 +1122,11 @@ subl $-16 * SIZE, AA subl $-16 * SIZE, BB - + decl %eax jne .L42 ALIGN_4 - + .L45: #ifndef TRMMKERNEL movl K, %eax diff --git a/kernel/x86/zgemm_kernel_2x2_sse.S b/kernel/x86/zgemm_kernel_2x2_sse.S index fad42ccb9..c0fba7820 100644 --- a/kernel/x86/zgemm_kernel_2x2_sse.S +++ b/kernel/x86/zgemm_kernel_2x2_sse.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -219,7 +219,7 @@ addps %xmm1, %xmm7; \ movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; #endif - + #ifdef PENTIUM4 #define KERNEL1(address) \ mulps %xmm0, %xmm2; \ @@ -409,7 +409,7 @@ movss %xmm4, KK #ifndef LEFT negl KK -#endif +#endif #endif sall $ZBASE_SHIFT, LDC @@ -422,7 +422,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif /* Copying to Sub Buffer */ leal BUFFER, %ecx @@ -445,7 +445,7 @@ shufps $0, %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ - defined(TN) || defined(TT) || defined(TR) || defined(TC) + defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm1 xorps %xmm7, %xmm3 #else @@ -469,7 +469,7 @@ shufps $0, %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ - defined(TN) || defined(TT) || defined(TR) || defined(TC) + defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm1 xorps %xmm7, %xmm3 #else @@ -512,7 +512,7 @@ shufps $0, %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ - defined(TN) || defined(TT) || defined(TR) || defined(TC) + defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm1 xorps %xmm7, %xmm3 #else @@ -547,7 +547,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 8), BB -#endif +#endif movaps 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 @@ -573,7 +573,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -589,7 +589,7 @@ andl $-8, %eax sall $4, %eax je .L15 -.L1X: +.L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) @@ -700,7 +700,7 @@ jne .L11 ALIGN_4 #endif - + .L15: #ifndef TRMMKERNEL movl K, %eax @@ -815,7 +815,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 8), BB -#endif +#endif #ifdef movsd xorps %xmm0, %xmm0 @@ -837,7 +837,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -957,7 +957,7 @@ decl %eax jne .L41 ALIGN_4 - + .L42: #ifndef TRMMKERNEL movl K, %eax @@ -1074,7 +1074,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif /* Copying to Sub Buffer */ leal BUFFER, %ecx @@ -1097,7 +1097,7 @@ shufps $0, %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ - defined(TN) || defined(TT) || defined(TR) || defined(TC) + defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm1 xorps %xmm7, %xmm3 #else @@ -1121,7 +1121,7 @@ shufps $0, %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ - defined(TN) || defined(TT) || defined(TR) || defined(TC) + defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm1 xorps %xmm7, %xmm3 #else @@ -1157,7 +1157,7 @@ shufps $0, %xmm1, %xmm1 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ - defined(TN) || defined(TT) || defined(TR) || defined(TC) + defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm1 #else xorps %xmm7, %xmm0 @@ -1192,7 +1192,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -1217,7 +1217,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1294,7 +1294,7 @@ decl %eax jne .L111 ALIGN_4 - + .L112: #ifndef TRMMKERNEL movl K, %eax @@ -1392,7 +1392,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB -#endif +#endif movaps 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 @@ -1408,7 +1408,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1485,7 +1485,7 @@ decl %eax jne .L141 ALIGN_4 - + .L142: #ifndef TRMMKERNEL movl K, %eax diff --git a/kernel/x86/zgemm_kernel_2x2_sse3.S b/kernel/x86/zgemm_kernel_2x2_sse3.S index 23afa8f21..4bca5ff68 100644 --- a/kernel/x86/zgemm_kernel_2x2_sse3.S +++ b/kernel/x86/zgemm_kernel_2x2_sse3.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -268,7 +268,7 @@ movss %xmm4, KK #ifndef LEFT negl KK -#endif +#endif #endif sall $ZBASE_SHIFT, LDC @@ -281,7 +281,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif /* Copying to Sub Buffer */ leal BUFFER, %ecx @@ -360,7 +360,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB -#endif +#endif movaps 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -379,7 +379,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -395,7 +395,7 @@ andl $-8, %eax sall $4, %eax je .L15 -.L1X: +.L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) @@ -588,7 +588,7 @@ jne .L11 ALIGN_4 #endif - + .L15: #ifndef TRMMKERNEL movl K, %eax @@ -714,7 +714,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB -#endif +#endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -728,7 +728,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -822,7 +822,7 @@ decl %eax jne .L41 ALIGN_4 - + .L42: #ifndef TRMMKERNEL movl K, %eax @@ -859,12 +859,12 @@ movhlps %xmm6, %xmm5 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ - defined(RR) || defined(RC) || defined(CR) || defined(CC) + defined(RR) || defined(RC) || defined(CR) || defined(CC) cmpeqps %xmm7, %xmm7 pslld $31, %xmm7 xorps %xmm7, %xmm5 -#endif - +#endif + #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0xb1, %xmm5, %xmm5 @@ -934,7 +934,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif /* Copying to Sub Buffer */ leal BUFFER, %ecx @@ -1009,7 +1009,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 2), BB -#endif +#endif movaps 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1029,7 +1029,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1107,7 +1107,7 @@ decl %eax jne .L111 ALIGN_4 - + .L112: #ifndef TRMMKERNEL movl K, %eax @@ -1208,7 +1208,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 2), BB -#endif +#endif movddup 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -1222,7 +1222,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1284,7 +1284,7 @@ decl %eax jne .L141 ALIGN_4 - + .L142: #ifndef TRMMKERNEL movl K, %eax @@ -1317,12 +1317,12 @@ movhlps %xmm4, %xmm5 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ - defined(RR) || defined(RC) || defined(CR) || defined(CC) + defined(RR) || defined(RC) || defined(CR) || defined(CC) cmpeqps %xmm7, %xmm7 pslld $31, %xmm7 xorps %xmm7, %xmm5 -#endif - +#endif + #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0xb1, %xmm5, %xmm5 diff --git a/kernel/x86/zgemm_kernel_4x1_core2.S b/kernel/x86/zgemm_kernel_4x1_core2.S index ca232e447..05c2f0276 100644 --- a/kernel/x86/zgemm_kernel_4x1_core2.S +++ b/kernel/x86/zgemm_kernel_4x1_core2.S @@ -45,7 +45,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -111,7 +111,7 @@ addl $STACK_OFFSET, %esp STACK_TOUCHING - + movd STACK_M, %mm0 movl STACK_N, %eax movd STACK_K, %mm1 @@ -134,7 +134,7 @@ movd %mm4, KK #ifndef LEFT negl KK -#endif +#endif #endif subl $-32 * SIZE, A @@ -166,7 +166,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif movl K, %eax sarl $2, %eax @@ -250,7 +250,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 4), AA leal (BB, %eax, 4), BB /* because it's doubled */ -#endif +#endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -266,7 +266,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -395,11 +395,11 @@ ADDSUB %xmm1, %xmm7 movaps -32 * SIZE(BB), %xmm1 - + decl %eax jne .L12 ALIGN_4 - + .L15: #ifndef TRMMKERNEL movl K, %eax @@ -502,7 +502,7 @@ decl %ebx jg .L10 ALIGN_2 - + .L20: movl M, %ebx testl $2, %ebx @@ -520,7 +520,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 2), AA leal (BB, %eax, 4), BB /* because it's doubled */ -#endif +#endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -536,7 +536,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -695,7 +695,7 @@ leal (, %eax, 8), %eax leal (AA, %eax, 1), AA leal (BB, %eax, 4), BB /* because it's doubled */ -#endif +#endif movsd -32 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -712,7 +712,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax addl $1, %eax @@ -860,7 +860,7 @@ .L999: movl OLD_STACK, %esp - + EMMS popl %ebx diff --git a/kernel/x86/zgemm_kernel_4x1_sse.S b/kernel/x86/zgemm_kernel_4x1_sse.S index 6c514639c..685e5d3d7 100644 --- a/kernel/x86/zgemm_kernel_4x1_sse.S +++ b/kernel/x86/zgemm_kernel_4x1_sse.S @@ -45,7 +45,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -243,7 +243,7 @@ movd %mm4, KK #ifndef LEFT negl KK -#endif +#endif #endif leal (, LDC, SIZE * 2), LDC @@ -292,7 +292,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movl OFFSET, %eax movl %eax, KK -#endif +#endif leal BUFFER, BB movaps POSINV, %xmm7 @@ -313,7 +313,7 @@ shufps $0, %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ - defined(TN) || defined(TT) || defined(TR) || defined(TC) + defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm1 xorps %xmm7, %xmm3 #else @@ -337,7 +337,7 @@ shufps $0, %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ - defined(TN) || defined(TT) || defined(TR) || defined(TC) + defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm1 xorps %xmm7, %xmm3 #else @@ -371,7 +371,7 @@ shufps $0, %xmm1, %xmm1 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ - defined(TN) || defined(TT) || defined(TR) || defined(TC) + defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm1 #else xorps %xmm7, %xmm0 @@ -429,7 +429,7 @@ movaps 16 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 -#endif +#endif prefetchnta 8 * SIZE(%esi) @@ -438,7 +438,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -451,7 +451,7 @@ andl $-8, %eax je .L12 sall $3, %eax - + .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) @@ -582,14 +582,14 @@ movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -764,7 +764,7 @@ decl %eax jne .L11 #endif - + .L12: #ifndef TRMMKERNEL movl K, %eax @@ -859,7 +859,7 @@ decl %ebx # i -- jg .L10 ALIGN_2 - + .L50: movl M, %ebx testl $2, %ebx @@ -899,14 +899,14 @@ movaps 16 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1009,14 +1009,14 @@ movaps 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax #ifdef LEFT @@ -1085,7 +1085,7 @@ decl %eax jne .L51 #endif - + .L52: #ifndef TRMMKERNEL movl K, %eax @@ -1208,14 +1208,14 @@ movsd 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL movl K, %eax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax addl $1, %eax @@ -1327,7 +1327,7 @@ movsd 8 * SIZE(AA), %xmm1 xorps %xmm7, %xmm7 -#endif +#endif #ifndef TRMMKERNEL @@ -1335,7 +1335,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movl K, %eax subl KK, %eax - movl %eax, KKK + movl %eax, KKK #else movl KK, %eax addl $1, %eax @@ -1409,7 +1409,7 @@ jne .L71 ALIGN_2 #endif - + .L72: #ifndef TRMMKERNEL movl K, %eax @@ -1496,7 +1496,7 @@ .L999: movl OLD_STACK, %esp - + EMMS popl %ebx diff --git a/kernel/x86/zgemm_ncopy_2.S b/kernel/x86/zgemm_ncopy_2.S index bc80b4734..ad5ffbe89 100644 --- a/kernel/x86/zgemm_ncopy_2.S +++ b/kernel/x86/zgemm_ncopy_2.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 8 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_N 8 + STACK + ARGS(%esp) #define STACK_A 12 + STACK + ARGS(%esp) diff --git a/kernel/x86/zgemm_tcopy_2.S b/kernel/x86/zgemm_tcopy_2.S index f9a601d9b..1598e9f0c 100644 --- a/kernel/x86/zgemm_tcopy_2.S +++ b/kernel/x86/zgemm_tcopy_2.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 8 - + #define J 0 + STACK(%esp) #define BOFFSET2 4 + STACK(%esp) @@ -60,7 +60,7 @@ pushl %ebx PROFCODE - + #define A %ebp #define A1 %edx #define LDA %ecx diff --git a/kernel/x86/zgemv_n.S b/kernel/x86/zgemv_n.S index a3c9174e6..4a411ccb0 100644 --- a/kernel/x86/zgemv_n.S +++ b/kernel/x86/zgemv_n.S @@ -46,14 +46,14 @@ #if defined(PENTIUM4) || defined(ATHLON) #define P ((DTB_DEFAULT_ENTRIES) >> 1) #endif - + #ifndef P #define P DTB_DEFAULT_ENTRIES #endif #define STACK 16 #define ARGS 16 - + #define PLDA_M 0 + STACK(%esp) #define XP 4 + STACK(%esp) #define MIN_N 8 + STACK(%esp) diff --git a/kernel/x86/zgemv_n_atom.S b/kernel/x86/zgemv_n_atom.S index 3dba030f8..36e82f77d 100644 --- a/kernel/x86/zgemv_n_atom.S +++ b/kernel/x86/zgemv_n_atom.S @@ -58,7 +58,7 @@ #define Y 48 + STACKSIZE(%esp) #define STACK_INCY 52 + STACKSIZE(%esp) #define BUFFER 56 + STACKSIZE(%esp) - + #define I %eax #define J %ebx @@ -122,7 +122,7 @@ jle .L999 movl BUFFER, Y1 - + movl N, J pxor %xmm7, %xmm7 @@ -538,7 +538,7 @@ .L999: popl %ebx popl %esi - popl %edi + popl %edi popl %ebp ret diff --git a/kernel/x86/zgemv_n_sse.S b/kernel/x86/zgemv_n_sse.S index b0f686a6e..7bf41bb21 100644 --- a/kernel/x86/zgemv_n_sse.S +++ b/kernel/x86/zgemv_n_sse.S @@ -106,7 +106,7 @@ #define MMM 0+ARGS(%esp) #define YY 4+ARGS(%esp) #define AA 8+ARGS(%esp) - + #define I %eax #define J %ebx @@ -159,7 +159,7 @@ .L00t: movl AA,%eax movl %eax,A - + movl YY,J movl J,Y @@ -178,7 +178,7 @@ jle .L999 movl BUFFER, Y1 - + movl N, J xorps %xmm7, %xmm7 @@ -640,7 +640,7 @@ .L999x: popl %ebx popl %esi - popl %edi + popl %edi popl %ebp addl $ARGS,%esp ret diff --git a/kernel/x86/zgemv_n_sse2.S b/kernel/x86/zgemv_n_sse2.S index bb33d2615..fd01e2a16 100644 --- a/kernel/x86/zgemv_n_sse2.S +++ b/kernel/x86/zgemv_n_sse2.S @@ -93,7 +93,7 @@ #define YY 4 + ARGS(%esp) #define AA 8 + ARGS(%esp) - + #define I %eax #define J %ebx @@ -165,7 +165,7 @@ jle .L999 movl BUFFER, Y1 - + movl N, J pxor %xmm7, %xmm7 @@ -202,7 +202,7 @@ pcmpeqb %xmm5, %xmm5 psllq $63, %xmm5 - shufps $0xc0, %xmm5, %xmm5 + shufps $0xc0, %xmm5, %xmm5 pshufd $0x4e, %xmm6, %xmm7 @@ -503,7 +503,7 @@ .L999x: popl %ebx popl %esi - popl %edi + popl %edi popl %ebp addl $ARGS,%esp ret diff --git a/kernel/x86/zgemv_t.S b/kernel/x86/zgemv_t.S index 452794cc1..83b602d8e 100644 --- a/kernel/x86/zgemv_t.S +++ b/kernel/x86/zgemv_t.S @@ -49,7 +49,7 @@ #define STACK 16 #define ARGS 24 - + #define NLDA 0 + STACK(%esp) #define XP 4 + STACK(%esp) #define MIN_M 8 + STACK(%esp) diff --git a/kernel/x86/zgemv_t_atom.S b/kernel/x86/zgemv_t_atom.S index 6f0dee0b6..444f9ac29 100644 --- a/kernel/x86/zgemv_t_atom.S +++ b/kernel/x86/zgemv_t_atom.S @@ -58,7 +58,7 @@ #define Y 48 + STACKSIZE(%esp) #define STACK_INCY 52 + STACKSIZE(%esp) #define BUFFER 56 + STACKSIZE(%esp) - + #define I %eax #define J %ebx @@ -124,7 +124,7 @@ jle .L999 movl BUFFER, Y1 - + movl M, I sarl $2, I jle .L05 @@ -180,7 +180,7 @@ movl N, J ALIGN_3 -.L11: +.L11: movl BUFFER, X addl $16 * SIZE, X @@ -434,11 +434,11 @@ decl J jg .L11 ALIGN_4 - + .L999: popl %ebx popl %esi - popl %edi + popl %edi popl %ebp ret diff --git a/kernel/x86/zgemv_t_sse.S b/kernel/x86/zgemv_t_sse.S index a7a7abd48..fc955e2c7 100644 --- a/kernel/x86/zgemv_t_sse.S +++ b/kernel/x86/zgemv_t_sse.S @@ -106,7 +106,7 @@ #define MMM 0+ARGS(%esp) #define XX 4+ARGS(%esp) #define AA 8+ARGS(%esp) - + #define I %eax #define J %ebx @@ -180,7 +180,7 @@ jle .L999 movl BUFFER, Y1 - + movl M, I sarl $2, I jle .L05 @@ -239,7 +239,7 @@ movl N, J ALIGN_3 -.L11: +.L11: movl BUFFER, X addl $32 * SIZE, X @@ -473,7 +473,7 @@ mulps %xmm2, %xmm5 SUBPS %xmm5, %xmm1 ALIGN_4 - + .L19: #ifdef HAVE_SSE2 pcmpeqb %xmm5, %xmm5 @@ -486,7 +486,7 @@ addl $8, %esp movlhps %xmm5, %xmm5 #endif - + #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) xorps %xmm5, %xmm0 #else @@ -529,7 +529,7 @@ movaps %xmm0, %xmm2 shufps $0x88, %xmm1, %xmm0 shufps $0xdd, %xmm1, %xmm2 - + addps %xmm2, %xmm0 #endif @@ -544,7 +544,7 @@ decl J jg .L11 ALIGN_4 - + .L999: movl M,%eax sall $ZBASE_SHIFT, %eax @@ -558,7 +558,7 @@ .L999x: popl %ebx popl %esi - popl %edi + popl %edi popl %ebp addl $ARGS,%esp diff --git a/kernel/x86/zgemv_t_sse2.S b/kernel/x86/zgemv_t_sse2.S index 86f5976b9..b58f698b4 100644 --- a/kernel/x86/zgemv_t_sse2.S +++ b/kernel/x86/zgemv_t_sse2.S @@ -167,7 +167,7 @@ jle .L999 movl BUFFER, Y1 - + movl M, I sarl $2, I jle .L05 @@ -223,7 +223,7 @@ movl N, J ALIGN_4 -.L11: +.L11: movl BUFFER, X addl $16 * SIZE, X @@ -377,7 +377,7 @@ pcmpeqb %xmm5, %xmm5 psllq $63, %xmm5 shufps $0xc0, %xmm5, %xmm5 - + #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) xorpd %xmm5, %xmm0 #else @@ -426,7 +426,7 @@ decl J jg .L11 ALIGN_4 - + .L999: movl M,%eax sall $ZBASE_SHIFT,%eax @@ -440,7 +440,7 @@ .L999x: popl %ebx popl %esi - popl %edi + popl %edi popl %ebp addl $ARGS,%esp ret diff --git a/kernel/x86/znrm2.S b/kernel/x86/znrm2.S index c645b57ef..263612e9a 100644 --- a/kernel/x86/znrm2.S +++ b/kernel/x86/znrm2.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define STACK 8 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) @@ -49,7 +49,7 @@ #define M %edx #define X %ecx #define INCX %esi - + #define I %eax #include "l1param.h" @@ -91,7 +91,7 @@ sarl $2, I jle .L20 ALIGN_4 - + .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -156,7 +156,7 @@ sarl $2, I jle .L60 ALIGN_4 - + .L50: FLD 0 * SIZE(X) fmul %st(0), %st diff --git a/kernel/x86/znrm2_sse.S b/kernel/x86/znrm2_sse.S index 95ca9fda4..bbc3677ae 100644 --- a/kernel/x86/znrm2_sse.S +++ b/kernel/x86/znrm2_sse.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define STACK 8 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) @@ -49,7 +49,7 @@ #define M %edx #define X %ecx #define INCX %esi - + #define I %eax #include "l1param.h" @@ -82,7 +82,7 @@ testl $SIZE, X je .L05 - + movss -32 * SIZE(X), %xmm0 cvtss2sd %xmm0, %xmm0 mulsd %xmm0, %xmm0 @@ -96,7 +96,7 @@ movl M, I sarl $4, I jle .L13 - + movsd -32 * SIZE(X), %xmm4 movsd -30 * SIZE(X), %xmm5 movsd -28 * SIZE(X), %xmm6 @@ -269,7 +269,7 @@ movl M, I sarl $3, I jle .L43 - + movsd (X), %xmm4 addl INCX, X movsd (X), %xmm5 diff --git a/kernel/x86/zrot.S b/kernel/x86/zrot.S index 7ac984e87..93f86c8ab 100644 --- a/kernel/x86/zrot.S +++ b/kernel/x86/zrot.S @@ -38,10 +38,10 @@ #define ASSEMBLER #include "common.h" - + #define STACK 12 #define ARGS 0 - + #define STACK_N 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) @@ -110,7 +110,7 @@ sarl $1, I jle .L15 ALIGN_4 - + .L10: #ifdef PENTIUM4 PREFETCH (PREFETCH_SIZE + 0) * SIZE(X) @@ -261,7 +261,7 @@ sarl $1, I jle .L55 ALIGN_4 - + .L51: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) diff --git a/kernel/x86/zrot_sse.S b/kernel/x86/zrot_sse.S index d10183f73..9c2fa4f1e 100644 --- a/kernel/x86/zrot_sse.S +++ b/kernel/x86/zrot_sse.S @@ -1285,12 +1285,12 @@ .L50: movl N, I -//if incx ==0 || incy==0 jump to the tail +//if incx ==0 || incy==0 jump to the tail cmpl $0, INCX je .L56 cmpl $0, INCY je .L56 - + sarl $2, I jle .L55 ALIGN_3 diff --git a/kernel/x86/zrot_sse2.S b/kernel/x86/zrot_sse2.S index 7787f4549..0bab35124 100644 --- a/kernel/x86/zrot_sse2.S +++ b/kernel/x86/zrot_sse2.S @@ -41,7 +41,7 @@ #define STACK 12 #define ARGS 0 - + #define STACK_N 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) diff --git a/kernel/x86/zscal.S b/kernel/x86/zscal.S index 7505cea1a..1eb518563 100644 --- a/kernel/x86/zscal.S +++ b/kernel/x86/zscal.S @@ -40,7 +40,7 @@ #include "common.h" #define STACK 8 - + #define STACK_N 4 + STACK(%esp) #ifdef XDOUBLE #define ALPHA_R 16 + STACK(%esp) diff --git a/kernel/x86/zscal_sse.S b/kernel/x86/zscal_sse.S index 53abb697b..e011c98f5 100644 --- a/kernel/x86/zscal_sse.S +++ b/kernel/x86/zscal_sse.S @@ -1073,7 +1073,7 @@ #else - + PSHUFD2($0, %xmm0, %xmm6) PSHUFD2($0, %xmm1, %xmm1) subps %xmm1, %xmm7 diff --git a/kernel/x86/zscal_sse2.S b/kernel/x86/zscal_sse2.S index 26ef693a0..cc7ab6686 100644 --- a/kernel/x86/zscal_sse2.S +++ b/kernel/x86/zscal_sse2.S @@ -73,7 +73,7 @@ #define xmm14 xmm6 #define xmm15 xmm7 - + PROLOGUE PROFCODE @@ -94,7 +94,7 @@ testl M, M jle .L999 - + xorps %xmm7, %xmm7 comisd %xmm0, %xmm7 jne .L100 @@ -193,7 +193,7 @@ jle .L22 ALIGN_4 -.L21: +.L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif @@ -294,10 +294,10 @@ #else pshufd $0x44, %xmm0, %xmm6 #endif - + xorps %xmm7, %xmm7 subsd %xmm1, %xmm7 - movlhps %xmm1, %xmm7 + movlhps %xmm1, %xmm7 cmpl $2 * SIZE, INCX jne .L120 @@ -869,7 +869,7 @@ #endif pxor %xmm7, %xmm7 subsd %xmm1, %xmm7 - movlhps %xmm1, %xmm7 + movlhps %xmm1, %xmm7 shufpd $1, %xmm7, %xmm7 movhps 0 * SIZE(X), %xmm0 @@ -1150,7 +1150,7 @@ #endif pxor %xmm7, %xmm7 subsd %xmm1, %xmm7 - movlhps %xmm1, %xmm7 + movlhps %xmm1, %xmm7 subl $-16 * SIZE, X @@ -1427,7 +1427,7 @@ #endif pxor %xmm7, %xmm7 subsd %xmm1, %xmm7 - movlhps %xmm1, %xmm7 + movlhps %xmm1, %xmm7 movl X, XX diff --git a/kernel/x86/zswap.S b/kernel/x86/zswap.S index ca4660f44..620a00928 100644 --- a/kernel/x86/zswap.S +++ b/kernel/x86/zswap.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define N 4 + STACK + ARGS(%esp) #ifdef XDOUBLE #define X 48 + STACK + ARGS(%esp) diff --git a/kernel/x86/zswap_sse.S b/kernel/x86/zswap_sse.S index 24d000166..479d9461a 100644 --- a/kernel/x86/zswap_sse.S +++ b/kernel/x86/zswap_sse.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 24 + STACK + ARGS(%esp) #define STACK_INCX 28 + STACK + ARGS(%esp) @@ -85,7 +85,7 @@ subl $-32 * SIZE, X subl $-32 * SIZE, Y - + cmpl $3, M jle .L16 @@ -307,7 +307,7 @@ .L20: movaps -33 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 - + movss %xmm1, -32 * SIZE(X) PSHUFD2($0x39, %xmm1, %xmm3) movlps %xmm3, -31 * SIZE(X) @@ -783,7 +783,7 @@ .L40: movaps -35 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 - + movss %xmm1, -32 * SIZE(X) subl $3, M diff --git a/kernel/x86/zswap_sse2.S b/kernel/x86/zswap_sse2.S index d900ea547..cc012b32b 100644 --- a/kernel/x86/zswap_sse2.S +++ b/kernel/x86/zswap_sse2.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esp) #define STACK_X 32 + STACK + ARGS(%esp) #define STACK_INCX 36 + STACK + ARGS(%esp) diff --git a/kernel/x86/ztrsm_kernel_LN_2x1_core2.S b/kernel/x86/ztrsm_kernel_LN_2x1_core2.S index 1d3107a41..1a6f8c0a0 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x1_core2.S +++ b/kernel/x86/ztrsm_kernel_LN_2x1_core2.S @@ -47,7 +47,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -166,7 +166,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -185,7 +185,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal 16 * SIZE + BUFFER, BB @@ -201,7 +201,7 @@ sall $ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 2), BB -#endif +#endif #if defined(LT) movl OFFSET, %eax @@ -312,7 +312,7 @@ movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -600,7 +600,7 @@ movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -744,7 +744,7 @@ decl %eax jne .L12 ALIGN_4 - + .L15: #if defined(LT) || defined(RN) movl KK, %eax diff --git a/kernel/x86/ztrsm_kernel_LN_2x1_sse2.S b/kernel/x86/ztrsm_kernel_LN_2x1_sse2.S index 7aef33696..029a2f5d5 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x1_sse2.S +++ b/kernel/x86/ztrsm_kernel_LN_2x1_sse2.S @@ -47,7 +47,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -276,7 +276,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -295,7 +295,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, BB @@ -311,7 +311,7 @@ sall $ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 2), BB -#endif +#endif #if defined(LT) movl OFFSET, %eax @@ -357,7 +357,7 @@ movapd %xmm7, 14 * SIZE(BB) prefetcht0 104 * SIZE(B) - + addl $ 8 * SIZE, B addl $16 * SIZE, BB decl %eax @@ -436,7 +436,7 @@ movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movapd 0 * SIZE(BB), %xmm1 pxor %xmm4, %xmm4 @@ -713,7 +713,7 @@ movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 @@ -740,7 +740,7 @@ NOBRANCH je .L12 sall $3, %eax - + .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) @@ -837,7 +837,7 @@ .L11: leal (BB, %eax, 4), BB leal (AA, %eax, 4), AA - + .L12: #if defined(LT) || defined(RN) movl KK, %eax diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S index 6d9880556..da561b583 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -55,7 +55,7 @@ #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) - + #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 @@ -101,12 +101,12 @@ movl OFFSET, %eax #ifdef RN negl %eax -#endif +#endif movl %eax, KK movl M, %ebx testl %ebx, %ebx - jle .L999 + jle .L999 subl $-32 * SIZE, A subl $-32 * SIZE, B @@ -134,7 +134,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -177,7 +177,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -207,7 +207,7 @@ movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movsd -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 @@ -365,7 +365,7 @@ decl %eax jne .L41 ALIGN_4 - + .L42: #if defined(LT) || defined(RN) movl KK, %eax @@ -438,7 +438,7 @@ pxor %xmm0, %xmm7 #endif #endif - + addps %xmm5, %xmm4 addps %xmm7, %xmm6 @@ -662,7 +662,7 @@ movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 @@ -821,11 +821,11 @@ movaps -32 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -32 * SIZE(AA), %xmm0 - + decl %eax jne .L11 ALIGN_4 - + .L15: #if defined(LT) || defined(RN) movl KK, %eax @@ -1258,7 +1258,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -1288,7 +1288,7 @@ movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movsd -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 @@ -1387,11 +1387,11 @@ subl $-16 * SIZE, AA subl $-16 * SIZE, BB - + decl %eax jne .L141 ALIGN_4 - + .L142: #if defined(LT) || defined(RN) movl KK, %eax @@ -1449,7 +1449,7 @@ pxor %xmm0, %xmm5 #endif #endif - + addps %xmm5, %xmm4 #if defined(LN) || defined(LT) @@ -1570,7 +1570,7 @@ movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 @@ -1671,11 +1671,11 @@ subl $-32 * SIZE, AA subl $-16 * SIZE, BB - + decl %eax jne .L111 ALIGN_4 - + .L112: #if defined(LT) || defined(RN) movl KK, %eax @@ -1737,7 +1737,7 @@ pxor %xmm0, %xmm5 #endif #endif - + addps %xmm5, %xmm4 #if defined(LN) || defined(LT) diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_sse.S b/kernel/x86/ztrsm_kernel_LN_2x2_sse.S index b3978136a..61ce10d7a 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x2_sse.S +++ b/kernel/x86/ztrsm_kernel_LN_2x2_sse.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -231,7 +231,7 @@ addl $STACK_OFFSET, %esp STACK_TOUCHING - + movl STACK_M, %ebx movl STACK_N, %eax movl STACK_K, %ecx @@ -295,7 +295,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -314,7 +314,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, %ecx @@ -330,7 +330,7 @@ sall $1 + ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 4), BB -#endif +#endif #if defined(LT) movl OFFSET, %eax @@ -454,7 +454,7 @@ movl KK, %eax sall $3 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -588,7 +588,7 @@ decl %eax jne .L41 ALIGN_4 - + .L42: #if defined(LT) || defined(RN) movl KK, %eax @@ -903,7 +903,7 @@ movl KK, %eax sall $3 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 @@ -942,7 +942,7 @@ decl %eax jne .L11 ALIGN_4 - + .L15: #if defined(LT) || defined(RN) movl KK, %eax @@ -1373,7 +1373,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, %ecx @@ -1389,7 +1389,7 @@ sall $ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 4), BB -#endif +#endif #if defined(LT) movl OFFSET, %eax @@ -1510,7 +1510,7 @@ movl KK, %eax sall $2 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif #ifdef movsd xorps %xmm0, %xmm0 @@ -1600,7 +1600,7 @@ decl %eax jne .L141 ALIGN_4 - + .L142: #if defined(LT) || defined(RN) movl KK, %eax @@ -1797,7 +1797,7 @@ movl KK, %eax sall $2 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -1884,7 +1884,7 @@ decl %eax jne .L111 ALIGN_4 - + .L112: #if defined(LT) || defined(RN) movl KK, %eax diff --git a/kernel/x86/ztrsm_kernel_LN_4x1_sse.S b/kernel/x86/ztrsm_kernel_LN_4x1_sse.S index 877a3ba4f..15a53f55b 100644 --- a/kernel/x86/ztrsm_kernel_LN_4x1_sse.S +++ b/kernel/x86/ztrsm_kernel_LN_4x1_sse.S @@ -45,7 +45,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -160,7 +160,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -178,7 +178,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, BB @@ -194,7 +194,7 @@ sall $ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 4), BB -#endif +#endif #if defined(LT) movl OFFSET, %eax @@ -317,7 +317,7 @@ movl KK, %eax sall $2 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 @@ -405,7 +405,7 @@ decl %eax jne .L71 ALIGN_2 - + .L72: #if defined(LT) || defined(RN) movl KK, %eax @@ -576,7 +576,7 @@ movl KK, %eax sall $2 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 @@ -652,7 +652,7 @@ decl %eax jne .L51 ALIGN_4 - + .L52: #if defined(LT) || defined(RN) movl KK, %eax @@ -990,7 +990,7 @@ movl KK, %eax sall $2 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 @@ -1171,7 +1171,7 @@ addl $64 * SIZE, AA decl %eax jne .L11 - + .L12: #if defined(LT) || defined(RN) movl KK, %eax @@ -1852,7 +1852,7 @@ decl %ebx # i -- jg .L10 ALIGN_2 - + .L99: #ifdef LN movl K, %eax @@ -1881,7 +1881,7 @@ .L999: movl OLD_STACK, %esp - + EMMS popl %ebx diff --git a/kernel/x86/ztrsm_kernel_LT_1x1.S b/kernel/x86/ztrsm_kernel_LT_1x1.S index 5b13a54b8..c09380633 100644 --- a/kernel/x86/ztrsm_kernel_LT_1x1.S +++ b/kernel/x86/ztrsm_kernel_LT_1x1.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define KK 0 + STACK(%esp) #define KKK 4 + STACK(%esp) #define AORIG 8 + STACK(%esp) @@ -112,7 +112,7 @@ movl OFFSET, %eax negl %eax movl %eax, KK -#endif +#endif #ifdef RT movl STACK_N, %eax @@ -154,7 +154,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -178,7 +178,7 @@ addl %eax, B #else movl STACK_B, B -#endif +#endif fldz fldz diff --git a/kernel/x86/ztrsm_kernel_LT_1x1_atom.S b/kernel/x86/ztrsm_kernel_LT_1x1_atom.S index bc0d03e94..e2a527861 100644 --- a/kernel/x86/ztrsm_kernel_LT_1x1_atom.S +++ b/kernel/x86/ztrsm_kernel_LT_1x1_atom.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -57,7 +57,7 @@ #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) - + #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 8 + 3) @@ -101,7 +101,7 @@ movl OFFSET, %eax #ifdef RN negl %eax -#endif +#endif movl %eax, KK sall $ZBASE_SHIFT, LDC @@ -163,7 +163,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -195,7 +195,7 @@ movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movsd 0 * SIZE(AA), %xmm0 xorps %xmm2, %xmm2 @@ -416,7 +416,7 @@ decl %ebx # i -- jg .L10 - ALIGN_4 + ALIGN_4 .L99: #ifdef LN diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S index 452e3bf87..a11b0286a 100644 --- a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -57,7 +57,7 @@ #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) - + #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 @@ -98,12 +98,12 @@ movl OFFSET, %eax #ifdef RN negl %eax -#endif +#endif movl %eax, KK movl M, %ebx testl %ebx, %ebx - jle .L999 + jle .L999 subl $-16 * SIZE, A subl $-16 * SIZE, B @@ -169,7 +169,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -199,7 +199,7 @@ movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 @@ -598,7 +598,7 @@ decl %ebx # i -- jg .L10 - ALIGN_4 + ALIGN_4 .L99: #ifdef LN @@ -653,7 +653,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -683,7 +683,7 @@ L110: movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 @@ -955,7 +955,7 @@ L118: #ifdef RT subl $1, KK #endif - ALIGN_4 + ALIGN_4 .L999: popl %ebx diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_sse2.S b/kernel/x86/ztrsm_kernel_LT_1x2_sse2.S index fdeecc793..dfa5a55cb 100644 --- a/kernel/x86/ztrsm_kernel_LT_1x2_sse2.S +++ b/kernel/x86/ztrsm_kernel_LT_1x2_sse2.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -227,7 +227,7 @@ addl $STACK_OFFSET, %esp STACK_TOUCHING - + movl STACK_M, %ebx movl STACK_N, %eax movl STACK_K, %ecx @@ -279,7 +279,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -298,7 +298,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, BB @@ -314,7 +314,7 @@ sall $1 + ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 2), BB -#endif +#endif #if defined(LT) movl OFFSET, %eax @@ -443,7 +443,7 @@ movl KK, %eax sall $1 + ZBASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -473,7 +473,7 @@ andl $-8, %eax sall $4, %eax je .L15 -.L1X: +.L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ -851,7 +851,7 @@ decl %ebx # i -- jg .L10 - ALIGN_4 + ALIGN_4 .L99: #ifdef LN @@ -890,7 +890,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, BB @@ -906,7 +906,7 @@ sall $ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 2), BB -#endif +#endif #if defined(LT) movl OFFSET, %eax @@ -1005,7 +1005,7 @@ #endif movl M, %ebx - testl %ebx, %ebx + testl %ebx, %ebx jle .L199 ALIGN_4 @@ -1031,7 +1031,7 @@ movl KK, %eax sall $ZBASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -1289,7 +1289,7 @@ decl %ebx # i -- jg .L110 - ALIGN_4 + ALIGN_4 .L199: #ifdef LN @@ -1318,7 +1318,7 @@ movl OLD_STACK, %esp EMMS - + popl %ebx popl %esi popl %edi diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_sse3.S b/kernel/x86/ztrsm_kernel_LT_1x2_sse3.S index 29103bad2..9ab1b9d5d 100644 --- a/kernel/x86/ztrsm_kernel_LT_1x2_sse3.S +++ b/kernel/x86/ztrsm_kernel_LT_1x2_sse3.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -57,7 +57,7 @@ #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) - + #ifdef PENTIUM4 #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 @@ -218,7 +218,7 @@ movl OFFSET, %eax #ifdef RN negl %eax -#endif +#endif movl %eax, KK sall $ZBASE_SHIFT, LDC @@ -282,7 +282,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -314,7 +314,7 @@ movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -606,7 +606,7 @@ decl %ebx # i -- jg .L10 - ALIGN_4 + ALIGN_4 .L99: #ifdef LN @@ -661,7 +661,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -669,7 +669,7 @@ #endif movl M, %ebx - testl %ebx, %ebx + testl %ebx, %ebx jle .L500 ALIGN_4 @@ -693,7 +693,7 @@ L110: movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -951,7 +951,7 @@ L114: #ifdef RT subl $1, KK #endif - ALIGN_4 + ALIGN_4 .L500: popl %ebx diff --git a/kernel/x86/ztrsm_kernel_LT_2x1_core2.S b/kernel/x86/ztrsm_kernel_LT_2x1_core2.S index 467465430..d971aeb90 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x1_core2.S +++ b/kernel/x86/ztrsm_kernel_LT_2x1_core2.S @@ -47,7 +47,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -114,7 +114,7 @@ addl $STACK_OFFSET, %esp STACK_TOUCHING - + movd STACK_M, %mm0 movl STACK_N, %eax movd STACK_K, %mm1 @@ -166,7 +166,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -185,7 +185,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal 16 * SIZE + BUFFER, BB @@ -201,7 +201,7 @@ sall $ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 2), BB -#endif +#endif #if defined(LT) movl OFFSET, %eax @@ -314,7 +314,7 @@ movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -458,7 +458,7 @@ decl %eax jne .L12 ALIGN_4 - + .L15: #if defined(LT) || defined(RN) movl KK, %eax @@ -757,7 +757,7 @@ movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movapd -16 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 diff --git a/kernel/x86/ztrsm_kernel_LT_2x1_sse2.S b/kernel/x86/ztrsm_kernel_LT_2x1_sse2.S index 77f30264d..9c25dc0ed 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x1_sse2.S +++ b/kernel/x86/ztrsm_kernel_LT_2x1_sse2.S @@ -47,7 +47,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -276,7 +276,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -295,7 +295,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, BB @@ -311,7 +311,7 @@ sall $ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 2), BB -#endif +#endif #if defined(LT) movl OFFSET, %eax @@ -357,7 +357,7 @@ movapd %xmm7, 14 * SIZE(BB) prefetcht0 104 * SIZE(B) - + addl $ 8 * SIZE, B addl $16 * SIZE, BB decl %eax @@ -438,7 +438,7 @@ movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movapd 0 * SIZE(BB), %xmm2 pxor %xmm4, %xmm4 @@ -465,7 +465,7 @@ NOBRANCH je .L12 sall $3, %eax - + .L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) @@ -562,7 +562,7 @@ .L11: leal (BB, %eax, 4), BB leal (AA, %eax, 4), AA - + .L12: #if defined(LT) || defined(RN) movl KK, %eax @@ -876,7 +876,7 @@ movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movapd 0 * SIZE(BB), %xmm1 pxor %xmm4, %xmm4 diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S index 64232fdfb..787ab5982 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -55,7 +55,7 @@ #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) - + #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 @@ -101,12 +101,12 @@ movl OFFSET, %eax #ifdef RN negl %eax -#endif +#endif movl %eax, KK movl M, %ebx testl %ebx, %ebx - jle .L999 + jle .L999 subl $-32 * SIZE, A subl $-32 * SIZE, B @@ -134,7 +134,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -177,7 +177,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -209,7 +209,7 @@ movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 @@ -368,11 +368,11 @@ movaps -32 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -32 * SIZE(AA), %xmm0 - + decl %eax jne .L11 ALIGN_4 - + .L15: #if defined(LT) || defined(RN) movl KK, %eax @@ -775,7 +775,7 @@ movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movsd -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 @@ -933,7 +933,7 @@ decl %eax jne .L41 ALIGN_4 - + .L42: #if defined(LT) || defined(RN) movl KK, %eax @@ -1006,7 +1006,7 @@ pxor %xmm0, %xmm7 #endif #endif - + addps %xmm5, %xmm4 addps %xmm7, %xmm6 @@ -1258,7 +1258,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -1290,7 +1290,7 @@ movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 @@ -1391,11 +1391,11 @@ subl $-32 * SIZE, AA subl $-16 * SIZE, BB - + decl %eax jne .L111 ALIGN_4 - + .L112: #if defined(LT) || defined(RN) movl KK, %eax @@ -1457,7 +1457,7 @@ pxor %xmm0, %xmm5 #endif #endif - + addps %xmm5, %xmm4 #if defined(LN) || defined(LT) @@ -1678,7 +1678,7 @@ movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movsd -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 @@ -1777,11 +1777,11 @@ subl $-16 * SIZE, AA subl $-16 * SIZE, BB - + decl %eax jne .L141 ALIGN_4 - + .L142: #if defined(LT) || defined(RN) movl KK, %eax @@ -1839,7 +1839,7 @@ pxor %xmm0, %xmm5 #endif #endif - + addps %xmm5, %xmm4 #if defined(LN) || defined(LT) diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_sse.S b/kernel/x86/ztrsm_kernel_LT_2x2_sse.S index 5ff93930c..a4c2ab787 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x2_sse.S +++ b/kernel/x86/ztrsm_kernel_LT_2x2_sse.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -295,7 +295,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -314,7 +314,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, %ecx @@ -330,7 +330,7 @@ sall $1 + ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 4), BB -#endif +#endif #if defined(LT) movl OFFSET, %eax @@ -454,7 +454,7 @@ movl KK, %eax sall $3 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 @@ -493,7 +493,7 @@ decl %eax jne .L11 ALIGN_4 - + .L15: #if defined(LT) || defined(RN) movl KK, %eax @@ -915,7 +915,7 @@ movl KK, %eax sall $3 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -1049,7 +1049,7 @@ decl %eax jne .L41 ALIGN_4 - + .L42: #if defined(LT) || defined(RN) movl KK, %eax @@ -1373,7 +1373,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, %ecx @@ -1389,7 +1389,7 @@ sall $ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 4), BB -#endif +#endif #if defined(LT) movl OFFSET, %eax @@ -1512,7 +1512,7 @@ movl KK, %eax sall $2 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -1599,7 +1599,7 @@ decl %eax jne .L111 ALIGN_4 - + .L112: #if defined(LT) || defined(RN) movl KK, %eax @@ -1906,7 +1906,7 @@ movl KK, %eax sall $2 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif #ifdef movsd xorps %xmm0, %xmm0 @@ -1996,7 +1996,7 @@ decl %eax jne .L141 ALIGN_4 - + .L142: #if defined(LT) || defined(RN) movl KK, %eax diff --git a/kernel/x86/ztrsm_kernel_LT_4x1_sse.S b/kernel/x86/ztrsm_kernel_LT_4x1_sse.S index 4f324bced..57b2133b9 100644 --- a/kernel/x86/ztrsm_kernel_LT_4x1_sse.S +++ b/kernel/x86/ztrsm_kernel_LT_4x1_sse.S @@ -45,7 +45,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -160,7 +160,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -178,7 +178,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, BB @@ -194,7 +194,7 @@ sall $ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 4), BB -#endif +#endif #if defined(LT) movl OFFSET, %eax @@ -319,7 +319,7 @@ movl KK, %eax sall $2 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 @@ -500,7 +500,7 @@ addl $64 * SIZE, AA decl %eax jne .L11 - + .L12: #if defined(LT) || defined(RN) movl KK, %eax @@ -1184,7 +1184,7 @@ decl %ebx # i -- jg .L10 ALIGN_2 - + .L50: movl M, %ebx testl $2, %ebx @@ -1211,7 +1211,7 @@ movl KK, %eax sall $2 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps 0 * SIZE(BB), %xmm2 xorps %xmm4, %xmm4 @@ -1287,7 +1287,7 @@ decl %eax jne .L51 ALIGN_4 - + .L52: #if defined(LT) || defined(RN) movl KK, %eax @@ -1623,7 +1623,7 @@ movl KK, %eax sall $2 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps 0 * SIZE(BB), %xmm2 @@ -1713,7 +1713,7 @@ decl %eax jne .L71 ALIGN_2 - + .L72: #if defined(LT) || defined(RN) movl KK, %eax @@ -1886,7 +1886,7 @@ .L999: movl OLD_STACK, %esp - + EMMS popl %ebx diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S index 71246d7a6..9a3b0cbd7 100644 --- a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -57,7 +57,7 @@ #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) - + #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 @@ -98,12 +98,12 @@ movl OFFSET, %eax #ifdef RN negl %eax -#endif +#endif movl %eax, KK movl M, %ebx testl %ebx, %ebx - jle .L999 + jle .L999 subl $-16 * SIZE, A subl $-16 * SIZE, B @@ -164,7 +164,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -194,7 +194,7 @@ L110: movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 @@ -466,7 +466,7 @@ L118: #ifdef RT subl $1, KK #endif - ALIGN_4 + ALIGN_4 .L100: movl N, %eax @@ -503,7 +503,7 @@ L118: movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -533,7 +533,7 @@ L118: movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps -16 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 @@ -932,7 +932,7 @@ L118: decl %ebx # i -- jg .L10 - ALIGN_4 + ALIGN_4 .L99: #ifdef LN diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_sse2.S b/kernel/x86/ztrsm_kernel_RT_1x2_sse2.S index 882486813..108d4beee 100644 --- a/kernel/x86/ztrsm_kernel_RT_1x2_sse2.S +++ b/kernel/x86/ztrsm_kernel_RT_1x2_sse2.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -277,7 +277,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -295,7 +295,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, BB @@ -311,7 +311,7 @@ sall $ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 2), BB -#endif +#endif #if defined(LT) movl OFFSET, %eax @@ -410,7 +410,7 @@ #endif movl M, %ebx - testl %ebx, %ebx + testl %ebx, %ebx jle .L199 ALIGN_4 @@ -436,7 +436,7 @@ movl KK, %eax sall $ZBASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 @@ -694,7 +694,7 @@ decl %ebx # i -- jg .L110 - ALIGN_4 + ALIGN_4 .L199: #ifdef LN @@ -731,7 +731,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, BB @@ -747,7 +747,7 @@ sall $1 + ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 2), BB -#endif +#endif #if defined(LT) movl OFFSET, %eax @@ -876,7 +876,7 @@ movl KK, %eax sall $1 + ZBASE_SHIFT, %eax leal (BB, %eax, 2), BB -#endif +#endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -906,7 +906,7 @@ andl $-8, %eax sall $4, %eax je .L15 -.L1X: +.L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ -1284,7 +1284,7 @@ decl %ebx # i -- jg .L10 - ALIGN_4 + ALIGN_4 .L99: #ifdef LN diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_sse3.S b/kernel/x86/ztrsm_kernel_RT_1x2_sse3.S index 8b7bf6bf7..7f7e4d33a 100644 --- a/kernel/x86/ztrsm_kernel_RT_1x2_sse3.S +++ b/kernel/x86/ztrsm_kernel_RT_1x2_sse3.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -57,7 +57,7 @@ #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) - + #ifdef PENTIUM4 #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 @@ -218,7 +218,7 @@ movl OFFSET, %eax #ifdef RN negl %eax -#endif +#endif movl %eax, KK sall $ZBASE_SHIFT, LDC @@ -277,7 +277,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -285,7 +285,7 @@ #endif movl M, %ebx - testl %ebx, %ebx + testl %ebx, %ebx jle .L500 ALIGN_4 @@ -309,7 +309,7 @@ L110: movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -567,7 +567,7 @@ L114: #ifdef RT subl $1, KK #endif - ALIGN_4 + ALIGN_4 .L100: movl N, %eax @@ -604,7 +604,7 @@ L114: movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -636,7 +636,7 @@ L114: movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movapd 0 * SIZE(AA), %xmm0 pxor %xmm4, %xmm4 @@ -928,7 +928,7 @@ L114: decl %ebx # i -- jg .L10 - ALIGN_4 + ALIGN_4 .L99: #ifdef LN diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S index dfd555c88..bd7a78b5a 100644 --- a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 16 - + #define M 4 + STACK + ARGS(%esp) #define N 8 + STACK + ARGS(%esp) #define K 12 + STACK + ARGS(%esp) @@ -55,7 +55,7 @@ #define KK 4 + STACK(%esp) #define KKK 8 + STACK(%esp) #define AORIG 12 + STACK(%esp) - + #if defined(PENRYN) || defined(DUNNINGTON) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 @@ -101,12 +101,12 @@ movl OFFSET, %eax #ifdef RN negl %eax -#endif +#endif movl %eax, KK movl M, %ebx testl %ebx, %ebx - jle .L999 + jle .L999 subl $-32 * SIZE, A subl $-32 * SIZE, B @@ -134,7 +134,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -172,7 +172,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -204,7 +204,7 @@ movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 @@ -305,11 +305,11 @@ subl $-32 * SIZE, AA subl $-16 * SIZE, BB - + decl %eax jne .L111 ALIGN_4 - + .L112: #if defined(LT) || defined(RN) movl KK, %eax @@ -371,7 +371,7 @@ pxor %xmm0, %xmm5 #endif #endif - + addps %xmm5, %xmm4 #if defined(LN) || defined(LT) @@ -592,7 +592,7 @@ movl KK, %eax sall $ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movsd -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 @@ -691,11 +691,11 @@ subl $-16 * SIZE, AA subl $-16 * SIZE, BB - + decl %eax jne .L141 ALIGN_4 - + .L142: #if defined(LT) || defined(RN) movl KK, %eax @@ -753,7 +753,7 @@ pxor %xmm0, %xmm5 #endif #endif - + addps %xmm5, %xmm4 #if defined(LN) || defined(LT) @@ -904,7 +904,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif #ifdef LT movl OFFSET, %eax @@ -936,7 +936,7 @@ movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 @@ -1095,11 +1095,11 @@ movaps -32 * SIZE(BB), %xmm1 mulps %xmm0, %xmm2 movaps -32 * SIZE(AA), %xmm0 - + decl %eax jne .L11 ALIGN_4 - + .L15: #if defined(LT) || defined(RN) movl KK, %eax @@ -1502,7 +1502,7 @@ movl KK, %eax sall $1 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movsd -32 * SIZE(AA), %xmm0 pxor %xmm2, %xmm2 @@ -1660,7 +1660,7 @@ decl %eax jne .L41 ALIGN_4 - + .L42: #if defined(LT) || defined(RN) movl KK, %eax @@ -1733,7 +1733,7 @@ pxor %xmm0, %xmm7 #endif #endif - + addps %xmm5, %xmm4 addps %xmm7, %xmm6 diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_sse.S b/kernel/x86/ztrsm_kernel_RT_2x2_sse.S index 92492521b..5cd0dd5f2 100644 --- a/kernel/x86/ztrsm_kernel_RT_2x2_sse.S +++ b/kernel/x86/ztrsm_kernel_RT_2x2_sse.S @@ -41,7 +41,7 @@ #define STACK 16 #define ARGS 0 - + #define STACK_M 4 + STACK + ARGS(%esi) #define STACK_N 8 + STACK + ARGS(%esi) #define STACK_K 12 + STACK + ARGS(%esi) @@ -295,7 +295,7 @@ #ifdef RN negl KK -#endif +#endif #ifdef RT movl N, %eax @@ -313,7 +313,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, %ecx @@ -329,7 +329,7 @@ sall $ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 4), BB -#endif +#endif #if defined(LT) movl OFFSET, %eax @@ -452,7 +452,7 @@ movl KK, %eax sall $2 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -539,7 +539,7 @@ decl %eax jne .L111 ALIGN_4 - + .L112: #if defined(LT) || defined(RN) movl KK, %eax @@ -846,7 +846,7 @@ movl KK, %eax sall $2 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif #ifdef movsd xorps %xmm0, %xmm0 @@ -936,7 +936,7 @@ decl %eax jne .L141 ALIGN_4 - + .L142: #if defined(LT) || defined(RN) movl KK, %eax @@ -1140,7 +1140,7 @@ movl OFFSET, %eax addl M, %eax movl %eax, KK -#endif +#endif leal BUFFER, %ecx @@ -1156,7 +1156,7 @@ sall $1 + ZBASE_SHIFT, %eax addl %eax, B leal (BB, %eax, 4), BB -#endif +#endif #if defined(LT) movl OFFSET, %eax @@ -1280,7 +1280,7 @@ movl KK, %eax sall $3 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif movaps 0 * SIZE(AA), %xmm0 xorps %xmm4, %xmm4 @@ -1319,7 +1319,7 @@ decl %eax jne .L11 ALIGN_4 - + .L15: #if defined(LT) || defined(RN) movl KK, %eax @@ -1741,7 +1741,7 @@ movl KK, %eax sall $3 + ZBASE_SHIFT, %eax addl %eax, BB -#endif +#endif xorps %xmm4, %xmm4 xorps %xmm5, %xmm5 @@ -1875,7 +1875,7 @@ decl %eax jne .L41 ALIGN_4 - + .L42: #if defined(LT) || defined(RN) movl KK, %eax diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index e6a97152e..d3486283e 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -119,7 +119,7 @@ XCOPYKERNEL = zcopy.S endif ifndef SDOTKERNEL -SDOTKERNEL = ../arm/dot.c +SDOTKERNEL = ../arm/dot.c endif ifndef DDOTKERNEL diff --git a/kernel/x86_64/KERNEL.ATOM b/kernel/x86_64/KERNEL.ATOM index cfbd05a62..c24848341 100644 --- a/kernel/x86_64/KERNEL.ATOM +++ b/kernel/x86_64/KERNEL.ATOM @@ -29,8 +29,8 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = gemm_ncopy_4.S SGEMMOTCOPY = gemm_tcopy_4.S -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_4x2_atom.S diff --git a/kernel/x86_64/KERNEL.BARCELONA b/kernel/x86_64/KERNEL.BARCELONA index aac21d4ec..b1e099e4c 100644 --- a/kernel/x86_64/KERNEL.BARCELONA +++ b/kernel/x86_64/KERNEL.BARCELONA @@ -9,8 +9,8 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = gemm_ncopy_4_opteron.S SGEMMOTCOPY = gemm_tcopy_4_opteron.S -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_4x4_barcelona.S diff --git a/kernel/x86_64/KERNEL.BOBCAT b/kernel/x86_64/KERNEL.BOBCAT index 051a52286..2b6b2fe59 100644 --- a/kernel/x86_64/KERNEL.BOBCAT +++ b/kernel/x86_64/KERNEL.BOBCAT @@ -6,8 +6,8 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = gemm_ncopy_4_opteron.S SGEMMOTCOPY = gemm_tcopy_4_opteron.S -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_4x4_barcelona.S diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 7a74c38d1..f8ac3db0e 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -16,8 +16,8 @@ SGEMMITCOPY = ../generic/gemm_tcopy_16.c SGEMMONCOPY = gemm_ncopy_2_bulldozer.S SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S diff --git a/kernel/x86_64/KERNEL.CORE2 b/kernel/x86_64/KERNEL.CORE2 index 8a07e8084..867c94128 100644 --- a/kernel/x86_64/KERNEL.CORE2 +++ b/kernel/x86_64/KERNEL.CORE2 @@ -3,8 +3,8 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = gemm_ncopy_4.S SGEMMOTCOPY = gemm_tcopy_4.S -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_4x4_core2.S diff --git a/kernel/x86_64/KERNEL.DUNNINGTON b/kernel/x86_64/KERNEL.DUNNINGTON index b96daa03f..8c2a23c29 100644 --- a/kernel/x86_64/KERNEL.DUNNINGTON +++ b/kernel/x86_64/KERNEL.DUNNINGTON @@ -3,8 +3,8 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = gemm_ncopy_4.S SGEMMOTCOPY = gemm_tcopy_4.S -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_4x4_penryn.S diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 417c6e73f..2f629de2a 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -13,12 +13,12 @@ SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_4x4_haswell.S -DGEMMINCOPY = -DGEMMITCOPY = +DGEMMINCOPY = +DGEMMITCOPY = DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMINCOPYOBJ = -DGEMMITCOPYOBJ = +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/x86_64/KERNEL.NANO b/kernel/x86_64/KERNEL.NANO index 0b771a451..e30bd2b33 100644 --- a/kernel/x86_64/KERNEL.NANO +++ b/kernel/x86_64/KERNEL.NANO @@ -3,8 +3,8 @@ SGEMMINCOPY = gemm_ncopy_4.S SGEMMITCOPY = gemm_tcopy_4.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = ../generic/gemm_tcopy_8.c -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_4x4_penryn.S diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM index 736e41940..e3898b0bc 100644 --- a/kernel/x86_64/KERNEL.NEHALEM +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -7,8 +7,8 @@ SGEMMINCOPY = gemm_ncopy_4.S SGEMMITCOPY = gemm_tcopy_4.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = ../generic/gemm_tcopy_8.c -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/x86_64/KERNEL.OPTERON b/kernel/x86_64/KERNEL.OPTERON index 27fb78598..d917c2744 100644 --- a/kernel/x86_64/KERNEL.OPTERON +++ b/kernel/x86_64/KERNEL.OPTERON @@ -3,8 +3,8 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = gemm_ncopy_4_opteron.S SGEMMOTCOPY = gemm_tcopy_4_opteron.S -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_4x4_sse2.S diff --git a/kernel/x86_64/KERNEL.OPTERON_SSE3 b/kernel/x86_64/KERNEL.OPTERON_SSE3 index 565daf366..9367bd62a 100644 --- a/kernel/x86_64/KERNEL.OPTERON_SSE3 +++ b/kernel/x86_64/KERNEL.OPTERON_SSE3 @@ -6,8 +6,8 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = gemm_ncopy_4_opteron.S SGEMMOTCOPY = gemm_tcopy_4_opteron.S -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_4x4_sse2.S diff --git a/kernel/x86_64/KERNEL.PENRYN b/kernel/x86_64/KERNEL.PENRYN index b96daa03f..8c2a23c29 100644 --- a/kernel/x86_64/KERNEL.PENRYN +++ b/kernel/x86_64/KERNEL.PENRYN @@ -3,8 +3,8 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = gemm_ncopy_4.S SGEMMOTCOPY = gemm_tcopy_4.S -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_4x4_penryn.S diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 0f5b52616..a06a04cfa 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -15,8 +15,8 @@ SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c SGEMMONCOPY = gemm_ncopy_2_bulldozer.S SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/x86_64/KERNEL.PRESCOTT b/kernel/x86_64/KERNEL.PRESCOTT index e15553190..03d6664e8 100644 --- a/kernel/x86_64/KERNEL.PRESCOTT +++ b/kernel/x86_64/KERNEL.PRESCOTT @@ -6,8 +6,8 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = gemm_ncopy_4.S SGEMMOTCOPY = gemm_tcopy_4.S -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_4x4_sse3.S diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index 71055cfca..1b4228c84 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -33,12 +33,12 @@ CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_4x4_sandy.S -ZGEMMINCOPY = -ZGEMMITCOPY = +ZGEMMINCOPY = +ZGEMMITCOPY = ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c -ZGEMMINCOPYOBJ = -ZGEMMITCOPYOBJ = +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/x86_64/amax.S b/kernel/x86_64/amax.S index d096d883c..0e9bf4db4 100644 --- a/kernel/x86_64/amax.S +++ b/kernel/x86_64/amax.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M ARG1 #define X ARG2 #define INCX ARG3 @@ -68,7 +68,7 @@ FLD (X) #ifdef USE_ABS - fabs + fabs #endif addq INCX, X decq M @@ -81,7 +81,7 @@ sarq $3, I jle .L20 ALIGN_4 - + .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -89,7 +89,7 @@ FLD 0 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -98,7 +98,7 @@ FLD 1 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -107,7 +107,7 @@ FLD 2 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -116,7 +116,7 @@ FLD 3 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -125,7 +125,7 @@ FLD 4 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -134,7 +134,7 @@ FLD 5 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -143,7 +143,7 @@ FLD 6 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -152,7 +152,7 @@ FLD 7 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -175,7 +175,7 @@ .L21: FLD 0 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -193,12 +193,12 @@ sarq $3, I jle .L60 ALIGN_4 - + .L50: FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -208,7 +208,7 @@ FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -218,7 +218,7 @@ FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -228,7 +228,7 @@ FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -238,7 +238,7 @@ FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -248,7 +248,7 @@ FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -258,7 +258,7 @@ FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -268,7 +268,7 @@ FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -289,7 +289,7 @@ .L61: FLD 0 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) diff --git a/kernel/x86_64/amax_atom.S b/kernel/x86_64/amax_atom.S index fa7b9a366..6164cb3e3 100644 --- a/kernel/x86_64/amax_atom.S +++ b/kernel/x86_64/amax_atom.S @@ -38,13 +38,13 @@ #define ASSEMBLER #include "common.h" - + #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define I %rax - + #ifdef USE_MIN #define maxsd minsd #endif @@ -103,7 +103,7 @@ decq I jle .L13 ALIGN_4 - + .L12: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -227,7 +227,7 @@ maxsd %xmm7, %xmm3 addq $4 * SIZE, X - ALIGN_3 + ALIGN_3 .L17: testq $2, M @@ -246,7 +246,7 @@ maxsd %xmm5, %xmm2 addq $2 * SIZE, X ALIGN_3 - + .L18: testq $1, M jle .L998 @@ -284,7 +284,7 @@ decq I jle .L23 ALIGN_4 - + .L22: #ifdef USE_ABS andps %xmm15, %xmm4 @@ -412,7 +412,7 @@ andps %xmm15, %xmm7 #endif maxsd %xmm7, %xmm3 - ALIGN_3 + ALIGN_3 .L27: testq $2, M @@ -432,7 +432,7 @@ #endif maxsd %xmm5, %xmm2 ALIGN_3 - + .L28: testq $1, M jle .L998 diff --git a/kernel/x86_64/amax_sse.S b/kernel/x86_64/amax_sse.S index 22b8b16d2..2349905d3 100644 --- a/kernel/x86_64/amax_sse.S +++ b/kernel/x86_64/amax_sse.S @@ -38,18 +38,18 @@ #define ASSEMBLER #include "common.h" - + #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define I %rax - + #ifdef USE_MIN #define maxps minps #define maxss minss #endif - + #include "l1param.h" PROLOGUE @@ -126,7 +126,7 @@ decq I jle .L12 ALIGN_4 - + .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -297,7 +297,7 @@ #endif maxps %xmm4, %xmm2 addq $4 * SIZE, X - ALIGN_3 + ALIGN_3 .L18: testq $2, M @@ -311,7 +311,7 @@ maxps %xmm4, %xmm3 addq $2 * SIZE, X ALIGN_3 - + .L19: testq $1, M je .L998 @@ -329,7 +329,7 @@ sarq $3, I jle .L45 ALIGN_4 - + .L41: movss (X), %xmm4 addq INCX, X @@ -422,7 +422,7 @@ andps %xmm15, %xmm7 #endif maxss %xmm7, %xmm3 - ALIGN_3 + ALIGN_3 .L46: testq $2, M @@ -442,7 +442,7 @@ #endif maxss %xmm5, %xmm1 ALIGN_3 - + .L47: testq $1, M je .L998 diff --git a/kernel/x86_64/amax_sse2.S b/kernel/x86_64/amax_sse2.S index 033e8e176..44ddaba77 100644 --- a/kernel/x86_64/amax_sse2.S +++ b/kernel/x86_64/amax_sse2.S @@ -38,13 +38,13 @@ #define ASSEMBLER #include "common.h" - + #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define I %rax - + #ifdef USE_MIN #define maxpd minpd #define maxsd minsd @@ -112,7 +112,7 @@ decq I jle .L12 ALIGN_4 - + .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -270,7 +270,7 @@ maxpd %xmm5, %xmm1 addq $4 * SIZE, X - ALIGN_3 + ALIGN_3 .L17: testq $2, M @@ -282,8 +282,8 @@ #endif maxpd %xmm4, %xmm2 addq $2 * SIZE, X - ALIGN_3 - + ALIGN_3 + .L18: testq $1, M jle .L998 @@ -302,7 +302,7 @@ sarq $4, I jle .L45 ALIGN_4 - + .L41: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -453,7 +453,7 @@ andps %xmm15, %xmm5 #endif maxpd %xmm5, %xmm1 - ALIGN_3 + ALIGN_3 .L47: testq $2, M @@ -468,7 +468,7 @@ #endif maxpd %xmm6, %xmm2 ALIGN_3 - + .L48: testq $1, M je .L998 diff --git a/kernel/x86_64/asum.S b/kernel/x86_64/asum.S index 13c6f4fa2..31f973894 100644 --- a/kernel/x86_64/asum.S +++ b/kernel/x86_64/asum.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M ARG1 #define X ARG2 #define INCX ARG3 @@ -68,7 +68,7 @@ sarq $3, I jle .L20 ALIGN_4 - + .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -128,7 +128,7 @@ sarq $3, I jle .L60 ALIGN_4 - + .L50: FLD (X) addq INCX, X diff --git a/kernel/x86_64/asum_atom.S b/kernel/x86_64/asum_atom.S index b6ea65f01..910a48f09 100644 --- a/kernel/x86_64/asum_atom.S +++ b/kernel/x86_64/asum_atom.S @@ -38,20 +38,20 @@ #define ASSEMBLER #include "common.h" - + #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define I %rax - + #include "l1param.h" PROLOGUE PROFCODE SAVEREGISTERS - + xorps %xmm0, %xmm0 testq M, M jle .L999 @@ -101,7 +101,7 @@ decq I jle .L11 ALIGN_4 - + .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -256,7 +256,7 @@ pshufd $0x4e, %xmm5, %xmm13 addsd %xmm5, %xmm2 addsd %xmm13, %xmm3 - ALIGN_3 + ALIGN_3 .L14: testq $2, M @@ -269,8 +269,8 @@ pshufd $0x4e, %xmm4, %xmm5 addsd %xmm4, %xmm2 addsd %xmm5, %xmm3 - ALIGN_3 - + ALIGN_3 + .L15: testq $1, M je .L998 @@ -306,7 +306,7 @@ decq I jle .L23 ALIGN_4 - + .L22: andps %xmm15, %xmm4 addq INCX, X @@ -391,7 +391,7 @@ addsd %xmm6, %xmm2 andps %xmm15, %xmm7 addsd %xmm7, %xmm3 - ALIGN_3 + ALIGN_3 .L26: testq $2, M @@ -408,7 +408,7 @@ addsd %xmm4, %xmm0 addsd %xmm5, %xmm1 ALIGN_3 - + .L27: testq $1, M je .L998 @@ -426,7 +426,7 @@ .L999: RESTOREREGISTERS - + ret EPILOGUE diff --git a/kernel/x86_64/asum_sse.S b/kernel/x86_64/asum_sse.S index 840e1939d..7d7004d48 100644 --- a/kernel/x86_64/asum_sse.S +++ b/kernel/x86_64/asum_sse.S @@ -38,20 +38,20 @@ #define ASSEMBLER #include "common.h" - + #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define I %rax - + #include "l1param.h" PROLOGUE PROFCODE SAVEREGISTERS - + xorps %xmm0, %xmm0 testq M, M jle .L999 @@ -112,7 +112,7 @@ decq I jle .L12 ALIGN_3 - + .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -259,7 +259,7 @@ sarq $3, I jle .L105 ALIGN_4 - + .L101: movss 0 * SIZE(X), %xmm4 addq INCX, X @@ -327,7 +327,7 @@ #ifndef HAVE_SSE3 movhlps %xmm0, %xmm1 addps %xmm1, %xmm0 - + movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 addss %xmm1, %xmm0 @@ -339,7 +339,7 @@ .L999: RESTOREREGISTERS - + ret EPILOGUE diff --git a/kernel/x86_64/asum_sse2.S b/kernel/x86_64/asum_sse2.S index 7286fc093..e75ebde13 100644 --- a/kernel/x86_64/asum_sse2.S +++ b/kernel/x86_64/asum_sse2.S @@ -38,20 +38,20 @@ #define ASSEMBLER #include "common.h" - + #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define I %rax - + #include "l1param.h" PROLOGUE PROFCODE SAVEREGISTERS - + xorps %xmm0, %xmm0 testq M, M jle .L999 @@ -101,7 +101,7 @@ decq I jle .L11 ALIGN_4 - + .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -209,7 +209,7 @@ addpd %xmm5, %xmm1 addq $4 * SIZE, X - ALIGN_3 + ALIGN_3 .L22: testq $2, M @@ -219,7 +219,7 @@ andps %xmm15, %xmm6 addpd %xmm6, %xmm3 addq $2 * SIZE, X - + .L23: testq $1, M je .L998 @@ -238,7 +238,7 @@ sarq $3, I jle .L60 ALIGN_4 - + .L50: movsd -16 * SIZE(X), %xmm4 addq INCX, X @@ -304,7 +304,7 @@ #endif RESTOREREGISTERS - + ret EPILOGUE diff --git a/kernel/x86_64/axpy.S b/kernel/x86_64/axpy.S index 478cc88e8..0ad6591b0 100644 --- a/kernel/x86_64/axpy.S +++ b/kernel/x86_64/axpy.S @@ -44,14 +44,14 @@ #define INCX ARG5 /* rdx */ #define Y ARG6 /* rcx */ #define INCY ARG2 /* r8 */ - + #define ALPHA 8(%rsp) #include "l1param.h" PROLOGUE PROFCODE - + movq 24(%rsp), INCY FLD ALPHA @@ -61,7 +61,7 @@ testq M, M jle .L40 - + cmpq $SIZE, INCX jne .L14 cmpq $SIZE, INCY diff --git a/kernel/x86_64/axpy_atom.S b/kernel/x86_64/axpy_atom.S index a786329e4..adfd69164 100644 --- a/kernel/x86_64/axpy_atom.S +++ b/kernel/x86_64/axpy_atom.S @@ -84,7 +84,7 @@ testq M, M jle .L29 - + cmpq $SIZE, INCX jne .L20 cmpq $SIZE, INCY diff --git a/kernel/x86_64/axpy_sse.S b/kernel/x86_64/axpy_sse.S index 2a9e928ed..dd52a7c83 100644 --- a/kernel/x86_64/axpy_sse.S +++ b/kernel/x86_64/axpy_sse.S @@ -69,7 +69,7 @@ #endif movaps %xmm0, ALPHA #else - + movq 40(%rsp), X movq 48(%rsp), INCX @@ -82,7 +82,7 @@ #ifdef WINDOWS_ABI movaps %xmm3, ALPHA #endif - + shufps $0, ALPHA, ALPHA leaq (, INCX, SIZE), INCX @@ -90,7 +90,7 @@ testq M, M jle .L19 - + cmpq $SIZE, INCX jne .L50 cmpq $SIZE, INCY @@ -368,7 +368,7 @@ .L20: #ifdef ALIGNED_ACCESS - + testq $SIZE, X jne .L30 diff --git a/kernel/x86_64/axpy_sse2.S b/kernel/x86_64/axpy_sse2.S index 45c7b0380..9b07b9020 100644 --- a/kernel/x86_64/axpy_sse2.S +++ b/kernel/x86_64/axpy_sse2.S @@ -57,7 +57,7 @@ #define ALPHA %xmm15 #include "l1param.h" - + PROLOGUE PROFCODE @@ -89,7 +89,7 @@ testq M, M jle .L47 - + cmpq $SIZE, INCX jne .L40 cmpq $SIZE, INCY @@ -813,7 +813,7 @@ je .L46 cmpq $0, INCY je .L46 - + sarq $3, %rax jle .L45 ALIGN_3 diff --git a/kernel/x86_64/builtin_stinit.S b/kernel/x86_64/builtin_stinit.S index c05a1c547..cb3a28887 100644 --- a/kernel/x86_64/builtin_stinit.S +++ b/kernel/x86_64/builtin_stinit.S @@ -53,7 +53,7 @@ cmpq $4096, %rax jg .L01 ALIGN_3 - + .L999: subq %rax, %rsp ret diff --git a/kernel/x86_64/cabs.S b/kernel/x86_64/cabs.S index 0b1a91185..7de9ca4d5 100644 --- a/kernel/x86_64/cabs.S +++ b/kernel/x86_64/cabs.S @@ -46,7 +46,7 @@ movsd 0 * SIZE(ARG1), %xmm0 movsd 1 * SIZE(ARG1), %xmm1 pcmpeqb %xmm4, %xmm4 - + psrlq $1, %xmm4 andpd %xmm4, %xmm0 andpd %xmm4, %xmm1 @@ -55,13 +55,13 @@ movss 0 * SIZE(ARG1), %xmm0 movss 1 * SIZE(ARG1), %xmm1 pcmpeqb %xmm4, %xmm4 - + psrld $1, %xmm4 andps %xmm4, %xmm0 andps %xmm4, %xmm1 addps %xmm1, %xmm0 #endif - + #if !defined(DOUBLE) && defined(NEED_F2CCONV) cvtss2sd %xmm0, %xmm0 #endif diff --git a/kernel/x86_64/cgemm_kernel_4x8_sandy.S b/kernel/x86_64/cgemm_kernel_4x8_sandy.S index 5a5588089..487f95936 100644 --- a/kernel/x86_64/cgemm_kernel_4x8_sandy.S +++ b/kernel/x86_64/cgemm_kernel_4x8_sandy.S @@ -13,19 +13,19 @@ notice, this list of conditions and the following disclaimer. notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -3. Neither the name of the ISCAS nor the names of its contributors may -be used to endorse or promote products derived from this software +3. Neither the name of the ISCAS nor the names of its contributors may +be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ @@ -59,7 +59,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef WINDOWS_ABI -#define STACKSIZE 128 +#define STACKSIZE 128 #define old_ldc 8+STACKSIZE(%rsp) #define old_offset 16+STACKSIZE(%rsp) @@ -144,10 +144,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OR orq #define JNE jne #define JMP jmp -#define NOP +#define NOP #define XOR xorpd #undef MOVQ -#define MOVQ movq +#define MOVQ movq #define XOR_SY vxorps #define XOR_SX vxorps @@ -171,7 +171,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define EDUP_SX vmovsldup #define ODUP_SX vmovshdup -#define ADD_SY vaddps +#define ADD_SY vaddps #define ADD_SX vaddps #define SUB_SY vsubps #define SUB_SX vsubps @@ -189,7 +189,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VPERMILP_SX vpermilps #define BROAD_SY vbroadcastss -#define BROAD_SX vbroadcastss +#define BROAD_SX vbroadcastss #define MOV_SY vmovaps #define MOV_SX vmovaps @@ -214,7 +214,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ADD2_SY ADDSUB_SY #define ADD1_SX SUB_SX #define ADD2_SX ADDSUB_SX -#else +#else #define ADD1_SY ADD_SY #define ADD2_SY ADDSUB_SY #define ADD1_SX ADD_SX @@ -309,7 +309,7 @@ SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 8), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif -# Initial results register +# Initial results register PREFETCH0 0*SIZE(prebb); XOR_SY yvec15, yvec15, yvec15; PREFETCH0 16*SIZE(prebb); @@ -338,7 +338,7 @@ MOVQ %rax, kkk; MOVQ kk, %rax; #ifdef LEFT ADDQ $8, %rax; -#else +#else ADDQ $4, %rax; #endif MOVQ %rax, kkk; @@ -366,7 +366,7 @@ VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1 ADD1_SY yvec6, yvec14, yvec14; ADD1_SY yvec7, yvec12, yvec12; -MUL_SY yvec0, yvec4, yvec6; +MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1.. ADD1_SY yvec6, yvec11, yvec11; @@ -420,7 +420,7 @@ VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1 ADD1_SY yvec6, yvec14, yvec14; ADD1_SY yvec7, yvec12, yvec12; -MUL_SY yvec0, yvec4, yvec6; +MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1.. ADD1_SY yvec6, yvec11, yvec11; @@ -474,7 +474,7 @@ VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1 ADD1_SY yvec6, yvec14, yvec14; ADD1_SY yvec7, yvec12, yvec12; -MUL_SY yvec0, yvec4, yvec6; +MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1.. ADD1_SY yvec6, yvec11, yvec11; @@ -530,7 +530,7 @@ ADDQ $32*SIZE, ptrbb; ADD1_SY yvec6, yvec14, yvec14; ADD1_SY yvec7, yvec12, yvec12; -MUL_SY yvec0, yvec4, yvec6; +MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1.. ADD1_SY yvec6, yvec11, yvec11; @@ -573,7 +573,7 @@ ALIGN_5 .L2_loopE:; #ifndef TRMMKERNEL TEST $2, bk; -#else +#else TEST $2, kkk; #endif JLE .L3_loopE; @@ -595,7 +595,7 @@ ADD1_SY yvec6, yvec14, yvec14; ADD1_SY yvec7, yvec12, yvec12; ODUP_SY 0*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3 -MUL_SY yvec0, yvec4, yvec6; +MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1 ADD1_SY yvec6, yvec11, yvec11; @@ -650,7 +650,7 @@ ADD1_SY yvec6, yvec14, yvec14; ADD1_SY yvec7, yvec12, yvec12; ODUP_SY 8*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3 -MUL_SY yvec0, yvec4, yvec6; +MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; ADDQ $16*SIZE, ptrbb; VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1 @@ -714,7 +714,7 @@ ADD1_SY yvec6, yvec14, yvec14; ADD1_SY yvec7, yvec12, yvec12; ODUP_SY 0*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3 -MUL_SY yvec0, yvec4, yvec6; +MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; ADDQ $8*SIZE, ptrbb; VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1 @@ -903,7 +903,7 @@ SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 8), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif -#if defined(TRMMKERNEL) && defined(LEFT) +#if defined(TRMMKERNEL) && defined(LEFT) ADDQ $8, kk; #endif ADDQ $16*SIZE,C0; @@ -1048,7 +1048,7 @@ SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 8), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif -#if defined(TRMMKERNEL) && defined(LEFT) +#if defined(TRMMKERNEL) && defined(LEFT) ADDQ $8, kk; #endif ADDQ $16*SIZE, C0; @@ -1084,7 +1084,7 @@ MOVQ %rax, kkk; MOVQ kk, %rax; #ifdef LEFT ADDQ $4, %rax; -#else +#else ADDQ $4, %rax; #endif MOVQ %rax, kkk; @@ -1224,7 +1224,7 @@ ALIGN_5 .L8_loopE: #ifndef TRMMKERNEL TEST $2, bk; -#else +#else TEST $2, kkk; #endif JLE .L9_loopE; @@ -1462,7 +1462,7 @@ SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif -#if defined(TRMMKERNEL) && defined(LEFT) +#if defined(TRMMKERNEL) && defined(LEFT) ADDQ $4, kk; #endif @@ -1498,7 +1498,7 @@ MOVQ %rax, kkk; MOVQ kk, %rax; #ifdef LEFT ADDQ $2, %rax; -#else +#else ADDQ $4, %rax; #endif MOVQ %rax, kkk; @@ -1843,7 +1843,7 @@ SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 2), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif -#if defined(TRMMKERNEL) && defined(LEFT) +#if defined(TRMMKERNEL) && defined(LEFT) ADDQ $2, kk; #endif @@ -1876,7 +1876,7 @@ MOVQ %rax, kkk; MOVQ kk, %rax; #ifdef LEFT ADDQ $1, %rax; -#else +#else ADDQ $4, %rax; #endif MOVQ %rax, kkk; @@ -2090,7 +2090,7 @@ SALQ $ZBASE_SHIFT, %rax; ADDQ %rax, ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif -#if defined(TRMMKERNEL) && defined(LEFT) +#if defined(TRMMKERNEL) && defined(LEFT) ADDQ $1, kk; #endif @@ -2152,7 +2152,7 @@ MOVQ %rax, kkk; MOVQ kk, %rax; #ifdef LEFT ADDQ $8, %rax; -#else +#else ADDQ $2, %rax; #endif MOVQ %rax, kkk; @@ -2795,7 +2795,7 @@ SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 8), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif -#if defined(TRMMKERNEL) && defined(LEFT) +#if defined(TRMMKERNEL) && defined(LEFT) ADDQ $8, kk; #endif @@ -2832,7 +2832,7 @@ MOVQ %rax, kkk; MOVQ kk, %rax; #ifdef LEFT ADDQ $4, %rax; -#else +#else ADDQ $2, %rax; #endif MOVQ %rax, kkk; @@ -2984,7 +2984,7 @@ ALIGN_5 .L221_loopE: #ifndef TRMMKERNEL TEST $2, bk; -#else +#else TEST $2, kkk; #endif JLE .L222_loopE; @@ -3205,7 +3205,7 @@ SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif -#if defined(TRMMKERNEL) && defined(LEFT) +#if defined(TRMMKERNEL) && defined(LEFT) ADDQ $4, kk; #endif @@ -3238,7 +3238,7 @@ MOVQ %rax, kkk; MOVQ kk, %rax; #ifdef LEFT ADDQ $2, %rax; -#else +#else ADDQ $2, %rax; #endif MOVQ %rax, kkk; @@ -3337,7 +3337,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L232_loopE; -ALIGN_5 +ALIGN_5 .L232_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -3471,7 +3471,7 @@ SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 2), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif -#if defined(TRMMKERNEL) && defined(LEFT) +#if defined(TRMMKERNEL) && defined(LEFT) ADDQ $2, kk; #endif @@ -3503,7 +3503,7 @@ MOVQ %rax, kkk; MOVQ kk, %rax; #ifdef LEFT ADDQ $1, %rax; -#else +#else ADDQ $2, %rax; #endif MOVQ %rax, kkk; @@ -3646,7 +3646,7 @@ SALQ $ZBASE_SHIFT, %rax; ADDQ %rax, ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif -#if defined(TRMMKERNEL) && defined(LEFT) +#if defined(TRMMKERNEL) && defined(LEFT) ADDQ $1, kk; #endif ADDQ $2*SIZE, C0; @@ -3698,7 +3698,7 @@ MOVQ %rax, kkk; MOVQ kk, %rax; #ifdef LEFT ADDQ $8, %rax; -#else +#else ADDQ $1, %rax; #endif MOVQ %rax, kkk; @@ -3913,7 +3913,7 @@ SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 8), ptrba; ADDQ %rax, ptrbb; #endif -#if defined(TRMMKERNEL) && defined(LEFT) +#if defined(TRMMKERNEL) && defined(LEFT) ADDQ $8, kk; #endif ADDQ $16*SIZE, C0; @@ -3945,7 +3945,7 @@ MOVQ %rax, kkk; MOVQ kk, %rax; #ifdef LEFT ADDQ $4, %rax; -#else +#else ADDQ $1, %rax; #endif MOVQ %rax, kkk; @@ -4098,7 +4098,7 @@ SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 4), ptrba; ADDQ %rax, ptrbb; #endif -#if defined(TRMMKERNEL) && defined(LEFT) +#if defined(TRMMKERNEL) && defined(LEFT) ADDQ $4, kk; #endif ADDQ $8*SIZE, C0; @@ -4128,7 +4128,7 @@ MOVQ %rax, kkk; MOVQ kk, %rax; #ifdef LEFT ADDQ $2, %rax; -#else +#else ADDQ $1, %rax; #endif MOVQ %rax, kkk; @@ -4270,7 +4270,7 @@ SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 2), ptrba; ADDQ %rax, ptrbb; #endif -#if defined(TRMMKERNEL) && defined(LEFT) +#if defined(TRMMKERNEL) && defined(LEFT) ADDQ $2, kk; #endif ADDQ $4*SIZE, C0; @@ -4300,7 +4300,7 @@ MOVQ %rax, kkk; MOVQ kk, %rax; #ifdef LEFT ADDQ $1, %rax; -#else +#else ADDQ $1, %rax; #endif MOVQ %rax, kkk; @@ -4413,7 +4413,7 @@ SALQ $ZBASE_SHIFT, %rax; ADDQ %rax, ptrba; ADDQ %rax, ptrbb; #endif -#if defined(TRMMKERNEL) && defined(LEFT) +#if defined(TRMMKERNEL) && defined(LEFT) ADDQ $1, kk; #endif ADDQ $2*SIZE, C0; diff --git a/kernel/x86_64/cgemv_n.S b/kernel/x86_64/cgemv_n.S index 64967d4bf..206beb673 100644 --- a/kernel/x86_64/cgemv_n.S +++ b/kernel/x86_64/cgemv_n.S @@ -48,7 +48,7 @@ #ifndef WINDOWS_ABI #define STACKSIZE 128 - + #define OLD_INCX 8 + STACKSIZE(%rsp) #define OLD_Y 16 + STACKSIZE(%rsp) #define OLD_INCY 24 + STACKSIZE(%rsp) @@ -75,7 +75,7 @@ #else #define STACKSIZE 288 - + #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_LDA 56 + STACKSIZE(%rsp) @@ -207,7 +207,7 @@ ALIGN_3 subq $-32 * SIZE, A - + movq BUFFER, Y1 pxor %xmm4, %xmm4 @@ -281,7 +281,7 @@ pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 - + pshufd $0x00, %xmm9, %xmm8 pshufd $0x55, %xmm9, %xmm9 pshufd $0x00, %xmm11, %xmm10 @@ -875,7 +875,7 @@ pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 - + pshufd $0x00, %xmm13, %xmm12 pshufd $0x55, %xmm13, %xmm13 pshufd $0x00, %xmm15, %xmm14 @@ -926,7 +926,7 @@ pshufd $0xb1, %xmm4, %xmm5 pshufd $0xb1, %xmm6, %xmm7 - + mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 @@ -1134,7 +1134,7 @@ pshufd $0xb1, %xmm6, %xmm7 pshufd $0xb1, %xmm8, %xmm9 pshufd $0xb1, %xmm10, %xmm11 - + mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm12, %xmm6 @@ -1206,7 +1206,7 @@ pshufd $0xb1, %xmm4, %xmm5 pshufd $0xb1, %xmm6, %xmm7 - + mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 @@ -1244,12 +1244,12 @@ movsd ALPHA, %xmm8 unpcklpd %xmm8, %xmm8 #endif - + pshufd $0xb1, %xmm8, %xmm9 pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 - + pshufd $0x00, %xmm13, %xmm12 pshufd $0x55, %xmm13, %xmm13 @@ -1285,7 +1285,7 @@ movsd -32 * SIZE(Y1), %xmm0 pshufd $0xb1, %xmm4, %xmm5 - + mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 @@ -1449,7 +1449,7 @@ MOVUPS_A1(-32 * SIZE, A1, %xmm4) pshufd $0xb1, %xmm4, %xmm5 - + mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 @@ -1469,7 +1469,7 @@ movsd -32 * SIZE(A1), %xmm4 pshufd $0xb1, %xmm4, %xmm5 - + mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 @@ -1515,7 +1515,7 @@ pcmpeqb %xmm7, %xmm7 psllq $63, %xmm7 - + pshufd $0x00, %xmm9, %xmm8 pshufd $0x55, %xmm9, %xmm9 pshufd $0x00, %xmm11, %xmm10 @@ -2130,7 +2130,7 @@ pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 - + pshufd $0x00, %xmm13, %xmm12 pshufd $0x55, %xmm13, %xmm13 pshufd $0x00, %xmm15, %xmm14 @@ -2181,7 +2181,7 @@ pshufd $0xb1, %xmm4, %xmm5 pshufd $0xb1, %xmm6, %xmm7 - + mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 @@ -2399,7 +2399,7 @@ pshufd $0xb1, %xmm6, %xmm7 pshufd $0xb1, %xmm8, %xmm9 pshufd $0xb1, %xmm10, %xmm11 - + mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm12, %xmm6 @@ -2472,7 +2472,7 @@ pshufd $0xb1, %xmm4, %xmm5 pshufd $0xb1, %xmm6, %xmm7 - + mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 @@ -2512,12 +2512,12 @@ movsd ALPHA, %xmm8 unpcklpd %xmm8, %xmm8 #endif - + pshufd $0xb1, %xmm8, %xmm9 pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 - + pshufd $0x00, %xmm13, %xmm12 pshufd $0x55, %xmm13, %xmm13 @@ -2553,7 +2553,7 @@ movsd -32 * SIZE(Y1), %xmm0 pshufd $0xb1, %xmm4, %xmm5 - + mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 @@ -2717,7 +2717,7 @@ MOVUPS_A1(-32 * SIZE, A1, %xmm4) pshufd $0xb1, %xmm4, %xmm5 - + mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 @@ -2737,7 +2737,7 @@ movsd -32 * SIZE(A1), %xmm4 pshufd $0xb1, %xmm4, %xmm5 - + mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 @@ -2780,7 +2780,7 @@ pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 - + pshufd $0x00, %xmm13, %xmm12 pshufd $0x55, %xmm13, %xmm13 pshufd $0x00, %xmm15, %xmm14 @@ -2831,7 +2831,7 @@ pshufd $0xb1, %xmm4, %xmm5 pshufd $0xb1, %xmm6, %xmm7 - + mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 @@ -3165,7 +3165,7 @@ pshufd $0xb1, %xmm4, %xmm5 pshufd $0xb1, %xmm6, %xmm7 - + mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 @@ -3200,12 +3200,12 @@ movsd ALPHA, %xmm8 unpcklpd %xmm8, %xmm8 #endif - + pshufd $0xb1, %xmm8, %xmm9 pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 - + pshufd $0x00, %xmm13, %xmm12 pshufd $0x55, %xmm13, %xmm13 @@ -3241,7 +3241,7 @@ movsd -32 * SIZE(Y1), %xmm0 pshufd $0xb1, %xmm4, %xmm5 - + mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 @@ -3454,7 +3454,7 @@ movsd -32 * SIZE(A1), %xmm4 pshufd $0xb1, %xmm4, %xmm5 - + mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 @@ -3493,7 +3493,7 @@ pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 - + pshufd $0x00, %xmm13, %xmm12 pshufd $0x55, %xmm13, %xmm13 pshufd $0x00, %xmm15, %xmm14 @@ -3544,7 +3544,7 @@ pshufd $0xb1, %xmm4, %xmm5 pshufd $0xb1, %xmm6, %xmm7 - + mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 @@ -3878,7 +3878,7 @@ pshufd $0xb1, %xmm4, %xmm5 pshufd $0xb1, %xmm6, %xmm7 - + mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 @@ -3913,12 +3913,12 @@ movsd ALPHA, %xmm8 unpcklpd %xmm8, %xmm8 #endif - + pshufd $0xb1, %xmm8, %xmm9 pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 - + pshufd $0x00, %xmm13, %xmm12 pshufd $0x55, %xmm13, %xmm13 @@ -3954,7 +3954,7 @@ movsd -32 * SIZE(Y1), %xmm0 pshufd $0xb1, %xmm4, %xmm5 - + mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 @@ -4167,7 +4167,7 @@ movsd -32 * SIZE(A1), %xmm4 pshufd $0xb1, %xmm4, %xmm5 - + mulps %xmm12, %xmm4 addps %xmm4, %xmm0 mulps %xmm13, %xmm5 diff --git a/kernel/x86_64/cgemv_t.S b/kernel/x86_64/cgemv_t.S index 49fc0eb36..430586bba 100644 --- a/kernel/x86_64/cgemv_t.S +++ b/kernel/x86_64/cgemv_t.S @@ -48,7 +48,7 @@ #ifndef WINDOWS_ABI #define STACKSIZE 128 - + #define OLD_INCX 8 + STACKSIZE(%rsp) #define OLD_Y 16 + STACKSIZE(%rsp) #define OLD_INCY 24 + STACKSIZE(%rsp) @@ -60,7 +60,7 @@ #define LDAX 88(%rsp) #define ALPHAR 96(%rsp) #define ALPHAI 104(%rsp) - + #define M %rdi #define N %rsi #define A %rcx @@ -73,7 +73,7 @@ #else #define STACKSIZE 288 - + #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_LDA 56 + STACKSIZE(%rsp) @@ -202,7 +202,7 @@ jle .L999 subq $-32 * SIZE, A - + movq BUFFER, X1 #ifdef ALIGNED_ACCESS @@ -893,7 +893,7 @@ movaps %xmm4, %xmm6 shufps $0x88, %xmm5, %xmm4 shufps $0xdd, %xmm5, %xmm6 - + addps %xmm2, %xmm0 addps %xmm6, %xmm4 #endif @@ -1306,7 +1306,7 @@ movaps %xmm0, %xmm2 shufps $0x88, %xmm1, %xmm0 shufps $0xdd, %xmm1, %xmm2 - + addps %xmm2, %xmm0 #endif @@ -1576,7 +1576,7 @@ movaps %xmm0, %xmm2 shufps $0x88, %xmm1, %xmm0 shufps $0xdd, %xmm1, %xmm2 - + addps %xmm2, %xmm0 #endif @@ -2227,7 +2227,7 @@ movaps %xmm4, %xmm6 shufps $0x88, %xmm5, %xmm4 shufps $0xdd, %xmm5, %xmm6 - + addps %xmm2, %xmm0 addps %xmm6, %xmm4 #endif @@ -2653,7 +2653,7 @@ movaps %xmm0, %xmm2 shufps $0x88, %xmm1, %xmm0 shufps $0xdd, %xmm1, %xmm2 - + addps %xmm2, %xmm0 #endif @@ -2923,7 +2923,7 @@ movaps %xmm0, %xmm2 shufps $0x88, %xmm1, %xmm0 shufps $0xdd, %xmm1, %xmm2 - + addps %xmm2, %xmm0 #endif @@ -3365,7 +3365,7 @@ movaps %xmm0, %xmm2 shufps $0x88, %xmm1, %xmm0 shufps $0xdd, %xmm1, %xmm2 - + addps %xmm2, %xmm0 #endif @@ -3650,7 +3650,7 @@ movaps %xmm0, %xmm2 shufps $0x88, %xmm1, %xmm0 shufps $0xdd, %xmm1, %xmm2 - + addps %xmm2, %xmm0 #endif @@ -4087,7 +4087,7 @@ movaps %xmm0, %xmm2 shufps $0x88, %xmm1, %xmm0 shufps $0xdd, %xmm1, %xmm2 - + addps %xmm2, %xmm0 #endif @@ -4372,7 +4372,7 @@ movaps %xmm0, %xmm2 shufps $0x88, %xmm1, %xmm0 shufps $0xdd, %xmm1, %xmm2 - + addps %xmm2, %xmm0 #endif diff --git a/kernel/x86_64/copy.S b/kernel/x86_64/copy.S index bb66d1019..5729b2956 100644 --- a/kernel/x86_64/copy.S +++ b/kernel/x86_64/copy.S @@ -50,18 +50,18 @@ #define INCY %r10 #define FLAG %r11 #endif - + #include "l1param.h" - + PROLOGUE PROFCODE - + #ifdef WINDOWS_ABI movq 40(%rsp), INCY #endif EMMS - + testq N, N # if m == 0 goto End jle .L999 @@ -363,4 +363,4 @@ ret EPILOGUE - + diff --git a/kernel/x86_64/daxpy_bulldozer.S b/kernel/x86_64/daxpy_bulldozer.S index dfc10e80f..799dad02d 100644 --- a/kernel/x86_64/daxpy_bulldozer.S +++ b/kernel/x86_64/daxpy_bulldozer.S @@ -59,7 +59,7 @@ #define A_PRE 640 #include "l1param.h" - + PROLOGUE PROFCODE @@ -88,7 +88,7 @@ testq M, M jle .L47 - + cmpq $SIZE, INCX jne .L40 cmpq $SIZE, INCY @@ -290,7 +290,7 @@ je .L46 cmpq $0, INCY je .L46 - + sarq $3, %rax jle .L45 diff --git a/kernel/x86_64/ddot_bulldozer.S b/kernel/x86_64/ddot_bulldozer.S index 503ec60cf..61c757116 100644 --- a/kernel/x86_64/ddot_bulldozer.S +++ b/kernel/x86_64/ddot_bulldozer.S @@ -62,8 +62,8 @@ SAVEREGISTERS - leaq (, INCX, SIZE), INCX - leaq (, INCY, SIZE), INCY + leaq (, INCX, SIZE), INCX + leaq (, INCY, SIZE), INCY vxorps %xmm0, %xmm0 , %xmm0 vxorps %xmm1, %xmm1 , %xmm1 diff --git a/kernel/x86_64/dgemm_kernel_4x8_sandy.S b/kernel/x86_64/dgemm_kernel_4x8_sandy.S index 3b1b2560e..e86d30625 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_sandy.S +++ b/kernel/x86_64/dgemm_kernel_4x8_sandy.S @@ -13,19 +13,19 @@ notice, this list of conditions and the following disclaimer. notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -3. Neither the name of the ISCAS nor the names of its contributors may -be used to endorse or promote products derived from this software +3. Neither the name of the ISCAS nor the names of its contributors may +be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ @@ -138,10 +138,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define TEST testq #define OR orq #define JNE jne -#define NOP +#define NOP #define XOR xorpd #undef MOVQ -#define MOVQ movq +#define MOVQ movq #define XOR_DY vxorpd #define XOR_DX vxorpd @@ -215,7 +215,7 @@ movq %r15, 40(%rsp); movq ARG1, old_bm movq ARG2, old_bn movq ARG3, old_bk - movq OLD_A, ba + movq OLD_A, ba movq OLD_B, bb movq OLD_C, C movq old_ldc, ldc @@ -269,7 +269,7 @@ ALIGN_5; .L1_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; -#else +#else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (, %rax, SIZE), %rax; @@ -305,7 +305,7 @@ MOVQ %rax, kkk; MOVQ kk, %rax; #ifdef LEFT ADDQ $8, %rax; -#else +#else ADDQ $4, %rax; #endif MOVQ %rax, kkk; @@ -316,7 +316,7 @@ ALIGN_5; .L2_bodyB:; # Computing kernel -#### Unroll times 1 #### +#### Unroll times 1 #### LD_DY 4*SIZE(ptrba), yvec1; MUL_DY yvec0, yvec2, yvec6; SHUF_DY $0x03, yvec2, yvec2, yvec4; @@ -372,7 +372,7 @@ MUL_DY yvec1, yvec5, yvec7; ADD_DY yvec10, yvec6, yvec10; ADD_DY yvec8, yvec7, yvec8; -#### Unroll times 3 #### +#### Unroll times 3 #### LD_DY 20*SIZE(ptrba), yvec1; MUL_DY yvec0, yvec2, yvec6; SHUF_DY $0x03, yvec2, yvec2, yvec4; @@ -438,14 +438,14 @@ PREFETCH2 0*SIZE(prebb); ADDQ $8*SIZE, prebb; #ifndef TRMMKERNEL TEST $2, bk; -#else +#else MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L3_loopE; ALIGN_5 .L3_bodyB: -#### Unroll times 1 #### +#### Unroll times 1 #### PREFETCH0 64*SIZE(ptrba) LD_DY 4*SIZE(ptrba), yvec1; MUL_DY yvec0, yvec2, yvec6; @@ -508,14 +508,14 @@ PREFETCH2 0*SIZE(prebb); ADDQ $8*SIZE, prebb #ifndef TRMMKERNEL TEST $1, bk; -#else +#else MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L4_loopE; ALIGN_5 .L4_bodyB:; -#### Unroll times 1 #### +#### Unroll times 1 #### PREFETCH0 64*SIZE(ptrba) LD_DY 4*SIZE(ptrba), yvec1; MUL_DY yvec0, yvec2, yvec6; @@ -767,8 +767,8 @@ JLE .L5_loopE; ALIGN_5 .L5_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) -MOVQ bb, ptrbb; -#else +MOVQ bb, ptrbb; +#else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (, %rax, SIZE), %rax; @@ -793,7 +793,7 @@ MOVQ %rax, kkk; MOVQ kk, %rax; #ifdef LEFT ADDQ $4, %rax; -#else +#else ADDQ $4, %rax; #endif MOVQ %rax, kkk; @@ -919,7 +919,7 @@ ADD_DY yvec9, yvec7, yvec9; .L7_loopE:; #ifndef TRMMKERNEL TEST $1, bk -#else +#else MOVQ kkk, %rax; TEST $1, %rax; #endif @@ -1067,8 +1067,8 @@ JLE .L9_loopE; ALIGN_5 .L9_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) -MOVQ bb, ptrbb; -#else +MOVQ bb, ptrbb; +#else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (, %rax, SIZE), %rax; @@ -1090,7 +1090,7 @@ MOVQ bk, k; MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; -#else +#else MOVQ kk, %rax; #ifdef LEFT ADDQ $2, %rax; @@ -1103,7 +1103,7 @@ SARQ $2, k; JLE .L10_loopE; ALIGN_5; .L10_bodyB:; -# Computing kernel +# Computing kernel ##### Unroll time 1 #### LD_DX 4*SIZE(ptrbb), xvec6; @@ -1180,7 +1180,7 @@ ALIGN_5 .L10_loopE:; #ifndef TRMMKERNEL TEST $2, bk -#else +#else MOVQ kkk, %rax; TEST $2, %rax; #endif @@ -1337,7 +1337,7 @@ ALIGN_5 .L13_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; -#else +#else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (,%rax, SIZE), %rax; @@ -1356,7 +1356,7 @@ MOVQ %rax, kkk; MOVQ kk, %rax; #ifdef LEFT ADDQ $1, %rax; -#else +#else ADDQ $4, %rax; #endif MOVQ %rax, kkk; @@ -1413,7 +1413,7 @@ ADDQ $8*SIZE, ptrbb; .L15_loopE:; #ifndef TRMMKERNEL TEST $1, bk; -#else +#else MOVQ kkk, %rax; TEST $1, %rax; #endif @@ -1428,7 +1428,7 @@ ADDQ $1*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L16_loopE: -#### Load Alpha #### +#### Load Alpha #### BROAD_DY MEMALPHA, yvec7; #### Multiply Alpha #### MUL_DY yvec15, yvec7, yvec15; @@ -1489,7 +1489,7 @@ ALIGN_5; .L21_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; -#else +#else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (, %rax, SIZE), %rax; @@ -1511,11 +1511,11 @@ MOVQ bk, k; MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; -#else +#else MOVQ kk, %rax; #ifdef LEFT ADDQ $8, %rax; -#else +#else ADDQ $2, %rax; #endif MOVQ %rax, kkk; @@ -1524,7 +1524,7 @@ SARQ $2, k; JLE .L211_loopE; ALIGN_5; .L211_bodyB: -# Computing kernel +# Computing kernel #### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; @@ -1680,14 +1680,14 @@ ALIGN_5 .L211_loopE: #ifndef TRMMKERNEL TEST $2, bk; -#else +#else MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L212_loopE; ALIGN_5; .L212_bodyB: -# Computing kernel +# Computing kernel #### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; @@ -1767,7 +1767,7 @@ ADD_DX xvec7, xvec8, xvec8; .L212_loopE: #ifndef TRMMKERNEL TEST $1, bk; -#else +#else MOVQ kkk, %rax; TEST $1, %rax; #endif @@ -1944,7 +1944,7 @@ ALIGN_5; .L22_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; -#else +#else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (,%rax, SIZE), %rax; @@ -1962,11 +1962,11 @@ MOVQ bk, k; MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; -#else +#else MOVQ kk, %rax; #ifdef LEFT ADDQ $4, %rax; -#else +#else ADDQ $2, %rax; #endif MOVQ %rax, kkk; @@ -1975,7 +1975,7 @@ SARQ $2, k; JLE .L221_loopE; ALIGN_5 .L221_bodyB:; -# Computing kernel +# Computing kernel #### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; @@ -2059,7 +2059,7 @@ ALIGN_5 .L221_loopE:; #ifndef TRMMKERNEL TEST $2, bk; -#else +#else MOVQ kkk, %rax; TEST $2, %rax; #endif @@ -2108,7 +2108,7 @@ ADD_DX xvec5, xvec10, xvec10; .L222_loopE: #ifndef TRMMKERNEL TEST $1, bk -#else +#else MOVQ kkk, %rax; TEST $1, %rax; #endif @@ -2225,7 +2225,7 @@ ALIGN_5; .L23_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; -#else +#else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (,%rax, SIZE), %rax; @@ -2240,11 +2240,11 @@ MOVQ bk, k; MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; -#else +#else MOVQ kk, %rax; #ifdef LEFT ADDQ $2, %rax; -#else +#else ADDQ $2, %rax; #endif MOVQ %rax, kkk; @@ -2253,7 +2253,7 @@ SARQ $2, k; JLE .L231_loopE; ALIGN_5 .L231_bodyB: -# Computing kernel +# Computing kernel #### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; @@ -2297,7 +2297,7 @@ ALIGN_5 .L231_loopE: #ifndef TRMMKERNEL TEST $2, bk; -#else +#else MOVQ kkk, %rax; TEST $2, %rax; #endif @@ -2326,7 +2326,7 @@ ADDQ $4*SIZE, ptrbb; .L232_loopE: #ifndef TRMMKERNEL TEST $1, bk; -#else +#else MOVQ kkk, %rax; TEST $1, %rax; #endif @@ -2413,7 +2413,7 @@ ALIGN_5; .L24_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; -#else +#else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (, %rax, SIZE), %rax; @@ -2427,13 +2427,13 @@ MOVQ bk, k; MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; -#else +#else MOVQ kk, %rax; #ifdef LEFT ADDQ $1, %rax; -#else +#else ADDQ $2, %rax; -#endif +#endif MOVQ %rax, kkk; #endif SARQ $2, k; @@ -2467,7 +2467,7 @@ ALIGN_5 .L241_loopE: #ifndef TRMMKERNEL TEST $2, bk; -#else +#else MOVQ kkk, %rax; TEST $2, %rax; #endif @@ -2488,7 +2488,7 @@ ADDQ $4*SIZE, ptrbb; .L242_loopE: #ifndef TRMMKERNEL TEST $1, bk; -#else +#else MOVQ kkk, %rax; TEST $1, %rax; #endif @@ -2550,7 +2550,7 @@ ALIGN_5 .L31_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; -#else +#else MOVQ bb, ptrbb; MOVQ kk, %rax LEAQ (, %rax, SIZE), %rax; @@ -2566,11 +2566,11 @@ MOVQ bk, k; MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; -#else +#else MOVQ kk, %rax; #ifdef LEFT ADDQ $8, %rax; -#else +#else ADDQ $1, %rax; #endif MOVQ %rax, kkk; @@ -2622,7 +2622,7 @@ ALIGN_5 .L311_loopE: #ifndef TRMMKERNEL TEST $2, bk; -#else +#else MOVQ kkk, %rax; TEST $2, %rax; #endif @@ -2769,11 +2769,11 @@ MOVQ bk, k; MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; -#else +#else MOVQ kk, %rax; #ifdef LEFT ADDQ $4, %rax; -#else +#else ADDQ $1, %rax; #endif MOVQ %rax, kkk @@ -2809,7 +2809,7 @@ ALIGN_5 .L321_loopE: #ifndef TRMMKERNEL TEST $2, bk; -#else +#else MOVQ kkk, %rax; TEST $2, %rax; #endif @@ -2831,7 +2831,7 @@ ADDQ $2*SIZE, ptrbb; .L322_loopE: #ifndef TRMMKERNEL TEST $1, bk; -#else +#else MOVQ kkk, %rax; TEST $1, %rax; #endif @@ -2909,13 +2909,13 @@ ALIGN_5 .L33_bodyB: #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bb, ptrbb; -#else +#else MOVQ bb, ptrbb; MOVQ kk, %rax LEAQ (, %rax, SIZE), %rax LEAQ (ptrba, %rax, 2), ptrba ADDQ %rax, ptrbb; -#endif +#endif #### Initial Result #### XOR_DY yvec15, yvec15, yvec15; #ifndef TRMMKERNEL @@ -2924,7 +2924,7 @@ MOVQ bk, k; MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; -#else +#else MOVQ kk, %rax; #ifdef LEFT ADDQ $2, %rax; @@ -2964,7 +2964,7 @@ ALIGN_5 .L331_loopE: #ifndef TRMMKERNEL TEST $2,bk; -#else +#else MOVQ kkk, %rax; TEST $2, %rax #endif @@ -2985,7 +2985,7 @@ ADDQ $2*SIZE, ptrbb; .L332_loopE: #ifndef TRMMKERNEL TEST $1, bk; -#else +#else MOVQ kkk, %rax; TEST $1, %rax; #endif @@ -3025,9 +3025,9 @@ TEST $1, bm JLE .L34_loopE; ALIGN_5 .L34_bodyB: -#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bb, ptrbb; -#else +#else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (, %rax, SIZE), %rax; @@ -3041,7 +3041,7 @@ MOVQ bk, k; MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; -#else +#else MOVQ kk, %rax; #ifdef LEFT ADDQ $1, %rax; @@ -3081,7 +3081,7 @@ ALIGN_5 .L341_loopE: #ifndef TRMMKERNEL TEST $2, bk; -#else +#else MOVQ kkk, %rax; TEST $2, %rax; #endif diff --git a/kernel/x86_64/dgemm_kernel_6x4_piledriver.S b/kernel/x86_64/dgemm_kernel_6x4_piledriver.S index 7b5dd1587..66779648b 100644 --- a/kernel/x86_64/dgemm_kernel_6x4_piledriver.S +++ b/kernel/x86_64/dgemm_kernel_6x4_piledriver.S @@ -12,27 +12,27 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the OpenBLAS project nor the names of - its contributors may be used to endorse or promote products + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ // register blocking= 6x4. unloop k = 4. -// Use FMA3 on piledriver. +// Use FMA3 on piledriver. // Todo: 1) deal with the edge. 2) Add windows abi. - + #define ASSEMBLER #include "common.h" @@ -89,7 +89,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SALQ1280(imm,n) salq imm,n #define JG jg -#define JLE jle +#define JLE jle #define VLD2560(addr,reg) vmovapd addr,reg #define VST2560(reg,addr) vmovapd reg,addr @@ -168,7 +168,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define imm1 $0x05 #define imm3 $0x05 #define imm100 $0x05 -#define imm200 $0x0a +#define imm200 $0x0a #define XMM0 %xmm0 #define XMM1 %xmm1 diff --git a/kernel/x86_64/dgemm_ncopy_2.S b/kernel/x86_64/dgemm_ncopy_2.S index e4bde49bd..532fddf75 100644 --- a/kernel/x86_64/dgemm_ncopy_2.S +++ b/kernel/x86_64/dgemm_ncopy_2.S @@ -87,7 +87,7 @@ PROLOGUE PROFCODE - + #ifdef WINDOWS_ABI pushq %r15 pushq %r14 diff --git a/kernel/x86_64/dgemm_ncopy_4.S b/kernel/x86_64/dgemm_ncopy_4.S index 1e4431664..41eac9597 100644 --- a/kernel/x86_64/dgemm_ncopy_4.S +++ b/kernel/x86_64/dgemm_ncopy_4.S @@ -107,7 +107,7 @@ PROLOGUE PROFCODE - + #ifdef WINDOWS_ABI pushq %r15 pushq %r14 diff --git a/kernel/x86_64/dgemm_ncopy_8.S b/kernel/x86_64/dgemm_ncopy_8.S index f35c3c5af..7600c9a56 100644 --- a/kernel/x86_64/dgemm_ncopy_8.S +++ b/kernel/x86_64/dgemm_ncopy_8.S @@ -93,7 +93,7 @@ PROLOGUE PROFCODE - + #ifdef WINDOWS_ABI pushq %r15 pushq %r14 diff --git a/kernel/x86_64/dgemm_ncopy_8_bulldozer.S b/kernel/x86_64/dgemm_ncopy_8_bulldozer.S index 1b934f6bb..43f9cd2e3 100644 --- a/kernel/x86_64/dgemm_ncopy_8_bulldozer.S +++ b/kernel/x86_64/dgemm_ncopy_8_bulldozer.S @@ -81,7 +81,7 @@ PROLOGUE PROFCODE - + #ifdef WINDOWS_ABI pushq %r15 pushq %r14 diff --git a/kernel/x86_64/dgemm_tcopy_2.S b/kernel/x86_64/dgemm_tcopy_2.S index b0b3590aa..9881610d7 100644 --- a/kernel/x86_64/dgemm_tcopy_2.S +++ b/kernel/x86_64/dgemm_tcopy_2.S @@ -114,7 +114,7 @@ PROLOGUE PROFCODE - + #ifdef WINDOWS_ABI pushq %rdi pushq %rsi @@ -326,7 +326,7 @@ movlpd %xmm0, -16 * SIZE(B3) ALIGN_4 - + .L999: popq %rbp popq %r13 diff --git a/kernel/x86_64/dgemm_tcopy_4.S b/kernel/x86_64/dgemm_tcopy_4.S index 85b0253d7..98ba6473c 100644 --- a/kernel/x86_64/dgemm_tcopy_4.S +++ b/kernel/x86_64/dgemm_tcopy_4.S @@ -116,7 +116,7 @@ PROLOGUE PROFCODE - + #ifdef WINDOWS_ABI pushq %rdi pushq %rsi @@ -505,7 +505,7 @@ movlpd %xmm0, -16 * SIZE(B3) jmp .L999 ALIGN_4 - + .L999: popq %rbp popq %r12 diff --git a/kernel/x86_64/dgemm_tcopy_8.S b/kernel/x86_64/dgemm_tcopy_8.S index 3d411cda5..db97db7f1 100644 --- a/kernel/x86_64/dgemm_tcopy_8.S +++ b/kernel/x86_64/dgemm_tcopy_8.S @@ -99,7 +99,7 @@ PROLOGUE PROFCODE - + #ifdef WINDOWS_ABI pushq %rdi pushq %rsi @@ -770,7 +770,7 @@ movlpd %xmm0, -16 * SIZE(B3) jmp .L999 ALIGN_4 - + .L999: popq %rbp popq %r12 diff --git a/kernel/x86_64/dgemm_tcopy_8_bulldozer.S b/kernel/x86_64/dgemm_tcopy_8_bulldozer.S index d7fc416d9..a9dd25389 100644 --- a/kernel/x86_64/dgemm_tcopy_8_bulldozer.S +++ b/kernel/x86_64/dgemm_tcopy_8_bulldozer.S @@ -82,7 +82,7 @@ PROLOGUE PROFCODE - + #ifdef WINDOWS_ABI pushq %rdi pushq %rsi @@ -650,7 +650,7 @@ vmovsd %xmm0, -16 * SIZE(B3) jmp .L999 ALIGN_4 - + .L999: popq %rbp popq %r12 diff --git a/kernel/x86_64/dgemv_n.S b/kernel/x86_64/dgemv_n.S index 5f4c40467..58dd43bbb 100644 --- a/kernel/x86_64/dgemv_n.S +++ b/kernel/x86_64/dgemv_n.S @@ -48,7 +48,7 @@ #ifndef WINDOWS_ABI #define STACKSIZE 128 - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_A %rcx @@ -67,7 +67,7 @@ #else #define STACKSIZE 288 - + #define OLD_M %rcx #define OLD_N %rdx #define OLD_A 40 + STACKSIZE(%rsp) @@ -113,7 +113,7 @@ #define TMP_M %r15 #define Y2 %rbx - + PROLOGUE PROFCODE @@ -176,7 +176,7 @@ addq M, I jle .L999x movq I, M - + .L00t: movq XX,X movq AA,A @@ -203,7 +203,7 @@ testq $SIZE, A cmoveq M, MM #endif - + testq N, N # if n <= 0 goto END jle .L999 testq M, M # if n <= 0 goto END @@ -221,7 +221,7 @@ #endif movq BUFFER, Y1 - + pxor %xmm4, %xmm4 movq M, %rax @@ -316,7 +316,7 @@ movsd ALPHA, %xmm0 unpcklpd %xmm0, %xmm0 #endif - + mulpd %xmm0, %xmm8 mulpd %xmm0, %xmm9 mulpd %xmm0, %xmm10 @@ -875,7 +875,7 @@ movsd ALPHA, %xmm0 unpcklpd %xmm0, %xmm0 #endif - + mulpd %xmm0, %xmm12 mulpd %xmm0, %xmm13 mulpd %xmm0, %xmm14 @@ -1409,7 +1409,7 @@ .L36: testq $2, MM je .L37 - + MOVUPS_A1(-16 * SIZE, A1, %xmm8) MOVUPS_A1(-16 * SIZE, A2, %xmm9) @@ -1675,7 +1675,7 @@ movsd ALPHA, %xmm0 unpcklpd %xmm0, %xmm0 #endif - + mulpd %xmm0, %xmm12 mulpd %xmm0, %xmm13 mulpd %xmm0, %xmm14 @@ -2241,7 +2241,7 @@ .L66: testq $2, MM je .L67 - + MOVUPS_A1(-16 * SIZE, A1, %xmm4) MOVUPS_A1(-15 * SIZE, A2, %xmm5) @@ -2469,7 +2469,7 @@ #endif movq M, TMP_M movq Y, Y1 - + cmpq $SIZE, INCY jne .L950 @@ -2702,7 +2702,7 @@ jmp .L999 ALIGN_4 -.L950: +.L950: testq $SIZE, BUFFER je .L960 diff --git a/kernel/x86_64/dgemv_n_atom.S b/kernel/x86_64/dgemv_n_atom.S index 27a763a6b..ed6a58579 100644 --- a/kernel/x86_64/dgemv_n_atom.S +++ b/kernel/x86_64/dgemv_n_atom.S @@ -47,7 +47,7 @@ #ifndef WINDOWS_ABI #define STACKSIZE 64 - + #define OLD_INCX 8 + STACKSIZE(%rsp) #define OLD_Y 16 + STACKSIZE(%rsp) #define OLD_INCY 24 + STACKSIZE(%rsp) @@ -66,7 +66,7 @@ #else #define STACKSIZE 256 - + #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_LDA 48 + STACKSIZE(%rsp) #define OLD_X 56 + STACKSIZE(%rsp) @@ -87,7 +87,7 @@ #define INCY %r10 #endif - + #define I %rax #define J %r11 #define A1 %r12 @@ -95,7 +95,7 @@ #define Y1 %r14 #define BUFFER %r15 #define MM %rbx - + #define ALPHA %xmm15 PROLOGUE diff --git a/kernel/x86_64/dgemv_n_bulldozer.S b/kernel/x86_64/dgemv_n_bulldozer.S index ef2c4e23f..bc00d67fc 100644 --- a/kernel/x86_64/dgemv_n_bulldozer.S +++ b/kernel/x86_64/dgemv_n_bulldozer.S @@ -57,7 +57,7 @@ #ifndef WINDOWS_ABI #define STACKSIZE 64 - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_A %rcx @@ -71,7 +71,7 @@ #else #define STACKSIZE 256 - + #define OLD_M %rcx #define OLD_N %rdx #define OLD_A 40 + STACKSIZE(%rsp) @@ -172,7 +172,7 @@ testq $SIZE, A cmoveq M, MM #endif - + testq N, N # if n <= 0 goto END jle .L999 testq M, M # if n <= 0 goto END @@ -190,7 +190,7 @@ #endif movq BUFFER, Y1 - + vxorpd %xmm4, %xmm4, %xmm4 movq M, %rax @@ -255,7 +255,7 @@ addq INCX, X vmovddup ALPHA, %xmm0 - + vmulpd %xmm0, %xmm8 , %xmm8 vmulpd %xmm0, %xmm9 , %xmm9 vmulpd %xmm0, %xmm10 , %xmm10 @@ -561,7 +561,7 @@ vfmaddsd %xmm0 , %xmm14, %xmm6 , %xmm0 vfmaddsd %xmm0 , %xmm15, %xmm7 , %xmm0 - + vmovsd %xmm0, -16 * SIZE(Y1) ALIGN_3 @@ -1035,7 +1035,7 @@ .L36: testq $2, MM je .L37 - + VMOVUPS_A1(-16 * SIZE, A1, %xmm8) VMOVUPS_A1(-16 * SIZE, A2, %xmm9) @@ -1255,7 +1255,7 @@ addq INCX, X vmovddup ALPHA, %xmm0 - + vmulpd %xmm0, %xmm12 , %xmm12 vmulpd %xmm0, %xmm13 , %xmm13 vmulpd %xmm0, %xmm14 , %xmm14 @@ -1306,7 +1306,7 @@ .L53: - + vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0 VMOVUPS_A1(-10 * SIZE, A1, %xmm7) @@ -1724,7 +1724,7 @@ .L66: testq $2, MM je .L67 - + VMOVUPS_A1(-16 * SIZE, A1, %xmm4) VMOVUPS_A1(-15 * SIZE, A2, %xmm5) @@ -2142,7 +2142,7 @@ jmp .L999 ALIGN_4 -.L950: +.L950: testq $SIZE, BUFFER je .L960 diff --git a/kernel/x86_64/dgemv_t.S b/kernel/x86_64/dgemv_t.S index 3d132c3b5..927777416 100644 --- a/kernel/x86_64/dgemv_t.S +++ b/kernel/x86_64/dgemv_t.S @@ -48,7 +48,7 @@ #ifndef WINDOWS_ABI #define STACKSIZE 128 - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_A %rcx @@ -64,7 +64,7 @@ #else #define STACKSIZE 256 - + #define OLD_M %rcx #define OLD_N %rdx #define OLD_A 40 + STACKSIZE(%rsp) @@ -180,7 +180,7 @@ jle .L999x movq %rax,M -.L00: +.L00: movq LDAX,LDA movq NN,N movq AA,A @@ -205,7 +205,7 @@ jle .L999 movq BUFFER, X1 - + #ifdef ALIGNED_ACCESS testq $SIZE, A je .L01 diff --git a/kernel/x86_64/dgemv_t_atom.S b/kernel/x86_64/dgemv_t_atom.S index 246bdd3e4..1e63c427b 100644 --- a/kernel/x86_64/dgemv_t_atom.S +++ b/kernel/x86_64/dgemv_t_atom.S @@ -47,12 +47,12 @@ #ifndef WINDOWS_ABI #define STACKSIZE 64 - + #define OLD_INCX 8 + STACKSIZE(%rsp) #define OLD_Y 16 + STACKSIZE(%rsp) #define OLD_INCY 24 + STACKSIZE(%rsp) #define OLD_BUFFER 32 + STACKSIZE(%rsp) - + #define M %rdi #define N %rsi #define A %rcx @@ -66,7 +66,7 @@ #else #define STACKSIZE 256 - + #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_LDA 48 + STACKSIZE(%rsp) #define OLD_X 56 + STACKSIZE(%rsp) @@ -95,7 +95,7 @@ #define Y1 %r15 #define ALPHA %xmm3 - + PROLOGUE PROFCODE @@ -130,7 +130,7 @@ movq OLD_Y, Y movq OLD_INCY, INCY movq OLD_BUFFER, BUFFER - + leaq (,INCX, SIZE), INCX leaq (,INCY, SIZE), INCY leaq (, LDA, SIZE), LDA diff --git a/kernel/x86_64/dgemv_t_bulldozer.S b/kernel/x86_64/dgemv_t_bulldozer.S index 36ae2b9df..9cd44ee2f 100644 --- a/kernel/x86_64/dgemv_t_bulldozer.S +++ b/kernel/x86_64/dgemv_t_bulldozer.S @@ -57,7 +57,7 @@ #ifndef WINDOWS_ABI #define STACKSIZE 128 - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_A %rcx @@ -73,7 +73,7 @@ #else #define STACKSIZE 256 - + #define OLD_M %rcx #define OLD_N %rdx #define OLD_A 40 + STACKSIZE(%rsp) @@ -185,7 +185,7 @@ jle .L999x movq %rax,M -.L00: +.L00: movq LDAX,LDA movq NN,N movq AA,A @@ -210,7 +210,7 @@ jle .L999 movq BUFFER, X1 - + movq M, I sarq $3, I jle .L05 @@ -932,7 +932,7 @@ vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0 vfmaddpd %xmm1 , -16 * SIZE(A2) , %xmm12 , %xmm1 - + addq $2 * SIZE, A1 addq $2 * SIZE, A2 addq $2 * SIZE, X1 diff --git a/kernel/x86_64/dot_atom.S b/kernel/x86_64/dot_atom.S index bc67b28d3..794cf1422 100644 --- a/kernel/x86_64/dot_atom.S +++ b/kernel/x86_64/dot_atom.S @@ -60,9 +60,9 @@ SAVEREGISTERS - leaq (, INCX, SIZE), INCX + leaq (, INCX, SIZE), INCX pxor %xmm0, %xmm0 - leaq (, INCY, SIZE), INCY + leaq (, INCY, SIZE), INCY pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 diff --git a/kernel/x86_64/dot_sse.S b/kernel/x86_64/dot_sse.S index 985ce9fec..688622259 100644 --- a/kernel/x86_64/dot_sse.S +++ b/kernel/x86_64/dot_sse.S @@ -60,8 +60,8 @@ SAVEREGISTERS - leaq (, INCX, SIZE), INCX - leaq (, INCY, SIZE), INCY + leaq (, INCX, SIZE), INCX + leaq (, INCY, SIZE), INCY xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 @@ -1278,7 +1278,7 @@ #ifndef HAVE_SSE3 movhlps %xmm0, %xmm1 addps %xmm1, %xmm0 - + movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 addss %xmm1, %xmm0 diff --git a/kernel/x86_64/dot_sse2.S b/kernel/x86_64/dot_sse2.S index 875bf4e8b..ceb2d0c29 100644 --- a/kernel/x86_64/dot_sse2.S +++ b/kernel/x86_64/dot_sse2.S @@ -60,8 +60,8 @@ SAVEREGISTERS - leaq (, INCX, SIZE), INCX - leaq (, INCY, SIZE), INCY + leaq (, INCX, SIZE), INCX + leaq (, INCY, SIZE), INCY xorps %xmm0, %xmm0 xorps %xmm1, %xmm1 diff --git a/kernel/x86_64/dtrsm_kernel_LT_8x2_bulldozer.S b/kernel/x86_64/dtrsm_kernel_LT_8x2_bulldozer.S index 9e15fa240..bccf1c908 100644 --- a/kernel/x86_64/dtrsm_kernel_LT_8x2_bulldozer.S +++ b/kernel/x86_64/dtrsm_kernel_LT_8x2_bulldozer.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -49,7 +49,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -147,79 +147,79 @@ vmovddup -16 * SIZE(AO), %xmm8 vmulpd %xmm0 , %xmm8 , %xmm0 vmovddup -15 * SIZE(AO), %xmm9 - vfnmaddpd %xmm1 , %xmm0 , %xmm9 , %xmm1 + vfnmaddpd %xmm1 , %xmm0 , %xmm9 , %xmm1 vmovddup -14 * SIZE(AO), %xmm10 - vfnmaddpd %xmm2 , %xmm0 , %xmm10, %xmm2 + vfnmaddpd %xmm2 , %xmm0 , %xmm10, %xmm2 vmovddup -13 * SIZE(AO), %xmm11 - vfnmaddpd %xmm3 , %xmm0 , %xmm11, %xmm3 + vfnmaddpd %xmm3 , %xmm0 , %xmm11, %xmm3 vmovddup -12 * SIZE(AO), %xmm8 - vfnmaddpd %xmm4 , %xmm0 , %xmm8 , %xmm4 + vfnmaddpd %xmm4 , %xmm0 , %xmm8 , %xmm4 vmovddup -11 * SIZE(AO), %xmm9 - vfnmaddpd %xmm5 , %xmm0 , %xmm9 , %xmm5 + vfnmaddpd %xmm5 , %xmm0 , %xmm9 , %xmm5 vmovddup -10 * SIZE(AO), %xmm10 - vfnmaddpd %xmm6 , %xmm0 , %xmm10, %xmm6 + vfnmaddpd %xmm6 , %xmm0 , %xmm10, %xmm6 vmovddup -9 * SIZE(AO), %xmm11 - vfnmaddpd %xmm7 , %xmm0 , %xmm11, %xmm7 + vfnmaddpd %xmm7 , %xmm0 , %xmm11, %xmm7 vmovddup -7 * SIZE(AO), %xmm8 vmulpd %xmm1 , %xmm8 , %xmm1 vmovddup -6 * SIZE(AO), %xmm10 - vfnmaddpd %xmm2 , %xmm1 , %xmm10, %xmm2 + vfnmaddpd %xmm2 , %xmm1 , %xmm10, %xmm2 vmovddup -5 * SIZE(AO), %xmm11 - vfnmaddpd %xmm3 , %xmm1 , %xmm11, %xmm3 + vfnmaddpd %xmm3 , %xmm1 , %xmm11, %xmm3 vmovddup -4 * SIZE(AO), %xmm8 - vfnmaddpd %xmm4 , %xmm1 , %xmm8 , %xmm4 + vfnmaddpd %xmm4 , %xmm1 , %xmm8 , %xmm4 vmovddup -3 * SIZE(AO), %xmm9 - vfnmaddpd %xmm5 , %xmm1 , %xmm9 , %xmm5 + vfnmaddpd %xmm5 , %xmm1 , %xmm9 , %xmm5 vmovddup -2 * SIZE(AO), %xmm10 - vfnmaddpd %xmm6 , %xmm1 , %xmm10, %xmm6 + vfnmaddpd %xmm6 , %xmm1 , %xmm10, %xmm6 vmovddup -1 * SIZE(AO), %xmm11 - vfnmaddpd %xmm7 , %xmm1 , %xmm11, %xmm7 + vfnmaddpd %xmm7 , %xmm1 , %xmm11, %xmm7 vmovddup 2 * SIZE(AO), %xmm8 vmulpd %xmm2 , %xmm8 , %xmm2 vmovddup 3 * SIZE(AO), %xmm11 - vfnmaddpd %xmm3 , %xmm2 , %xmm11, %xmm3 + vfnmaddpd %xmm3 , %xmm2 , %xmm11, %xmm3 vmovddup 4 * SIZE(AO), %xmm8 - vfnmaddpd %xmm4 , %xmm2 , %xmm8 , %xmm4 + vfnmaddpd %xmm4 , %xmm2 , %xmm8 , %xmm4 vmovddup 5 * SIZE(AO), %xmm9 - vfnmaddpd %xmm5 , %xmm2 , %xmm9 , %xmm5 + vfnmaddpd %xmm5 , %xmm2 , %xmm9 , %xmm5 vmovddup 6 * SIZE(AO), %xmm10 - vfnmaddpd %xmm6 , %xmm2 , %xmm10, %xmm6 + vfnmaddpd %xmm6 , %xmm2 , %xmm10, %xmm6 vmovddup 7 * SIZE(AO), %xmm11 - vfnmaddpd %xmm7 , %xmm2 , %xmm11, %xmm7 + vfnmaddpd %xmm7 , %xmm2 , %xmm11, %xmm7 vmovddup 11 * SIZE(AO), %xmm8 vmulpd %xmm3 , %xmm8 , %xmm3 vmovddup 12 * SIZE(AO), %xmm11 - vfnmaddpd %xmm4 , %xmm3 , %xmm11, %xmm4 + vfnmaddpd %xmm4 , %xmm3 , %xmm11, %xmm4 vmovddup 13 * SIZE(AO), %xmm9 - vfnmaddpd %xmm5 , %xmm3 , %xmm9 , %xmm5 + vfnmaddpd %xmm5 , %xmm3 , %xmm9 , %xmm5 vmovddup 14 * SIZE(AO), %xmm10 - vfnmaddpd %xmm6 , %xmm3 , %xmm10, %xmm6 + vfnmaddpd %xmm6 , %xmm3 , %xmm10, %xmm6 vmovddup 15 * SIZE(AO), %xmm11 - vfnmaddpd %xmm7 , %xmm3 , %xmm11, %xmm7 + vfnmaddpd %xmm7 , %xmm3 , %xmm11, %xmm7 vmovddup 20 * SIZE(AO), %xmm8 vmulpd %xmm4 , %xmm8 , %xmm4 vmovddup 21 * SIZE(AO), %xmm9 - vfnmaddpd %xmm5 , %xmm4 , %xmm9 , %xmm5 + vfnmaddpd %xmm5 , %xmm4 , %xmm9 , %xmm5 vmovddup 22 * SIZE(AO), %xmm10 - vfnmaddpd %xmm6 , %xmm4 , %xmm10, %xmm6 + vfnmaddpd %xmm6 , %xmm4 , %xmm10, %xmm6 vmovddup 23 * SIZE(AO), %xmm11 - vfnmaddpd %xmm7 , %xmm4 , %xmm11, %xmm7 + vfnmaddpd %xmm7 , %xmm4 , %xmm11, %xmm7 vmovddup 29 * SIZE(AO), %xmm8 vmulpd %xmm5 , %xmm8 , %xmm5 vmovddup 30 * SIZE(AO), %xmm10 - vfnmaddpd %xmm6 , %xmm5 , %xmm10, %xmm6 + vfnmaddpd %xmm6 , %xmm5 , %xmm10, %xmm6 vmovddup 31 * SIZE(AO), %xmm11 - vfnmaddpd %xmm7 , %xmm5 , %xmm11, %xmm7 + vfnmaddpd %xmm7 , %xmm5 , %xmm11, %xmm7 vmovddup 38 * SIZE(AO), %xmm8 vmulpd %xmm6 , %xmm8 , %xmm6 vmovddup 39 * SIZE(AO), %xmm11 - vfnmaddpd %xmm7 , %xmm6 , %xmm11, %xmm7 + vfnmaddpd %xmm7 , %xmm6 , %xmm11, %xmm7 vmovddup 47 * SIZE(AO), %xmm8 vmulpd %xmm7 , %xmm8 , %xmm7 @@ -292,23 +292,23 @@ vmovddup -16 * SIZE(AO), %xmm8 vmulpd %xmm0 , %xmm8 , %xmm0 vmovddup -15 * SIZE(AO), %xmm9 - vfnmaddpd %xmm1 , %xmm0 , %xmm9 , %xmm1 + vfnmaddpd %xmm1 , %xmm0 , %xmm9 , %xmm1 vmovddup -14 * SIZE(AO), %xmm10 - vfnmaddpd %xmm2 , %xmm0 , %xmm10, %xmm2 + vfnmaddpd %xmm2 , %xmm0 , %xmm10, %xmm2 vmovddup -13 * SIZE(AO), %xmm11 - vfnmaddpd %xmm3 , %xmm0 , %xmm11, %xmm3 + vfnmaddpd %xmm3 , %xmm0 , %xmm11, %xmm3 vmovddup -11 * SIZE(AO), %xmm8 vmulpd %xmm1 , %xmm8 , %xmm1 vmovddup -10 * SIZE(AO), %xmm10 - vfnmaddpd %xmm2 , %xmm1 , %xmm10, %xmm2 + vfnmaddpd %xmm2 , %xmm1 , %xmm10, %xmm2 vmovddup -9 * SIZE(AO), %xmm11 - vfnmaddpd %xmm3 , %xmm1 , %xmm11, %xmm3 + vfnmaddpd %xmm3 , %xmm1 , %xmm11, %xmm3 vmovddup -6 * SIZE(AO), %xmm8 vmulpd %xmm2 , %xmm8 , %xmm2 vmovddup -5 * SIZE(AO), %xmm11 - vfnmaddpd %xmm3 , %xmm2 , %xmm11, %xmm3 + vfnmaddpd %xmm3 , %xmm2 , %xmm11, %xmm3 vmovddup -1 * SIZE(AO), %xmm8 vmulpd %xmm3 , %xmm8 , %xmm3 @@ -356,7 +356,7 @@ vmovddup -16 * SIZE(AO), %xmm8 vmulpd %xmm0 , %xmm8 , %xmm0 vmovddup -15 * SIZE(AO), %xmm9 - vfnmaddpd %xmm1 , %xmm0 , %xmm9 , %xmm1 + vfnmaddpd %xmm1 , %xmm0 , %xmm9 , %xmm1 vmovddup -13 * SIZE(AO), %xmm8 vmulpd %xmm1 , %xmm8 , %xmm1 @@ -617,7 +617,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) @@ -758,7 +758,7 @@ decq I # i -- jg .L51 - ALIGN_4 + ALIGN_4 /*********************************************************************************/ @@ -828,7 +828,7 @@ addq $4, KK - ALIGN_4 + ALIGN_4 /*********************************************************************************/ @@ -896,7 +896,7 @@ addq $2, KK - ALIGN_4 + ALIGN_4 /********************************************************************************/ .L70: testq $1, M @@ -961,8 +961,8 @@ addq $1, KK - ALIGN_4 - + ALIGN_4 + .L79: movq BO, B @@ -1048,7 +1048,7 @@ decq I # i -- jg .L91 - ALIGN_4 + ALIGN_4 /*****************************************************************************/ .L90_A: @@ -1113,7 +1113,7 @@ addq $4, KK - ALIGN_4 + ALIGN_4 /*************************************************************************************/ .L100: @@ -1178,7 +1178,7 @@ addq $2, KK - ALIGN_4 + ALIGN_4 .L110: testq $1, M @@ -1242,7 +1242,7 @@ addq $1, KK - ALIGN_4 + ALIGN_4 .L119: @@ -1251,7 +1251,7 @@ ALIGN_4 - + .L999: movq (%rsp), %rbx movq 8(%rsp), %rbp diff --git a/kernel/x86_64/dtrsm_kernel_RN_8x2_bulldozer.S b/kernel/x86_64/dtrsm_kernel_RN_8x2_bulldozer.S index 8d3964aee..9f693f852 100644 --- a/kernel/x86_64/dtrsm_kernel_RN_8x2_bulldozer.S +++ b/kernel/x86_64/dtrsm_kernel_RN_8x2_bulldozer.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -49,7 +49,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -133,10 +133,10 @@ vmulpd %xmm3 , %xmm8 , %xmm3 vmovddup -15 * SIZE(BO), %xmm9 - vfnmaddpd %xmm4 , %xmm0 , %xmm9 , %xmm4 - vfnmaddpd %xmm5 , %xmm1 , %xmm9 , %xmm5 - vfnmaddpd %xmm6 , %xmm2 , %xmm9 , %xmm6 - vfnmaddpd %xmm7 , %xmm3 , %xmm9 , %xmm7 + vfnmaddpd %xmm4 , %xmm0 , %xmm9 , %xmm4 + vfnmaddpd %xmm5 , %xmm1 , %xmm9 , %xmm5 + vfnmaddpd %xmm6 , %xmm2 , %xmm9 , %xmm6 + vfnmaddpd %xmm7 , %xmm3 , %xmm9 , %xmm7 vmovddup -13 * SIZE(BO), %xmm10 vmulpd %xmm4 , %xmm10, %xmm4 @@ -198,8 +198,8 @@ vmulpd %xmm1 , %xmm8 , %xmm1 vmovddup -15 * SIZE(BO), %xmm9 - vfnmaddpd %xmm2 , %xmm0 , %xmm9 , %xmm2 - vfnmaddpd %xmm3 , %xmm1 , %xmm9 , %xmm3 + vfnmaddpd %xmm2 , %xmm0 , %xmm9 , %xmm2 + vfnmaddpd %xmm3 , %xmm1 , %xmm9 , %xmm3 vmovddup -13 * SIZE(BO), %xmm10 vmulpd %xmm2 , %xmm10, %xmm2 @@ -242,7 +242,7 @@ vmulpd %xmm0 , %xmm8 , %xmm0 vmovddup -15 * SIZE(BO), %xmm9 - vfnmaddpd %xmm2 , %xmm0 , %xmm9 , %xmm2 + vfnmaddpd %xmm2 , %xmm0 , %xmm9 , %xmm2 vmovddup -13 * SIZE(BO), %xmm10 vmulpd %xmm2 , %xmm10, %xmm2 @@ -253,7 +253,7 @@ vmovups %xmm0 , -16 * SIZE(AO) vmovups %xmm2 , -14 * SIZE(AO) - + .endm @@ -278,7 +278,7 @@ vmulsd %xmm2 , %xmm8 , %xmm2 vmovsd -15 * SIZE(BO), %xmm9 - vfnmaddsd %xmm0 , %xmm2 , %xmm9 , %xmm0 + vfnmaddsd %xmm0 , %xmm2 , %xmm9 , %xmm0 vmovsd -13 * SIZE(BO), %xmm10 vmulsd %xmm0 , %xmm10, %xmm0 @@ -336,7 +336,7 @@ vmovups %xmm1 , -14 * SIZE(AO) vmovups %xmm2 , -12 * SIZE(AO) vmovups %xmm3 , -10 * SIZE(AO) - + .endm @@ -428,7 +428,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) @@ -566,7 +566,7 @@ decq I # i -- jg .L51 - ALIGN_4 + ALIGN_4 /*********************************************************************************/ @@ -634,7 +634,7 @@ leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO - ALIGN_4 + ALIGN_4 /*********************************************************************************/ @@ -700,7 +700,7 @@ leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO - ALIGN_4 + ALIGN_4 /********************************************************************************/ .L70: testq $1, M @@ -763,8 +763,8 @@ leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO - ALIGN_4 - + ALIGN_4 + .L79: addq $2, KK // number of values in B # only for RN Kernel @@ -847,7 +847,7 @@ decq I # i -- jg .L91 - ALIGN_4 + ALIGN_4 /*****************************************************************************/ .L90_A: @@ -909,7 +909,7 @@ leaq (AO, %rax, 4), AO addq %rax, BO - ALIGN_4 + ALIGN_4 /*************************************************************************************/ .L100: @@ -972,7 +972,7 @@ leaq (AO, %rax, 2), AO addq %rax, BO - ALIGN_4 + ALIGN_4 .L110: testq $1, M @@ -1034,7 +1034,7 @@ addq %rax, AO addq %rax, BO - ALIGN_4 + ALIGN_4 .L119: @@ -1045,7 +1045,7 @@ ALIGN_4 - + .L999: movq (%rsp), %rbx movq 8(%rsp), %rbp diff --git a/kernel/x86_64/gemm_beta.S b/kernel/x86_64/gemm_beta.S index 461df50e0..09df2b79d 100644 --- a/kernel/x86_64/gemm_beta.S +++ b/kernel/x86_64/gemm_beta.S @@ -118,7 +118,7 @@ #ifdef OPTERON prefetchw 32 * SIZE(C1) #endif - + MOVSD %xmm0, 0 * SIZE(C1) MOVSD %xmm0, 1 * SIZE(C1) MOVSD %xmm0, 2 * SIZE(C1) diff --git a/kernel/x86_64/gemm_kernel_2x8_nehalem.S b/kernel/x86_64/gemm_kernel_2x8_nehalem.S index 24e66d730..7e4b0d863 100644 --- a/kernel/x86_64/gemm_kernel_2x8_nehalem.S +++ b/kernel/x86_64/gemm_kernel_2x8_nehalem.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx @@ -51,7 +51,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -97,7 +97,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -142,7 +142,7 @@ #endif movlps %xmm0, ALPHA - + subq $-16 * SIZE, A subq $-16 * SIZE, B @@ -156,7 +156,7 @@ movq %r11, OFFSET #ifndef LEFT negq %r11 -#endif +#endif movq %r11, KK #endif @@ -202,7 +202,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO -#endif +#endif xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 @@ -241,7 +241,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -256,7 +256,7 @@ jle .L15 ALIGN_3 -.L12: +.L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm12 @@ -577,7 +577,7 @@ decq I BRANCH jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $1, M @@ -597,7 +597,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -611,7 +611,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -626,7 +626,7 @@ jle .L25 ALIGN_3 -.L22: +.L22: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 @@ -773,7 +773,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif - ALIGN_4 + ALIGN_4 .L29: #if defined(TRMMKERNEL) && !defined(LEFT) @@ -822,7 +822,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 @@ -844,7 +844,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -859,7 +859,7 @@ jle .L35 ALIGN_3 -.L32: +.L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm8 @@ -1037,7 +1037,7 @@ decq I BRANCH jg .L31 - ALIGN_4 + ALIGN_4 .L40: testq $1, M @@ -1057,7 +1057,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -1071,7 +1071,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1086,7 +1086,7 @@ jle .L45 ALIGN_3 -.L42: +.L42: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 @@ -1190,7 +1190,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif - ALIGN_4 + ALIGN_4 .L49: #if defined(TRMMKERNEL) && !defined(LEFT) @@ -1235,7 +1235,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO -#endif +#endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 @@ -1253,7 +1253,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1268,7 +1268,7 @@ jle .L55 ALIGN_3 -.L52: +.L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm8 @@ -1387,7 +1387,7 @@ decq I BRANCH jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $1, M @@ -1407,7 +1407,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -1419,7 +1419,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1434,7 +1434,7 @@ jle .L65 ALIGN_3 -.L62: +.L62: mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 @@ -1516,7 +1516,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif - ALIGN_4 + ALIGN_4 .L69: #if defined(TRMMKERNEL) && !defined(LEFT) @@ -1560,7 +1560,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO -#endif +#endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 @@ -1577,7 +1577,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1592,7 +1592,7 @@ jle .L75 ALIGN_3 -.L72: +.L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm8 @@ -1683,7 +1683,7 @@ decq I BRANCH jg .L71 - ALIGN_4 + ALIGN_4 .L80: testq $1, M @@ -1703,7 +1703,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO -#endif +#endif #ifndef TRMMKERNEL movaps -16 * SIZE(AO), %xmm0 @@ -1739,7 +1739,7 @@ jle .L85 ALIGN_3 -.L82: +.L82: mulpd %xmm0, %xmm1 #ifndef TRMMKERNEL movaps -14 * SIZE(AO), %xmm0 @@ -1818,7 +1818,7 @@ #endif movsd %xmm8, (CO1) - ALIGN_4 + ALIGN_4 .L999: movq 0(%rsp), %rbx diff --git a/kernel/x86_64/gemm_kernel_4x2_atom.S b/kernel/x86_64/gemm_kernel_4x2_atom.S index 47b16ceb9..e5f2e9105 100644 --- a/kernel/x86_64/gemm_kernel_4x2_atom.S +++ b/kernel/x86_64/gemm_kernel_4x2_atom.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M %rdi #define N %rsi #define K %rdx @@ -90,7 +90,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) @@ -134,27 +134,27 @@ #endif movsd %xmm0, ALPHA - + #ifdef TRMMKERNEL movsd %xmm4, OFFSET movsd %xmm4, KK #ifndef LEFT negq KK -#endif #endif - +#endif + leaq (, LDC, SIZE), LDC movq N, J sarq $1, J jle .L40 ALIGN_4 - + .L10: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq C, CO1 leaq (C, LDC, 1), CO2 @@ -165,7 +165,7 @@ movq K, %rax salq $BASE_SHIFT + 1, %rax leaq (B, %rax), BB - + movq M, I sarq $2, I jle .L20 @@ -182,7 +182,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO -#endif +#endif prefetcht0 0 * SIZE(BB) subq $-8 * SIZE, BB @@ -214,7 +214,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -237,7 +237,7 @@ addsd %xmm7, %xmm14 movsd 3 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 - + addsd %xmm6, %xmm15 movaps %xmm4, %xmm6 mulsd %xmm1, %xmm4 @@ -499,7 +499,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO -#endif +#endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 @@ -521,7 +521,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -711,7 +711,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO -#endif +#endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm7, %xmm7 @@ -728,7 +728,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -846,7 +846,7 @@ addq $1, KK #endif ALIGN_4 - + .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK @@ -864,7 +864,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq C, CO1 addq LDC, C @@ -887,7 +887,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO -#endif +#endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm9, %xmm9 @@ -911,7 +911,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1097,7 +1097,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO -#endif +#endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 @@ -1114,7 +1114,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1250,7 +1250,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO -#endif +#endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm5, %xmm5 @@ -1269,7 +1269,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1355,7 +1355,7 @@ movsd %xmm8, 0 * SIZE(CO1) ALIGN_4 - + .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp diff --git a/kernel/x86_64/gemm_kernel_4x4_barcelona.S b/kernel/x86_64/gemm_kernel_4x4_barcelona.S index f7015c04f..9a29a800d 100644 --- a/kernel/x86_64/gemm_kernel_4x4_barcelona.S +++ b/kernel/x86_64/gemm_kernel_4x4_barcelona.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -49,7 +49,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -295,7 +295,7 @@ movddup 17 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm0, %xmm2 ;\ addq $8 * SIZE, %rax ;\ - + #define KERNEL_SUB1(xx) \ mulpd %xmm1, %xmm0 ;\ mulpd -14 * SIZE(AO), %xmm1 ;\ @@ -400,7 +400,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) @@ -458,7 +458,7 @@ movsd %xmm12, KK #ifndef LEFT negq KK -#endif +#endif #endif movq N, J sarq $2, J # j = (n >> 2) @@ -468,13 +468,13 @@ .L01: movq C, CO1 # coffset1 = c leaq (C, LDC, 2), CO2 # coffset2 = c + ldc - + leaq (C, LDC, 4), C # c += 4 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq A, AO # aoffset = a @@ -497,7 +497,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -526,7 +526,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -784,7 +784,7 @@ decq I # i -- BRANCH jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $3, M @@ -804,7 +804,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -821,7 +821,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -981,7 +981,7 @@ addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -996,7 +996,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -1013,7 +1013,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1129,13 +1129,13 @@ #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif - ALIGN_4 - + ALIGN_4 + .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addq $4, KK #endif - + movq BO, B decq J # j -- @@ -1154,7 +1154,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc @@ -1179,7 +1179,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO -#endif +#endif movddup -16 * SIZE(BO), %xmm1 movddup -15 * SIZE(BO), %xmm5 @@ -1202,7 +1202,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1359,7 +1359,7 @@ addq $4 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $2, M @@ -1376,7 +1376,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -1392,7 +1392,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1480,7 +1480,7 @@ .L69: addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 - + #ifndef TRMMKERNEL movupd (CO1), %xmm0 movupd (CO2), %xmm2 @@ -1514,7 +1514,7 @@ addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L70: testq $1, M @@ -1531,7 +1531,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -1547,7 +1547,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1646,8 +1646,8 @@ #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif - ALIGN_4 - + ALIGN_4 + .L79: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK @@ -1667,7 +1667,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq C, CO1 # coffset1 = c movq A, AO # aoffset = a @@ -1687,7 +1687,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO -#endif +#endif movapd -8 * SIZE(AO), %xmm2 xorps %xmm8, %xmm8 @@ -1706,7 +1706,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1824,7 +1824,7 @@ addq $4 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L91 - ALIGN_4 + ALIGN_4 .L100: testq $2, M @@ -1841,7 +1841,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO -#endif +#endif movddup -16 * SIZE(BO), %xmm0 xorps %xmm8, %xmm8 @@ -1857,7 +1857,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1958,7 +1958,7 @@ addq $2 * SIZE, CO1 # coffset += 4 - ALIGN_4 + ALIGN_4 .L110: testq $1, M @@ -1975,7 +1975,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -1987,7 +1987,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2063,7 +2063,7 @@ movsd %xmm8, (CO1) ALIGN_4 - + .L999: movq (%rsp), %rbx movq 8(%rsp), %rbp diff --git a/kernel/x86_64/gemm_kernel_4x4_core2.S b/kernel/x86_64/gemm_kernel_4x4_core2.S index fa79fe0c5..2f2ddc875 100644 --- a/kernel/x86_64/gemm_kernel_4x4_core2.S +++ b/kernel/x86_64/gemm_kernel_4x4_core2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -49,7 +49,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -91,7 +91,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) @@ -142,7 +142,7 @@ unpcklpd %xmm0, %xmm0 movapd %xmm0, ALPHA - + subq $-16 * SIZE, A subq $-16 * SIZE, B @@ -156,7 +156,7 @@ movsd %xmm12, KK #ifndef LEFT negq KK -#endif +#endif #endif movq N, J @@ -168,11 +168,11 @@ .L01: /* Copying to Sub Buffer */ leaq 16 * SIZE + BUFFER, BO - + #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movapd -16 * SIZE(B), %xmm0 movapd -8 * SIZE(B), %xmm4 @@ -182,7 +182,7 @@ NOBRANCH jle .L05 ALIGN_3 - + .L02: prefetcht0 (PREFETCH_R + 0) * SIZE(B) prefetcht0 (PREFETCH_R + 8) * SIZE(B) @@ -274,7 +274,7 @@ BRANCH jne .L06 ALIGN_4 - + .L10: leaq (PREFETCH_R + 0) * SIZE(B), BB @@ -300,7 +300,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO -#endif +#endif prefetcht2 (BB) @@ -334,7 +334,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -349,7 +349,7 @@ jle .L15 ALIGN_4 -.L12: +.L12: PADDING; addpd %xmm2, %xmm10 movaps -15 * SIZE(BO), %xmm2 @@ -597,7 +597,7 @@ BRANCH jg .L11 jmp .L20 - ALIGN_4 + ALIGN_4 .L18x: #ifndef TRMMKERNEL @@ -665,7 +665,7 @@ decq I # i -- BRANCH jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $2, M @@ -683,7 +683,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO -#endif +#endif pxor %xmm8, %xmm8 movapd -16 * SIZE(AO), %xmm0 @@ -701,7 +701,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -716,7 +716,7 @@ jle .L25 ALIGN_4 -.L21: +.L21: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm2, %xmm8 movapd -16 * SIZE(BO), %xmm2 @@ -872,7 +872,7 @@ addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 ALIGN_4 - + .L30: testq $1, M BRANCH @@ -889,7 +889,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO -#endif +#endif pxor %xmm8, %xmm8 movsd -16 * SIZE(AO), %xmm0 @@ -908,7 +908,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -923,7 +923,7 @@ jle .L35 ALIGN_4 -.L31: +.L31: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addsd %xmm2, %xmm8 movsd -16 * SIZE(BO), %xmm2 @@ -1088,11 +1088,11 @@ .L41: /* Copying to Sub Buffer */ leaq BUFFER, BO - + #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq K, %rax sarq $3, %rax @@ -1100,7 +1100,7 @@ addq %rax, %rax ALIGN_4 - + .L42: movddup -16 * SIZE(B), %xmm8 movddup -15 * SIZE(B), %xmm9 @@ -1146,7 +1146,7 @@ subq $1, %rax jne .L44 ALIGN_4 - + .L45: movq C, CO1 leaq (C, LDC, 1), CO2 @@ -1169,7 +1169,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -1193,7 +1193,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1207,7 +1207,7 @@ jle .L55 ALIGN_4 -.L51: +.L51: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm2, %xmm8 @@ -1369,7 +1369,7 @@ addq $4 * SIZE, CO2 subq $1, I jg .L50 - ALIGN_4 + ALIGN_4 .L60: testq $2, M @@ -1386,7 +1386,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif pxor %xmm8, %xmm8 movapd -16 * SIZE(AO), %xmm0 @@ -1406,7 +1406,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1420,7 +1420,7 @@ jle .L65 ALIGN_4 -.L61: +.L61: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm2, %xmm8 @@ -1531,7 +1531,7 @@ addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 ALIGN_4 - + .L70: testq $1, M jle .L79 @@ -1547,7 +1547,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif movsd -16 * SIZE(AO), %xmm0 movsd -15 * SIZE(AO), %xmm1 @@ -1566,7 +1566,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1580,7 +1580,7 @@ jle .L75 ALIGN_4 -.L71: +.L71: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addsd %xmm2, %xmm8 @@ -1702,11 +1702,11 @@ .L81: /* Copying to Sub Buffer */ leaq BUFFER, BO - + #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq K, %rax sarq $4, %rax @@ -1714,7 +1714,7 @@ addq %rax, %rax ALIGN_4 - + .L82: movddup -16 * SIZE(B), %xmm8 movddup -15 * SIZE(B), %xmm9 @@ -1757,7 +1757,7 @@ subq $1, %rax jne .L84 ALIGN_4 - + .L85: movq C, CO1 movq A, AO @@ -1779,7 +1779,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 movapd -16 * SIZE(BO), %xmm4 @@ -1801,7 +1801,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1815,7 +1815,7 @@ jle .L95 ALIGN_4 -.L91: +.L91: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm0, %xmm8 @@ -1927,7 +1927,7 @@ addq $4 * SIZE, CO1 subq $1, I jg .L90 - ALIGN_4 + ALIGN_4 .L100: testq $2, M @@ -1944,7 +1944,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 movapd -16 * SIZE(BO), %xmm4 @@ -1965,7 +1965,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1979,7 +1979,7 @@ jle .L105 ALIGN_4 -.L101: +.L101: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm0, %xmm8 @@ -2068,7 +2068,7 @@ addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 ALIGN_4 - + .L110: testq $1, M jle .L999 @@ -2084,7 +2084,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 movsd -16 * SIZE(BO), %xmm4 @@ -2105,7 +2105,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2119,7 +2119,7 @@ jle .L115 ALIGN_4 -.L111: +.L111: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm0, %xmm8 diff --git a/kernel/x86_64/gemm_kernel_4x4_penryn.S b/kernel/x86_64/gemm_kernel_4x4_penryn.S index 3179c7db7..56611e5c5 100644 --- a/kernel/x86_64/gemm_kernel_4x4_penryn.S +++ b/kernel/x86_64/gemm_kernel_4x4_penryn.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx @@ -51,7 +51,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -121,7 +121,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -166,7 +166,7 @@ #endif movlps %xmm0, ALPHA - + subq $-16 * SIZE, A subq $-17 * SIZE, B @@ -180,7 +180,7 @@ movq %r11, OFFSET #ifndef LEFT negq %r11 -#endif +#endif movq %r11, KK #endif @@ -223,7 +223,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 xorpd %xmm3, %xmm3 @@ -255,7 +255,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -270,7 +270,7 @@ jle .L15 ALIGN_3 -.L12: +.L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm3, %xmm11 movaps -15 * SIZE(BO), %xmm3 @@ -543,7 +543,7 @@ BRANCH jg .L11 jmp .L20 - ALIGN_4 + ALIGN_4 .L18x: #ifndef TRMMKERNEL @@ -611,7 +611,7 @@ decq I # i -- BRANCH jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $2, M @@ -631,7 +631,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 movaps -17 * SIZE(BO), %xmm2 @@ -652,7 +652,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -667,7 +667,7 @@ jle .L25 ALIGN_4 -.L22: +.L22: addpd %xmm3, %xmm11 movaps -15 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 @@ -831,7 +831,7 @@ addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -851,7 +851,7 @@ leaq (, %rax, SIZE), %rax addq %rax, AO leaq (BO, %rax, 4), BO -#endif +#endif movsd -16 * SIZE(AO), %xmm0 movaps -17 * SIZE(BO), %xmm2 @@ -867,7 +867,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -882,7 +882,7 @@ jle .L35 ALIGN_4 -.L32: +.L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) shufps $0x44, %xmm0, %xmm0 @@ -1002,7 +1002,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif - ALIGN_4 + ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) @@ -1031,7 +1031,7 @@ movq OFFSET, %rax movq %rax, KK #endif - + movq K, %rax salq $BASE_SHIFT + 1, %rax leaq (B, %rax), BB @@ -1055,7 +1055,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO -#endif +#endif PREFETCHB -16 * SIZE(BB) subq $-4 * SIZE, BB @@ -1076,7 +1076,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1091,7 +1091,7 @@ jle .L55 ALIGN_4 -.L52: +.L52: movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 @@ -1265,7 +1265,7 @@ decq I BRANCH jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $2, M @@ -1285,7 +1285,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -1299,7 +1299,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1314,7 +1314,7 @@ jle .L65 ALIGN_4 -.L62: +.L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x4e, %xmm2, %xmm7 @@ -1431,7 +1431,7 @@ addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 - ALIGN_4 + ALIGN_4 .L70: testq $1, M @@ -1451,7 +1451,7 @@ leaq (, %rax, SIZE), %rax addq %rax, AO leaq (BO, %rax, 2), BO -#endif +#endif movsd -16 * SIZE(AO), %xmm0 movaps -17 * SIZE(BO), %xmm2 @@ -1464,7 +1464,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1479,7 +1479,7 @@ jle .L75 ALIGN_4 -.L72: +.L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) shufps $0x44, %xmm0, %xmm0 @@ -1570,7 +1570,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif - ALIGN_4 + ALIGN_4 .L79: #if defined(TRMMKERNEL) && !defined(LEFT) @@ -1613,7 +1613,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO addq %rax, BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 @@ -1630,7 +1630,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1645,7 +1645,7 @@ jle .L95 ALIGN_4 -.L92: +.L92: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x44, %xmm2, %xmm3 @@ -1777,7 +1777,7 @@ decq I BRANCH jg .L91 - ALIGN_4 + ALIGN_4 .L100: testq $2, M @@ -1797,7 +1797,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO addq %rax, BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -1809,7 +1809,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1824,7 +1824,7 @@ jle .L105 ALIGN_4 -.L102: +.L102: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x44, %xmm2, %xmm3 @@ -1923,13 +1923,13 @@ #endif addq $2 * SIZE, CO1 - ALIGN_4 + ALIGN_4 .L110: testq $1, M BRANCH jle .L999 - + #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) @@ -1942,7 +1942,7 @@ leaq (, %rax, SIZE), %rax addq %rax, AO addq %rax, BO -#endif +#endif movsd -16 * SIZE(AO), %xmm0 movsd -17 * SIZE(BO), %xmm2 @@ -1955,7 +1955,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1970,7 +1970,7 @@ jle .L115 ALIGN_4 -.L112: +.L112: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulsd %xmm0, %xmm2 @@ -2041,7 +2041,7 @@ #endif movlpd %xmm8, 0 * SIZE(CO1) - ALIGN_4 + ALIGN_4 .L999: movq 0(%rsp), %rbx diff --git a/kernel/x86_64/gemm_kernel_4x4_sse2.S b/kernel/x86_64/gemm_kernel_4x4_sse2.S index 10601970f..bc317da8e 100644 --- a/kernel/x86_64/gemm_kernel_4x4_sse2.S +++ b/kernel/x86_64/gemm_kernel_4x4_sse2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -328,11 +328,11 @@ movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addpd %xmm6, %xmm15 ;\ movapd 6 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 -#endif +#endif PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) @@ -398,7 +398,7 @@ movsd %xmm12, KK #ifndef LEFT negq KK -#endif +#endif #endif movq N, J sarq $2, J # j = (n >> 2) @@ -410,17 +410,17 @@ leaq 16 * SIZE + BUFFER, BO movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc - + #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq K, %rax sarq $2, %rax jle .L03 ALIGN_3 - + #define RPREFETCHSIZE (8 * 7 + 4) #define WPREFETCHSIZE (8 * 8 + 4) @@ -528,7 +528,7 @@ subq $1, %rax jne .L04 ALIGN_3 - + .L10: movq A, AO # aoffset = a @@ -551,7 +551,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm1 @@ -582,7 +582,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -964,7 +964,7 @@ decq I # i -- BRANCH jg .L11 - ALIGN_3 + ALIGN_3 .L20: testq $3, M @@ -986,7 +986,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1005,7 +1005,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1212,7 +1212,7 @@ addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 - ALIGN_3 + ALIGN_3 .L30: testq $1, M @@ -1231,7 +1231,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO -#endif +#endif movsd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1250,7 +1250,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1445,8 +1445,8 @@ #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif - ALIGN_3 - + ALIGN_3 + .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK @@ -1468,17 +1468,17 @@ .L41: /* Copying to Sub Buffer */ leaq BUFFER, BO - + #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq K, %rax sarq $2, %rax jle .L43 ALIGN_3 - + .L42: PREFETCH 56 * SIZE(B) @@ -1536,7 +1536,7 @@ decq %rax jne .L44 ALIGN_3 - + .L50: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc @@ -1559,7 +1559,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1583,7 +1583,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1794,7 +1794,7 @@ addq $4 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L51 - ALIGN_3 + ALIGN_3 .L60: testq $2, M @@ -1813,7 +1813,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1832,7 +1832,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1976,7 +1976,7 @@ addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 - ALIGN_3 + ALIGN_3 .L70: testq $1, M @@ -1995,7 +1995,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif movsd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -2014,7 +2014,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2150,8 +2150,8 @@ #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif - ALIGN_3 - + ALIGN_3 + .L79: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK @@ -2167,17 +2167,17 @@ .L81: /* Copying to Sub Buffer */ leaq BUFFER, BO - + #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq K, %rax sarq $3, %rax jle .L83 ALIGN_3 - + .L82: PREFETCH 56 * SIZE(B) @@ -2232,7 +2232,7 @@ decq %rax jne .L84 ALIGN_3 - + .L90: movq C, CO1 # coffset1 = c movq A, AO # aoffset = a @@ -2254,7 +2254,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -2275,7 +2275,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2415,7 +2415,7 @@ addq $4 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L91 - ALIGN_3 + ALIGN_3 .L100: testq $2, M @@ -2434,7 +2434,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -2450,7 +2450,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2554,7 +2554,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif - ALIGN_3 + ALIGN_3 .L110: testq $1, M @@ -2573,7 +2573,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO -#endif +#endif movsd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -2589,7 +2589,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2673,7 +2673,7 @@ #endif movsd %xmm8, 0 * SIZE(CO1) ALIGN_3 - + .L999: movq %rbx, %rsp diff --git a/kernel/x86_64/gemm_kernel_4x4_sse3.S b/kernel/x86_64/gemm_kernel_4x4_sse3.S index 8cbe6ed16..ae153fe8b 100644 --- a/kernel/x86_64/gemm_kernel_4x4_sse3.S +++ b/kernel/x86_64/gemm_kernel_4x4_sse3.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M %rdi #define N %rsi #define K %rdx @@ -333,7 +333,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) @@ -377,27 +377,27 @@ #endif movsd %xmm0, ALPHA - + #ifdef TRMMKERNEL movsd %xmm4, OFFSET movsd %xmm4, KK #ifndef LEFT negq KK -#endif #endif - +#endif + leaq (, LDC, SIZE), LDC movq N, J sarq $2, J # j = (n >> 2) jle .L40 ALIGN_4 - + .L10: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc @@ -406,7 +406,7 @@ movq K, %rax salq $BASE_SHIFT + 2, %rax leaq (B, %rax), BB - + movq M, I sarq $2, I # i = (m >> 2) jle .L20 @@ -423,7 +423,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -456,7 +456,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -473,7 +473,7 @@ NOBRANCH je .L15 -.L1X: +.L1X: KERNEL1 (16 * 0) KERNEL2 (16 * 0) KERNEL3 (16 * 0) @@ -1076,7 +1076,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -1092,7 +1092,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1300,7 +1300,7 @@ addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -1318,7 +1318,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO -#endif +#endif movddup 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -1334,7 +1334,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1472,8 +1472,8 @@ #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif - ALIGN_4 - + ALIGN_4 + .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK @@ -1493,7 +1493,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc @@ -1519,7 +1519,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO -#endif +#endif prefetcht0 0 * SIZE(BB) subq $-4 * SIZE, BB @@ -1546,7 +1546,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1766,7 +1766,7 @@ addq $4 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $2, M @@ -1784,7 +1784,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -1800,7 +1800,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1944,7 +1944,7 @@ #endif addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L70: testq $1, M @@ -1962,7 +1962,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO -#endif +#endif movddup 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -1978,7 +1978,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2081,8 +2081,8 @@ #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif - ALIGN_4 - + ALIGN_4 + .L79: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK @@ -2099,7 +2099,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq C, CO1 movq A, AO @@ -2120,7 +2120,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -2142,7 +2142,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2280,7 +2280,7 @@ addq $4 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L91 - ALIGN_4 + ALIGN_4 .L100: testq $2, M @@ -2298,7 +2298,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -2314,7 +2314,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2419,7 +2419,7 @@ #endif addq $2 * SIZE, CO1 # coffset += 4 - ALIGN_4 + ALIGN_4 .L110: testq $1, M @@ -2437,7 +2437,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO -#endif +#endif movsd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -2458,7 +2458,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2530,8 +2530,8 @@ #endif movsd %xmm0, 0 * SIZE(CO1) - ALIGN_4 - + ALIGN_4 + .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp diff --git a/kernel/x86_64/gemm_kernel_4x8_nano.S b/kernel/x86_64/gemm_kernel_4x8_nano.S index 4d814053f..074562804 100644 --- a/kernel/x86_64/gemm_kernel_4x8_nano.S +++ b/kernel/x86_64/gemm_kernel_4x8_nano.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi @@ -56,7 +56,7 @@ #define CO1 %r15 #define CO2 %rbp #define BB %r12 - + #ifndef WINDOWS_ABI #define STACKSIZE 64 @@ -90,7 +90,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -145,13 +145,13 @@ shufps $0, %xmm0, %xmm0 movaps %xmm0, ALPHA - + #ifdef TRMMKERNEL movsd %xmm4, OFFSET movsd %xmm4, KK #ifndef LEFT negq KK -#endif +#endif #endif subq $-32 * SIZE, A @@ -166,10 +166,10 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif leaq 32 * SIZE + BUFFER, BO - + movaps 0 * SIZE(B), %xmm1 movaps 4 * SIZE(B), %xmm3 movaps 8 * SIZE(B), %xmm5 @@ -179,7 +179,7 @@ sarq $1, %rax jle .L03 ALIGN_4 - + .L02: PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) @@ -237,7 +237,7 @@ addq $ 8 * SIZE, B subq $-16 * SIZE, BO ALIGN_4 - + .L10: movq C, CO1 leaq (C, LDC, 4), CO2 @@ -262,7 +262,7 @@ salq $BASE_SHIFT + 1, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO -#endif +#endif leaq (LDC, LDC, 2), %rax @@ -295,7 +295,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -568,7 +568,7 @@ addq $4 * SIZE, CO2 decq I jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $2, M @@ -585,7 +585,7 @@ salq $BASE_SHIFT + 1, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO -#endif +#endif movddup -32 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -599,7 +599,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -760,7 +760,7 @@ addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 ALIGN_4 - + .L30: testq $1, M je .L39 @@ -777,7 +777,7 @@ leaq (AO, %rax, 1), AO addq %rax, %rax leaq (BO, %rax, 8), BO -#endif +#endif movss -32 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -791,7 +791,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -972,10 +972,10 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif leaq 32 * SIZE + BUFFER, BO - + movaps 0 * SIZE(B), %xmm1 movaps 4 * SIZE(B), %xmm3 movaps 8 * SIZE(B), %xmm5 @@ -985,7 +985,7 @@ sarq $2, %rax jle .L43 ALIGN_4 - + .L42: PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) @@ -1043,7 +1043,7 @@ decq %rax jne .L45 ALIGN_4 - + .L50: movq C, CO1 leaq (C, LDC, 2), CO2 @@ -1066,7 +1066,7 @@ salq $BASE_SHIFT + 1, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 movaps -32 * SIZE(BO), %xmm1 @@ -1085,7 +1085,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1253,7 +1253,7 @@ addq $4 * SIZE, CO2 decq I jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $2, M @@ -1270,7 +1270,7 @@ salq $BASE_SHIFT + 1, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif movddup -32 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1284,7 +1284,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1399,7 +1399,7 @@ addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 ALIGN_4 - + .L70: testq $1, M je .L79 @@ -1415,7 +1415,7 @@ salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO -#endif +#endif movss -32 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1427,7 +1427,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1558,10 +1558,10 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif leaq 32 * SIZE + BUFFER, BO - + movaps 0 * SIZE(B), %xmm1 movaps 4 * SIZE(B), %xmm3 @@ -1569,7 +1569,7 @@ sarq $2, %rax jle .L83 ALIGN_4 - + .L82: pshufd $0x50, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(BO) @@ -1609,7 +1609,7 @@ decq %rax jne .L85 ALIGN_4 - + .L90: movq C, CO1 leaq (C, LDC), CO2 @@ -1632,7 +1632,7 @@ salq $BASE_SHIFT, %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 movaps -32 * SIZE(BO), %xmm1 @@ -1647,7 +1647,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1766,7 +1766,7 @@ addq $4 * SIZE, CO2 decq I jg .L91 - ALIGN_4 + ALIGN_4 .L100: testq $2, M @@ -1783,7 +1783,7 @@ salq $BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif movddup -32 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1794,7 +1794,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1891,7 +1891,7 @@ addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 ALIGN_4 - + .L110: testq $1, M je .L119 @@ -1907,7 +1907,7 @@ salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif movss -32 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1918,7 +1918,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2031,10 +2031,10 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif leaq 32 * SIZE + BUFFER, BO - + movsd 0 * SIZE(B), %xmm1 movhps 2 * SIZE(B), %xmm1 @@ -2042,7 +2042,7 @@ sarq $2, %rax jle .L123 ALIGN_4 - + .L122: pshufd $0x50, %xmm1, %xmm0 movaps %xmm0, -32 * SIZE(BO) @@ -2076,7 +2076,7 @@ decq %rax jne .L125 ALIGN_4 - + .L130: movq C, CO1 movq A, AO @@ -2098,7 +2098,7 @@ salq $BASE_SHIFT, %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 movddup -32 * SIZE(BO), %xmm1 @@ -2111,7 +2111,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2208,7 +2208,7 @@ addq $4 * SIZE, CO1 decq I jg .L131 - ALIGN_4 + ALIGN_4 .L140: testq $2, M @@ -2225,7 +2225,7 @@ salq $BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO -#endif +#endif movddup -32 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -2236,7 +2236,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2330,7 +2330,7 @@ addq $2 * SIZE, CO1 ALIGN_4 - + .L150: testq $1, M je .L999 @@ -2346,7 +2346,7 @@ salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO -#endif +#endif movss -32 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -2357,7 +2357,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT diff --git a/kernel/x86_64/gemm_kernel_4x8_nehalem.S b/kernel/x86_64/gemm_kernel_4x8_nehalem.S index 5d02ac63d..549ea13b3 100644 --- a/kernel/x86_64/gemm_kernel_4x8_nehalem.S +++ b/kernel/x86_64/gemm_kernel_4x8_nehalem.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx @@ -51,7 +51,7 @@ #define B %r8 #define C %r9 #define LDC %rbp - + #define I %r11 #define AO %rdi #define BO %rsi @@ -97,7 +97,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -143,7 +143,7 @@ unpcklps %xmm0, %xmm0 movlps %xmm0, ALPHA - + subq $-32 * SIZE, A subq $-32 * SIZE, B @@ -157,7 +157,7 @@ movq %r11, OFFSET #ifndef LEFT negq %r11 -#endif +#endif movq %r11, KK #endif @@ -203,7 +203,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO -#endif +#endif leaq (LDC, LDC, 2), %rax @@ -242,7 +242,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -257,7 +257,7 @@ jle .L15 ALIGN_3 -.L12: +.L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm12 @@ -528,7 +528,7 @@ decq I BRANCH jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $2, M @@ -547,7 +547,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -566,7 +566,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -581,7 +581,7 @@ jle .L25 ALIGN_3 -.L22: +.L22: addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 @@ -757,7 +757,7 @@ addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -776,7 +776,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO -#endif +#endif xorps %xmm2, %xmm2 movsd -32 * SIZE(AO), %xmm0 @@ -789,7 +789,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -804,7 +804,7 @@ jle .L35 ALIGN_3 -.L32: +.L32: pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movaps -32 * SIZE(BO), %xmm2 @@ -932,8 +932,8 @@ addq $1, KK #endif - ALIGN_4 - + ALIGN_4 + .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addq $8, KK @@ -980,7 +980,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO -#endif +#endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 @@ -1002,7 +1002,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1017,7 +1017,7 @@ jle .L45 ALIGN_3 -.L42: +.L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 @@ -1193,7 +1193,7 @@ decq I BRANCH jg .L41 - ALIGN_4 + ALIGN_4 .L50: testq $2, M @@ -1212,7 +1212,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -1226,7 +1226,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1241,7 +1241,7 @@ jle .L55 ALIGN_3 -.L52: +.L52: addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 @@ -1354,7 +1354,7 @@ addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 - ALIGN_4 + ALIGN_4 .L60: testq $1, M @@ -1373,7 +1373,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif xorps %xmm2, %xmm2 movsd -32 * SIZE(AO), %xmm0 @@ -1385,7 +1385,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1400,7 +1400,7 @@ jle .L65 ALIGN_3 -.L62: +.L62: pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movaps -32 * SIZE(BO), %xmm2 @@ -1494,8 +1494,8 @@ addq $1, KK #endif - ALIGN_4 - + ALIGN_4 + .L69: #if defined(TRMMKERNEL) && !defined(LEFT) addq $4, KK @@ -1538,7 +1538,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO -#endif +#endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 @@ -1555,7 +1555,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1570,7 +1570,7 @@ jle .L75 ALIGN_3 -.L72: +.L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 @@ -1687,7 +1687,7 @@ decq I BRANCH jg .L71 - ALIGN_4 + ALIGN_4 .L80: testq $2, M @@ -1706,7 +1706,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -1720,7 +1720,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1735,7 +1735,7 @@ jle .L85 ALIGN_3 -.L82: +.L82: addps %xmm1, %xmm8 movsd -32 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 @@ -1825,7 +1825,7 @@ addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 - ALIGN_4 + ALIGN_4 .L90: testq $1, M @@ -1844,7 +1844,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO -#endif +#endif xorps %xmm2, %xmm2 movsd -32 * SIZE(AO), %xmm0 @@ -1856,7 +1856,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1871,7 +1871,7 @@ jle .L95 ALIGN_3 -.L92: +.L92: pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movsd -32 * SIZE(BO), %xmm2 @@ -1959,8 +1959,8 @@ addq $1, KK #endif - ALIGN_4 - + ALIGN_4 + .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK @@ -2002,7 +2002,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO -#endif +#endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 @@ -2015,7 +2015,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2030,7 +2030,7 @@ jle .L105 ALIGN_3 -.L102: +.L102: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 @@ -2124,7 +2124,7 @@ decq I BRANCH jg .L101 - ALIGN_4 + ALIGN_4 .L110: testq $2, M @@ -2143,7 +2143,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -2154,7 +2154,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2169,7 +2169,7 @@ jle .L115 ALIGN_3 -.L112: +.L112: addps %xmm1, %xmm8 movss -32 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 @@ -2255,7 +2255,7 @@ #endif addq $2 * SIZE, CO1 - ALIGN_4 + ALIGN_4 .L120: testq $1, M @@ -2274,7 +2274,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO -#endif +#endif xorps %xmm2, %xmm2 movss -32 * SIZE(AO), %xmm0 @@ -2285,7 +2285,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2300,7 +2300,7 @@ jle .L125 ALIGN_3 -.L122: +.L122: addss %xmm2, %xmm8 movss -32 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 @@ -2366,8 +2366,8 @@ #endif movss %xmm8, (CO1) - ALIGN_4 - + ALIGN_4 + .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp diff --git a/kernel/x86_64/gemm_kernel_8x4_barcelona.S b/kernel/x86_64/gemm_kernel_8x4_barcelona.S index becd19544..184956591 100644 --- a/kernel/x86_64/gemm_kernel_8x4_barcelona.S +++ b/kernel/x86_64/gemm_kernel_8x4_barcelona.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -49,14 +49,14 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define CO2 %r12 #define BB %rbp - + #ifndef WINDOWS_ABI #define STACKSIZE 64 @@ -295,7 +295,7 @@ movaps 100 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm0, %xmm2 ;\ addq $16 * SIZE, %rax - + #define KERNEL_SUB1(xx) \ mulps %xmm1, %xmm0 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ @@ -403,7 +403,7 @@ #endif PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -458,13 +458,13 @@ shufps $0, %xmm0, %xmm0 movaps %xmm0, ALPHA - + #ifdef TRMMKERNEL movsd %xmm12, OFFSET movsd %xmm12, KK #ifndef LEFT negq KK -#endif +#endif #endif subq $-32 * SIZE, A @@ -479,16 +479,16 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO - + movq K, %rax sarq $2, %rax jle .L03 ALIGN_4 - + .L02: prefetch (RPREFETCHSIZE + 0) * SIZE(B) @@ -575,7 +575,7 @@ decq %rax jne .L04 ALIGN_4 - + .L10: movq C, CO1 leaq (C, LDC, 1), CO2 @@ -600,7 +600,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 movaps -32 * SIZE(BO), %xmm1 @@ -629,7 +629,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -876,7 +876,7 @@ addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $4, M @@ -893,7 +893,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 @@ -913,7 +913,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1124,7 +1124,7 @@ addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L30: testq $2, M @@ -1141,7 +1141,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -24 * SIZE(AO), %xmm10 @@ -1161,7 +1161,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1368,7 +1368,7 @@ addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L40: testq $1, M @@ -1386,7 +1386,7 @@ leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO leaq (BO, %rax, 8), BO -#endif +#endif movss -32 * SIZE(AO), %xmm8 movss -28 * SIZE(AO), %xmm10 @@ -1406,7 +1406,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1611,8 +1611,8 @@ #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif - ALIGN_4 - + ALIGN_4 + .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK @@ -1629,16 +1629,16 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO - + movq K, %rax sarq $2, %rax jle .L53 ALIGN_4 - + .L52: prefetch (RPREFETCHSIZE + 0) * SIZE(B) @@ -1701,7 +1701,7 @@ decq %rax jne .L54 ALIGN_4 - + .L60: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc @@ -1724,7 +1724,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 @@ -1749,7 +1749,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1970,7 +1970,7 @@ addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L61 - ALIGN_4 + ALIGN_4 .L70: testq $4, M @@ -1988,7 +1988,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 @@ -2008,7 +2008,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2154,7 +2154,7 @@ addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L80: testq $2, M @@ -2171,7 +2171,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -24 * SIZE(AO), %xmm10 @@ -2191,7 +2191,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2341,7 +2341,7 @@ addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L90: testq $1, M @@ -2358,7 +2358,7 @@ leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO -#endif +#endif movss -32 * SIZE(AO), %xmm8 movss -28 * SIZE(AO), %xmm10 @@ -2378,7 +2378,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2524,8 +2524,8 @@ #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif - ALIGN_4 - + ALIGN_4 + .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK @@ -2542,16 +2542,16 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO - + movq K, %rax sarq $3, %rax jle .L103 ALIGN_4 - + .L102: prefetch (RPREFETCHSIZE + 0) * SIZE(B) @@ -2608,7 +2608,7 @@ decq %rax jne .L104 ALIGN_4 - + .L110: movq C, CO1 # coffset1 = c movq A, AO # aoffset = a @@ -2630,7 +2630,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 @@ -2654,7 +2654,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2806,7 +2806,7 @@ addq $8 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L111 - ALIGN_4 + ALIGN_4 .L120: testq $4, M @@ -2823,7 +2823,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 @@ -2841,7 +2841,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2952,7 +2952,7 @@ #endif addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 + ALIGN_4 .L130: testq $2, M @@ -2969,7 +2969,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -24 * SIZE(AO), %xmm10 @@ -2987,7 +2987,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -3099,7 +3099,7 @@ #endif addq $2 * SIZE, CO1 # coffset += 4 - ALIGN_4 + ALIGN_4 .L140: testq $1, M @@ -3116,7 +3116,7 @@ leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif movss -32 * SIZE(AO), %xmm8 movss -28 * SIZE(AO), %xmm10 @@ -3134,7 +3134,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -3221,8 +3221,8 @@ addss %xmm8, %xmm0 #endif movss %xmm0, 0 * SIZE(CO1) - ALIGN_4 - + ALIGN_4 + .L999: movq %rbx, %rsp movq 0(%rsp), %rbx diff --git a/kernel/x86_64/gemm_kernel_8x4_core2.S b/kernel/x86_64/gemm_kernel_8x4_core2.S index 285d6441e..c31dc90dc 100644 --- a/kernel/x86_64/gemm_kernel_8x4_core2.S +++ b/kernel/x86_64/gemm_kernel_8x4_core2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -49,7 +49,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -91,7 +91,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) @@ -142,7 +142,7 @@ shufps $0, %xmm0, %xmm0 movaps %xmm0, ALPHA - + subq $-32 * SIZE, A subq $-32 * SIZE, B @@ -151,7 +151,7 @@ movsd %xmm12, KK #ifndef LEFT negq KK -#endif +#endif #endif movq OLD_M, M @@ -168,18 +168,18 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq 32 * SIZE + BUFFER, BO - + movaps -32 * SIZE(B), %xmm3 movq K, %rax sarq $2, %rax jle .L05 ALIGN_4 - + .L02: prefetcht0 (PREFETCH_R + 0) * SIZE(B) movaps -28 * SIZE(B), %xmm7 @@ -261,7 +261,7 @@ subq $1, %rax jne .L06 ALIGN_4 - + .L10: leaq (PREFETCH_R + 0) * SIZE(B), BB @@ -286,7 +286,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO -#endif +#endif pxor %xmm8, %xmm8 movaps -32 * SIZE(AO), %xmm0 @@ -319,7 +319,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -333,7 +333,7 @@ jle .L15 ALIGN_4 -.L12: +.L12: addps %xmm2, %xmm10 movaps -32 * SIZE(BO), %xmm2 addps %xmm3, %xmm14 @@ -603,7 +603,7 @@ addq $8 * SIZE, CO2 subq $1, I jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $4, M @@ -620,7 +620,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -632,7 +632,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -646,7 +646,7 @@ jle .L25 ALIGN_4 -.L21: +.L21: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps -32 * SIZE(AO), %xmm0 @@ -806,7 +806,7 @@ addq $4 * SIZE, CO2 subq $1, I ALIGN_4 - + .L30: testq $2, M jle .L40 @@ -822,7 +822,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -839,7 +839,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -853,7 +853,7 @@ jle .L35 ALIGN_4 -.L31: +.L31: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd -32 * SIZE(AO), %xmm0 @@ -1019,7 +1019,7 @@ leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO leaq (BO, %rax, 8), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -1036,7 +1036,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1050,7 +1050,7 @@ jle .L45 ALIGN_4 -.L41: +.L41: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movss -32 * SIZE(AO), %xmm0 @@ -1216,11 +1216,11 @@ .L51: /* Copying to Sub Buffer */ leaq BUFFER, BO - + #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq K, %rax sarq $3, %rax @@ -1228,7 +1228,7 @@ addq %rax, %rax ALIGN_4 - + .L52: movaps -32 * SIZE(B), %xmm3 movaps -28 * SIZE(B), %xmm7 @@ -1284,7 +1284,7 @@ subq $1, %rax jne .L54 ALIGN_4 - + .L55: movq C, CO1 leaq (C, LDC, 1), CO2 @@ -1307,7 +1307,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -1322,7 +1322,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1336,7 +1336,7 @@ jle .L65 ALIGN_4 -.L61: +.L61: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps -32 * SIZE(AO), %xmm0 @@ -1504,7 +1504,7 @@ addq $8 * SIZE, CO2 subq $1, I jg .L60 - ALIGN_4 + ALIGN_4 .L70: testq $4, M @@ -1521,7 +1521,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -1533,7 +1533,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1547,7 +1547,7 @@ jle .L75 ALIGN_4 -.L71: +.L71: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps -32 * SIZE(AO), %xmm0 @@ -1658,7 +1658,7 @@ addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 ALIGN_4 - + .L80: testq $2, M jle .L90 @@ -1674,7 +1674,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -1686,7 +1686,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1700,7 +1700,7 @@ jle .L85 ALIGN_4 -.L81: +.L81: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd -32 * SIZE(AO), %xmm0 @@ -1824,7 +1824,7 @@ leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -1836,7 +1836,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1850,7 +1850,7 @@ jle .L95 ALIGN_4 -.L91: +.L91: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) @@ -1975,11 +1975,11 @@ .L101: /* Copying to Sub Buffer */ leaq BUFFER, BO - + #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq K, %rax sarq $4, %rax @@ -1987,7 +1987,7 @@ addq %rax, %rax ALIGN_4 - + .L102: movss -32 * SIZE(B), %xmm0 movss -31 * SIZE(B), %xmm1 @@ -2041,7 +2041,7 @@ subq $1, %rax jne .L104 ALIGN_4 - + .L105: movq C, CO1 movq A, AO @@ -2063,7 +2063,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -2077,7 +2077,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2091,7 +2091,7 @@ jle .L115 ALIGN_4 -.L111: +.L111: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps -32 * SIZE(AO), %xmm0 @@ -2210,7 +2210,7 @@ addq $8 * SIZE, CO1 subq $1, I jg .L110 - ALIGN_4 + ALIGN_4 .L120: testq $4, M @@ -2227,7 +2227,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -2239,7 +2239,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2253,7 +2253,7 @@ jle .L125 ALIGN_4 -.L121: +.L121: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps -32 * SIZE(AO), %xmm0 @@ -2344,7 +2344,7 @@ addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 ALIGN_4 - + .L130: testq $2, M jle .L140 @@ -2360,7 +2360,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -2372,7 +2372,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2386,7 +2386,7 @@ jle .L135 ALIGN_4 -.L131: +.L131: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd -32 * SIZE(AO), %xmm0 @@ -2487,7 +2487,7 @@ leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -2499,7 +2499,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2513,7 +2513,7 @@ jle .L145 ALIGN_4 -.L141: +.L141: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movss -32 * SIZE(AO), %xmm0 diff --git a/kernel/x86_64/gemm_kernel_8x4_penryn.S b/kernel/x86_64/gemm_kernel_8x4_penryn.S index 68ca5fc08..b381de979 100644 --- a/kernel/x86_64/gemm_kernel_8x4_penryn.S +++ b/kernel/x86_64/gemm_kernel_8x4_penryn.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx @@ -51,7 +51,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -97,7 +97,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -143,7 +143,7 @@ unpcklps %xmm0, %xmm0 movlps %xmm0, ALPHA - + subq $-32 * SIZE, A subq $-32 * SIZE, B @@ -157,7 +157,7 @@ movq %r11, OFFSET #ifndef LEFT negq %r11 -#endif +#endif movq %r11, KK #endif @@ -200,7 +200,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 8), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 xorpd %xmm3, %xmm3 @@ -235,7 +235,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -250,7 +250,7 @@ jle .L15 ALIGN_3 -.L12: +.L12: PREFETCH -32 * SIZE(PREA) addps %xmm6, %xmm10 addps %xmm3, %xmm14 @@ -665,7 +665,7 @@ decq I BRANCH jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $4, M @@ -684,7 +684,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 xorps %xmm3, %xmm3 @@ -704,7 +704,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -719,7 +719,7 @@ jle .L25 ALIGN_3 -.L22: +.L22: addps %xmm6, %xmm10 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x39, %xmm2, %xmm7 @@ -890,7 +890,7 @@ addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 - ALIGN_4 + ALIGN_4 .L30: testq $2, M @@ -909,7 +909,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 xorps %xmm3, %xmm3 @@ -928,7 +928,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -943,7 +943,7 @@ jle .L35 ALIGN_3 -.L32: +.L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x44, %xmm0, %xmm1 @@ -1064,7 +1064,7 @@ addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 - ALIGN_4 + ALIGN_4 .L40: testq $1, M @@ -1083,7 +1083,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif movsd -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -1095,7 +1095,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1110,7 +1110,7 @@ jle .L45 ALIGN_3 -.L42: +.L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x00, %xmm0, %xmm1 @@ -1257,7 +1257,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 8), AO leaq (BO, %rax, 2), BO -#endif +#endif prefetcht2 -32 * SIZE(BB) subq $-8 * SIZE, BB @@ -1284,7 +1284,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1299,7 +1299,7 @@ jle .L55 ALIGN_3 -.L52: +.L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm3, %xmm8 @@ -1471,7 +1471,7 @@ decq I BRANCH jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $4, M @@ -1490,7 +1490,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 xorps %xmm3, %xmm3 @@ -1507,7 +1507,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1522,7 +1522,7 @@ jle .L65 ALIGN_3 -.L62: +.L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm3, %xmm8 @@ -1638,7 +1638,7 @@ addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 - ALIGN_4 + ALIGN_4 .L70: testq $2, M @@ -1657,7 +1657,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 xorps %xmm3, %xmm3 @@ -1670,7 +1670,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1685,7 +1685,7 @@ jle .L75 ALIGN_3 -.L72: +.L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm3, %xmm8 @@ -1780,7 +1780,7 @@ addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 - ALIGN_4 + ALIGN_4 .L80: testq $1, M @@ -1799,7 +1799,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO -#endif +#endif movsd -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -1811,7 +1811,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1826,7 +1826,7 @@ jle .L85 ALIGN_3 -.L82: +.L82: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x00, %xmm0, %xmm1 @@ -1958,7 +1958,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 8), AO leaq (BO, %rax, 1), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -1974,7 +1974,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1989,7 +1989,7 @@ jle .L95 ALIGN_3 -.L92: +.L92: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x00, %xmm2, %xmm3 @@ -2106,7 +2106,7 @@ decq I BRANCH jg .L91 - ALIGN_4 + ALIGN_4 .L100: testq $4, M @@ -2125,7 +2125,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -2137,7 +2137,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2152,7 +2152,7 @@ jle .L105 ALIGN_3 -.L102: +.L102: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x00, %xmm2, %xmm3 @@ -2241,7 +2241,7 @@ #endif addq $4 * SIZE, CO1 - ALIGN_4 + ALIGN_4 .L110: testq $2, M @@ -2260,7 +2260,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 xorps %xmm3, %xmm3 @@ -2273,7 +2273,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2288,7 +2288,7 @@ jle .L115 ALIGN_3 -.L112: +.L112: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x00, %xmm2, %xmm3 @@ -2374,7 +2374,7 @@ #endif addq $2 * SIZE, CO1 - ALIGN_4 + ALIGN_4 .L120: testq $1, M @@ -2393,7 +2393,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO -#endif +#endif movss -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -2405,7 +2405,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2420,7 +2420,7 @@ jle .L125 ALIGN_3 -.L122: +.L122: mulss %xmm0, %xmm2 movss -31 * SIZE(AO), %xmm0 addss %xmm2, %xmm8 diff --git a/kernel/x86_64/gemm_kernel_8x4_sse.S b/kernel/x86_64/gemm_kernel_8x4_sse.S index 218cb047c..c4ef1f809 100644 --- a/kernel/x86_64/gemm_kernel_8x4_sse.S +++ b/kernel/x86_64/gemm_kernel_8x4_sse.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi @@ -56,7 +56,7 @@ #define CO1 %r15 #define CO2 %rbp #define BB %r12 - + #ifndef WINDOWS_ABI #define STACKSIZE 64 @@ -273,7 +273,7 @@ addps %xmm5, %xmm14 ;\ movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addps %xmm6, %xmm15 ;\ - movaps -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 + movaps -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 #define KERNEL5(xx) \ mulps %xmm0, %xmm1 ;\ @@ -336,7 +336,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -381,7 +381,7 @@ #endif EMMS - + movq %rsp, %rbx # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack @@ -393,13 +393,13 @@ shufps $0, %xmm0, %xmm0 movaps %xmm0, ALPHA - + #ifdef TRMMKERNEL movsd %xmm4, OFFSET movsd %xmm4, KK #ifndef LEFT negq KK -#endif +#endif #endif subq $-32 * SIZE, A @@ -414,11 +414,11 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO - + movd 0 * SIZE(B), %mm0 movq K, %rax @@ -427,7 +427,7 @@ addq %rax, %rax ALIGN_4 - + .L02: PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) @@ -510,7 +510,7 @@ decq %rax jne .L04 ALIGN_4 - + .L10: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc @@ -535,7 +535,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 movaps -32 * SIZE(BO), %xmm1 @@ -565,7 +565,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -937,7 +937,7 @@ addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $4, M @@ -954,7 +954,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 @@ -974,7 +974,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1185,7 +1185,7 @@ addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L30: testq $2, M @@ -1202,7 +1202,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -24 * SIZE(AO), %xmm10 @@ -1222,7 +1222,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1441,7 +1441,7 @@ addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L40: testq $1, M @@ -1459,7 +1459,7 @@ leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO leaq (BO, %rax, 8), BO -#endif +#endif movss -32 * SIZE(AO), %xmm8 movss -28 * SIZE(AO), %xmm10 @@ -1479,7 +1479,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1684,8 +1684,8 @@ #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif - ALIGN_4 - + ALIGN_4 + .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK @@ -1702,16 +1702,16 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO - + movq K, %rax sarq $2, %rax jle .L53 ALIGN_4 - + .L52: #if defined(PENTIUM4) || defined(GENERIC) movss 0 * SIZE(B), %xmm0 @@ -1767,7 +1767,7 @@ punpckldq %mm5, %mm5 punpckldq %mm6, %mm6 punpckldq %mm7, %mm7 - + movq %mm0, 0 * SIZE(BO) movq %mm0, 2 * SIZE(BO) movq %mm1, 4 * SIZE(BO) @@ -1830,7 +1830,7 @@ decq %rax jne .L54 ALIGN_4 - + .L60: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc @@ -1853,7 +1853,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 @@ -1878,7 +1878,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2099,7 +2099,7 @@ addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L61 - ALIGN_4 + ALIGN_4 .L70: testq $4, M @@ -2117,7 +2117,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 @@ -2137,7 +2137,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2283,7 +2283,7 @@ addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L80: testq $2, M @@ -2300,7 +2300,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -24 * SIZE(AO), %xmm10 @@ -2320,7 +2320,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2476,7 +2476,7 @@ addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L90: testq $1, M @@ -2493,7 +2493,7 @@ leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO -#endif +#endif movss -32 * SIZE(AO), %xmm8 movss -28 * SIZE(AO), %xmm10 @@ -2513,7 +2513,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2659,8 +2659,8 @@ #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif - ALIGN_4 - + ALIGN_4 + .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK @@ -2677,16 +2677,16 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO - + movq K, %rax sarq $3, %rax jle .L103 ALIGN_4 - + .L102: #if defined(PENTIUM4) || defined(GENERIC) @@ -2743,7 +2743,7 @@ punpckldq %mm5, %mm5 punpckldq %mm6, %mm6 punpckldq %mm7, %mm7 - + movq %mm0, 0 * SIZE(BO) movq %mm0, 2 * SIZE(BO) movq %mm1, 4 * SIZE(BO) @@ -2795,7 +2795,7 @@ decq %rax jne .L104 ALIGN_4 - + .L110: movq C, CO1 # coffset1 = c movq A, AO # aoffset = a @@ -2817,7 +2817,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 @@ -2841,7 +2841,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2993,7 +2993,7 @@ addq $8 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L111 - ALIGN_4 + ALIGN_4 .L120: testq $4, M @@ -3010,7 +3010,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 @@ -3028,7 +3028,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -3139,7 +3139,7 @@ #endif addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 + ALIGN_4 .L130: testq $2, M @@ -3156,7 +3156,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -24 * SIZE(AO), %xmm10 @@ -3174,7 +3174,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -3289,7 +3289,7 @@ #endif addq $2 * SIZE, CO1 # coffset += 4 - ALIGN_4 + ALIGN_4 .L140: testq $1, M @@ -3306,7 +3306,7 @@ leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif movss -32 * SIZE(AO), %xmm8 movss -28 * SIZE(AO), %xmm10 @@ -3324,7 +3324,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -3411,8 +3411,8 @@ addss %xmm8, %xmm0 #endif movss %xmm0, 0 * SIZE(CO1) - ALIGN_4 - + ALIGN_4 + .L999: movq %rbx, %rsp diff --git a/kernel/x86_64/gemm_kernel_8x4_sse3.S b/kernel/x86_64/gemm_kernel_8x4_sse3.S index c7954fefa..c853e46d1 100644 --- a/kernel/x86_64/gemm_kernel_8x4_sse3.S +++ b/kernel/x86_64/gemm_kernel_8x4_sse3.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M %rdi #define N %rsi #define K %rdx @@ -54,7 +54,7 @@ #define CO1 %r14 #define CO2 %r15 #define BB %rbp - + #ifndef WINDOWS_ABI #define STACKSIZE 64 @@ -328,7 +328,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -381,13 +381,13 @@ shufps $0, %xmm0, %xmm0 movaps %xmm0, ALPHA - + #ifdef TRMMKERNEL movsd %xmm4, OFFSET movsd %xmm4, KK #ifndef LEFT negq KK -#endif +#endif #endif leaq (, LDC, SIZE), LDC @@ -400,16 +400,16 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO - + movq K, %rax sarq $2, %rax jle .L03 ALIGN_4 - + .L02: movddup 0 * SIZE(B), %xmm0 movddup 2 * SIZE(B), %xmm1 @@ -458,7 +458,7 @@ decq %rax jne .L04 ALIGN_4 - + .L10: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc @@ -487,7 +487,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -518,7 +518,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -534,7 +534,7 @@ salq $4, %rax je .L15 -.L1X: +.L1X: KERNEL1 (64 * 0) KERNEL2 (64 * 0) KERNEL3 (64 * 0) @@ -860,7 +860,7 @@ addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $4, M @@ -877,7 +877,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -897,7 +897,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1110,7 +1110,7 @@ addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L30: testq $2, M @@ -1127,7 +1127,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif movddup 0 * SIZE(AO), %xmm8 movddup 8 * SIZE(AO), %xmm10 @@ -1144,7 +1144,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1307,7 +1307,7 @@ addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L40: testq $1, M @@ -1324,7 +1324,7 @@ leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO -#endif +#endif movss 0 * SIZE(AO), %xmm8 movss 4 * SIZE(AO), %xmm10 @@ -1339,7 +1339,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1483,8 +1483,8 @@ #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif - ALIGN_4 - + ALIGN_4 + .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK @@ -1501,16 +1501,16 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO - + movq K, %rax sarq $3, %rax jle .L53 ALIGN_4 - + .L52: movddup 0 * SIZE(B), %xmm0 movddup 2 * SIZE(B), %xmm1 @@ -1556,7 +1556,7 @@ decq %rax jne .L54 ALIGN_4 - + .L60: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc @@ -1579,7 +1579,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -1601,7 +1601,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1823,7 +1823,7 @@ addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L61 - ALIGN_4 + ALIGN_4 .L70: testq $4, M @@ -1840,7 +1840,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movsldup 0 * SIZE(BO), %xmm9 @@ -1857,7 +1857,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2002,7 +2002,7 @@ addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L80: testq $2, M @@ -2019,7 +2019,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO -#endif +#endif movddup 0 * SIZE(AO), %xmm8 movddup 8 * SIZE(AO), %xmm10 @@ -2034,7 +2034,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2152,7 +2152,7 @@ addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L90: testq $1, M @@ -2169,7 +2169,7 @@ leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif movss 0 * SIZE(AO), %xmm8 movss 4 * SIZE(AO), %xmm10 @@ -2184,7 +2184,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2306,8 +2306,8 @@ #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif - ALIGN_4 - + ALIGN_4 + .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK @@ -2323,16 +2323,16 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO - + movq K, %rax sarq $3, %rax jle .L103 ALIGN_4 - + .L102: movss 0 * SIZE(B), %xmm0 @@ -2385,7 +2385,7 @@ decq %rax jne .L104 ALIGN_4 - + .L110: movq C, CO1 # coffset1 = c movq A, AO # aoffset = a @@ -2407,7 +2407,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -2428,7 +2428,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2583,7 +2583,7 @@ addq $8 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L111 - ALIGN_4 + ALIGN_4 .L120: testq $4, M @@ -2601,7 +2601,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -2622,7 +2622,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2731,7 +2731,7 @@ #endif addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 + ALIGN_4 .L130: testq $2, M @@ -2748,7 +2748,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 0 * SIZE(BO), %xmm9 @@ -2765,7 +2765,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2870,7 +2870,7 @@ #endif addq $2 * SIZE, CO1 # coffset += 4 - ALIGN_4 + ALIGN_4 .L140: testq $1, M @@ -2887,7 +2887,7 @@ leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO -#endif +#endif movss 0 * SIZE(AO), %xmm8 movss 4 * SIZE(AO), %xmm10 @@ -2904,7 +2904,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2990,8 +2990,8 @@ addss %xmm8, %xmm0 #endif movss %xmm0, 0 * SIZE(CO1) - ALIGN_4 - + ALIGN_4 + .L999: movq %rbx, %rsp movq 0(%rsp), %rbx diff --git a/kernel/x86_64/gemm_ncopy_2.S b/kernel/x86_64/gemm_ncopy_2.S index 06a0feae9..b069f9cf7 100644 --- a/kernel/x86_64/gemm_ncopy_2.S +++ b/kernel/x86_64/gemm_ncopy_2.S @@ -86,7 +86,7 @@ PROLOGUE PROFCODE - + #ifdef WINDOWS_ABI pushq %r15 pushq %r14 diff --git a/kernel/x86_64/gemm_ncopy_2_bulldozer.S b/kernel/x86_64/gemm_ncopy_2_bulldozer.S index 02d72f009..1911d3c74 100644 --- a/kernel/x86_64/gemm_ncopy_2_bulldozer.S +++ b/kernel/x86_64/gemm_ncopy_2_bulldozer.S @@ -73,7 +73,7 @@ PROLOGUE PROFCODE - + #ifdef WINDOWS_ABI pushq %r15 pushq %r14 diff --git a/kernel/x86_64/gemm_ncopy_4.S b/kernel/x86_64/gemm_ncopy_4.S index cac647fa0..7192cecc2 100644 --- a/kernel/x86_64/gemm_ncopy_4.S +++ b/kernel/x86_64/gemm_ncopy_4.S @@ -114,7 +114,7 @@ PROLOGUE PROFCODE - + #ifdef WINDOWS_ABI pushq %r15 pushq %r14 diff --git a/kernel/x86_64/gemm_ncopy_4_opteron.S b/kernel/x86_64/gemm_ncopy_4_opteron.S index e5cbd62eb..ea39f8936 100644 --- a/kernel/x86_64/gemm_ncopy_4_opteron.S +++ b/kernel/x86_64/gemm_ncopy_4_opteron.S @@ -87,7 +87,7 @@ PROLOGUE PROFCODE - + #ifdef WINDOWS_ABI pushq %r15 pushq %r14 @@ -361,7 +361,7 @@ .L999: EMMS - + #ifdef WINDOWS_ABI movups 0(%rsp), %xmm6 movups 16(%rsp), %xmm7 diff --git a/kernel/x86_64/gemm_tcopy_2.S b/kernel/x86_64/gemm_tcopy_2.S index 190cebb29..f35427b79 100644 --- a/kernel/x86_64/gemm_tcopy_2.S +++ b/kernel/x86_64/gemm_tcopy_2.S @@ -100,7 +100,7 @@ PROLOGUE PROFCODE - + #ifdef WINDOWS_ABI pushq %rdi pushq %rsi @@ -174,7 +174,7 @@ movapd %xmm0, 0 * SIZE(BO) movapd %xmm1, 2 * SIZE(BO) #endif - + leaq (BO, M8, 2), BO addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 diff --git a/kernel/x86_64/gemm_tcopy_2_bulldozer.S b/kernel/x86_64/gemm_tcopy_2_bulldozer.S index b8d61b0ae..d7552042e 100644 --- a/kernel/x86_64/gemm_tcopy_2_bulldozer.S +++ b/kernel/x86_64/gemm_tcopy_2_bulldozer.S @@ -86,7 +86,7 @@ PROLOGUE PROFCODE - + #ifdef WINDOWS_ABI pushq %rdi pushq %rsi @@ -202,7 +202,7 @@ leaq (BO, M8, 2), BO #endif - + addq $8 * SIZE, AO1 addq $8 * SIZE, AO2 decq I @@ -246,7 +246,7 @@ leaq (BO, M8, 2), BO #endif - + addq $4 * SIZE, AO1 addq $4 * SIZE, AO2 ALIGN_4 @@ -257,7 +257,7 @@ jle .L14 #ifndef DOUBLE vmovsd 0 * SIZE(AO1), %xmm0 - vmovsd 0 * SIZE(AO2), %xmm1 + vmovsd 0 * SIZE(AO2), %xmm1 vmovsd %xmm0, 0 * SIZE(BO) vmovsd %xmm1, 2 * SIZE(BO) @@ -268,7 +268,7 @@ vmovups %xmm0, 0 * SIZE(BO) vmovups %xmm1, 2 * SIZE(BO) #endif - + leaq (BO, M8, 2), BO addq $2 * SIZE, AO1 addq $2 * SIZE, AO2 diff --git a/kernel/x86_64/gemm_tcopy_4.S b/kernel/x86_64/gemm_tcopy_4.S index c2308162f..ba7714b4b 100644 --- a/kernel/x86_64/gemm_tcopy_4.S +++ b/kernel/x86_64/gemm_tcopy_4.S @@ -130,7 +130,7 @@ PROLOGUE PROFCODE - + #ifdef WINDOWS_ABI pushq %rdi pushq %rsi diff --git a/kernel/x86_64/gemm_tcopy_4_opteron.S b/kernel/x86_64/gemm_tcopy_4_opteron.S index 105fe3b47..e8207ace3 100644 --- a/kernel/x86_64/gemm_tcopy_4_opteron.S +++ b/kernel/x86_64/gemm_tcopy_4_opteron.S @@ -104,7 +104,7 @@ PROLOGUE PROFCODE - + #ifdef WINDOWS_ABI pushq %rdi pushq %rsi diff --git a/kernel/x86_64/iamax.S b/kernel/x86_64/iamax.S index 27637c53d..79e1bae1d 100644 --- a/kernel/x86_64/iamax.S +++ b/kernel/x86_64/iamax.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M ARG1 #define X ARG2 #define INCX ARG3 @@ -76,7 +76,7 @@ FLD (X) #ifdef USE_ABS - fabs + fabs #endif addq INCX, X decq M @@ -89,7 +89,7 @@ sarq $3, I jle .L20 ALIGN_4 - + .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -97,7 +97,7 @@ FLD 0 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -108,7 +108,7 @@ FLD 1 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -119,7 +119,7 @@ FLD 2 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -130,7 +130,7 @@ FLD 3 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -141,7 +141,7 @@ FLD 4 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -152,7 +152,7 @@ FLD 5 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -163,7 +163,7 @@ FLD 6 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -174,7 +174,7 @@ FLD 7 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -199,7 +199,7 @@ .L21: FLD 0 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -219,12 +219,12 @@ sarq $3, I jle .L60 ALIGN_4 - + .L50: FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -236,7 +236,7 @@ FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -248,7 +248,7 @@ FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -260,7 +260,7 @@ FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -272,7 +272,7 @@ FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -284,7 +284,7 @@ FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -296,7 +296,7 @@ FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -308,7 +308,7 @@ FLD 0 * SIZE(X) addq INCX, X #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) @@ -331,7 +331,7 @@ .L61: FLD 0 * SIZE(X) #ifdef USE_ABS - fabs + fabs #endif fcomi FMOV %st(1), %st(0) diff --git a/kernel/x86_64/iamax_sse.S b/kernel/x86_64/iamax_sse.S index 8b7de07f2..f22e34a1d 100644 --- a/kernel/x86_64/iamax_sse.S +++ b/kernel/x86_64/iamax_sse.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ @@ -47,12 +47,12 @@ #define I ARG4 #define XX %r10 #define MM %r11 - + #ifdef USE_MIN #define maxps minps #define maxss minss #endif - + #include "l1param.h" PROLOGUE @@ -127,7 +127,7 @@ sarq $4, I jle .L15 ALIGN_4 - + .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -193,7 +193,7 @@ #endif maxps %xmm6, %xmm2 addq $4 * SIZE, X - ALIGN_3 + ALIGN_3 .L17: testq $2, M @@ -206,7 +206,7 @@ #endif maxps %xmm7, %xmm3 addq $2 * SIZE, X - + .L18: testq $1, M je .L20 @@ -276,7 +276,7 @@ sarq $3, I jle .L25 ALIGN_4 - + .L23: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -384,7 +384,7 @@ incq RET comiss %xmm0, %xmm4 je .L999 - ALIGN_3 + ALIGN_3 .L26: testq $2, M @@ -404,7 +404,7 @@ comiss %xmm0, %xmm2 je .L999 ALIGN_3 - + .L27: incq RET jmp .L999 @@ -416,7 +416,7 @@ sarq $4, I jle .L35 ALIGN_4 - + .L31: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -490,7 +490,7 @@ #endif maxps %xmm6, %xmm2 addq $4 * SIZE, X - ALIGN_3 + ALIGN_3 .L37: testq $2, M @@ -503,7 +503,7 @@ #endif maxps %xmm7, %xmm3 addq $2 * SIZE, X - + .L38: testq $1, M je .L40 @@ -535,7 +535,7 @@ sarq $3, I jle .L45 ALIGN_4 - + .L43: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -645,7 +645,7 @@ incq RET comiss %xmm0, %xmm4 je .L999 - ALIGN_3 + ALIGN_3 .L46: testq $2, M @@ -665,7 +665,7 @@ comiss %xmm0, %xmm2 je .L999 ALIGN_3 - + .L47: incq RET jmp .L999 @@ -676,7 +676,7 @@ sarq $3, I jle .L85 ALIGN_4 - + .L81: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -776,7 +776,7 @@ andps %xmm15, %xmm7 #endif maxss %xmm7, %xmm3 - ALIGN_3 + ALIGN_3 .L86: testq $2, M @@ -796,7 +796,7 @@ #endif maxss %xmm5, %xmm1 ALIGN_3 - + .L87: testq $1, M je .L90 @@ -822,7 +822,7 @@ sarq $3, I jle .L95 ALIGN_4 - + .L93: movss 0 * SIZE(X), %xmm1 addq INCX, X @@ -985,7 +985,7 @@ incq RET comiss %xmm0, %xmm4 je .L999 - ALIGN_3 + ALIGN_3 .L96: testq $2, M @@ -1006,7 +1006,7 @@ comiss %xmm0, %xmm2 je .L999 ALIGN_3 - + .L97: incq RET ALIGN_3 diff --git a/kernel/x86_64/iamax_sse2.S b/kernel/x86_64/iamax_sse2.S index c17a81ab9..6808f191b 100644 --- a/kernel/x86_64/iamax_sse2.S +++ b/kernel/x86_64/iamax_sse2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ @@ -47,14 +47,14 @@ #define I ARG4 #define XX %r10 #define MM %r11 - + #ifdef USE_MIN #define maxpd minpd #define maxsd minsd #endif #include "l1param.h" - + PROLOGUE PROFCODE @@ -114,7 +114,7 @@ sarq $4, I jle .L15 ALIGN_4 - + .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -226,7 +226,7 @@ #endif maxpd %xmm5, %xmm1 addq $4 * SIZE, X - ALIGN_3 + ALIGN_3 .L17: testq $2, M @@ -238,7 +238,7 @@ #endif maxpd %xmm6, %xmm2 addq $2 * SIZE, X - + .L18: testq $1, M je .L20 @@ -284,7 +284,7 @@ sarq $3, I jle .L25 ALIGN_4 - + .L22: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -407,7 +407,7 @@ incq RET comisd %xmm0, %xmm4 je .L999 - ALIGN_3 + ALIGN_3 .L27: testq $2, M @@ -427,7 +427,7 @@ comisd %xmm0, %xmm2 je .L999 ALIGN_3 - + .L28: incq RET jmp .L999 @@ -566,7 +566,7 @@ #endif maxpd %xmm5, %xmm1 addq $4 * SIZE, X - ALIGN_3 + ALIGN_3 .L57: testq $2, M @@ -579,7 +579,7 @@ #endif maxpd %xmm6, %xmm2 addq $2 * SIZE, X - + .L58: testq $1, M je .L60 @@ -608,7 +608,7 @@ sarq $3, I jle .L65 ALIGN_4 - + .L62: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -735,7 +735,7 @@ incq RET comisd %xmm0, %xmm4 je .L999 - ALIGN_3 + ALIGN_3 .L67: testq $2, M @@ -755,7 +755,7 @@ comisd %xmm0, %xmm2 je .L999 ALIGN_3 - + .L68: incq RET jmp .L999 @@ -766,7 +766,7 @@ sarq $4, I jle .L85 ALIGN_4 - + .L81: movsd 0 * SIZE(X), %xmm4 addq INCX, X @@ -909,7 +909,7 @@ andpd %xmm15, %xmm5 #endif maxpd %xmm5, %xmm1 - ALIGN_3 + ALIGN_3 .L87: testq $2, M @@ -924,7 +924,7 @@ #endif maxpd %xmm6, %xmm2 ALIGN_3 - + .L88: testq $1, M je .L90 @@ -960,7 +960,7 @@ sarq $3, I jle .L95 ALIGN_4 - + .L92: movsd 0 * SIZE(X), %xmm1 addq INCX, X @@ -1101,7 +1101,7 @@ incq RET comisd %xmm0, %xmm4 je .L999 - ALIGN_3 + ALIGN_3 .L97: testq $2, M @@ -1122,7 +1122,7 @@ comisd %xmm0, %xmm2 je .L999 ALIGN_3 - + .L98: incq RET ALIGN_3 diff --git a/kernel/x86_64/izamax.S b/kernel/x86_64/izamax.S index a77b06df9..c066acd62 100644 --- a/kernel/x86_64/izamax.S +++ b/kernel/x86_64/izamax.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M ARG1 #define X ARG2 #define INCX ARG3 @@ -75,9 +75,9 @@ movq $1, RET FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs faddp %st, %st(1) addq INCX, X decq M @@ -90,16 +90,16 @@ sarq $2, I jle .L20 ALIGN_4 - + .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs faddp %st, %st(1) fcomi FMOV %st(1), %st(0) @@ -109,9 +109,9 @@ incq NUM FLD 2 * SIZE(X) - fabs + fabs FLD 3 * SIZE(X) - fabs + fabs faddp %st, %st(1) fcomi FMOV %st(1), %st(0) @@ -121,9 +121,9 @@ incq NUM FLD 4 * SIZE(X) - fabs + fabs FLD 5 * SIZE(X) - fabs + fabs faddp %st, %st(1) fcomi FMOV %st(1), %st(0) @@ -133,9 +133,9 @@ incq NUM FLD 6 * SIZE(X) - fabs + fabs FLD 7 * SIZE(X) - fabs + fabs faddp %st, %st(1) fcomi FMOV %st(1), %st(0) @@ -158,9 +158,9 @@ .L21: FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs faddp %st, %st(1) fcomi FMOV %st(1), %st(0) @@ -180,12 +180,12 @@ sarq $2, I jle .L60 ALIGN_4 - + .L50: FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs addq INCX, X faddp %st, %st(1) fcomi @@ -196,9 +196,9 @@ incq NUM FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs addq INCX, X faddp %st, %st(1) fcomi @@ -209,9 +209,9 @@ incq NUM FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs addq INCX, X faddp %st, %st(1) fcomi @@ -222,9 +222,9 @@ incq NUM FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs addq INCX, X faddp %st, %st(1) fcomi @@ -247,9 +247,9 @@ .L61: FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs faddp %st, %st(1) fcomi FMOV %st(1), %st(0) diff --git a/kernel/x86_64/izamax_sse.S b/kernel/x86_64/izamax_sse.S index 2dfeb93ea..e273b8cc6 100644 --- a/kernel/x86_64/izamax_sse.S +++ b/kernel/x86_64/izamax_sse.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ @@ -47,12 +47,12 @@ #define I ARG4 #define XX %r10 #define MM %r11 - + #ifdef USE_MIN #define maxps minps #define maxss minss #endif - + #include "l1param.h" PROLOGUE @@ -91,7 +91,7 @@ sarq $3, I jle .L35 ALIGN_4 - + .L31: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -172,7 +172,7 @@ maxss %xmm4, %xmm0 maxss %xmm6, %xmm1 addq $4 * SIZE, X - ALIGN_3 + ALIGN_3 .L37: testq $1, M @@ -203,7 +203,7 @@ sarq $2, I jle .L45 ALIGN_4 - + .L41: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -301,7 +301,7 @@ incq RET comiss %xmm0, %xmm3 je .L999 - ALIGN_3 + ALIGN_3 .L47: incq RET @@ -313,7 +313,7 @@ sarq $3, I jle .L75 ALIGN_4 - + .L71: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -384,7 +384,7 @@ andps %xmm15, %xmm6 addps %xmm6, %xmm4 maxps %xmm4, %xmm0 - ALIGN_3 + ALIGN_3 .L76: testq $2, M @@ -405,7 +405,7 @@ maxss %xmm4, %xmm0 maxss %xmm6, %xmm1 ALIGN_3 - + .L77: testq $1, M je .L80 @@ -435,7 +435,7 @@ sarq $2, I jle .L85 ALIGN_4 - + .L81: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -540,7 +540,7 @@ incq RET comiss %xmm0, %xmm3 je .L999 - ALIGN_3 + ALIGN_3 .L87: incq RET diff --git a/kernel/x86_64/izamax_sse2.S b/kernel/x86_64/izamax_sse2.S index 404608256..c656a652d 100644 --- a/kernel/x86_64/izamax_sse2.S +++ b/kernel/x86_64/izamax_sse2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ @@ -47,7 +47,7 @@ #define I ARG4 #define XX %r10 #define MM %r11 - + #ifdef USE_MIN #define maxpd minpd #define maxsd minsd @@ -188,7 +188,7 @@ andpd %xmm15, %xmm5 addpd %xmm5, %xmm4 maxpd %xmm4, %xmm0 - ALIGN_3 + ALIGN_3 .L27: testq $1, M @@ -323,7 +323,7 @@ incq RET comisd %xmm0, %xmm3 je .L999 - ALIGN_3 + ALIGN_3 .L36: incq RET @@ -335,7 +335,7 @@ sarq $3, I jle .L65 ALIGN_4 - + .L61: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -437,7 +437,7 @@ andpd %xmm15, %xmm5 addpd %xmm5, %xmm4 maxpd %xmm4, %xmm2 - ALIGN_3 + ALIGN_3 .L67: testq $1, M @@ -583,7 +583,7 @@ incq RET comisd %xmm0, %xmm3 je .L999 - ALIGN_3 + ALIGN_3 .L76: incq RET diff --git a/kernel/x86_64/nrm2.S b/kernel/x86_64/nrm2.S index d375e8e60..e9be1262a 100644 --- a/kernel/x86_64/nrm2.S +++ b/kernel/x86_64/nrm2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ @@ -49,7 +49,7 @@ PROLOGUE PROFCODE - + fldz testq M, M jle .L999 @@ -68,7 +68,7 @@ sarq $3, I jle .L20 ALIGN_4 - + .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -129,7 +129,7 @@ sarq $3, I jle .L60 ALIGN_4 - + .L50: FLD (X) addq INCX, X diff --git a/kernel/x86_64/nrm2_sse.S b/kernel/x86_64/nrm2_sse.S index 37762abcb..33b1ee496 100644 --- a/kernel/x86_64/nrm2_sse.S +++ b/kernel/x86_64/nrm2_sse.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ @@ -49,7 +49,7 @@ PROLOGUE PROFCODE - + SAVEREGISTERS pxor %xmm0, %xmm0 @@ -67,7 +67,7 @@ testq $SIZE, X je .L05 - + movss 0 * SIZE(X), %xmm4 cvtss2sd %xmm4, %xmm6 mulsd %xmm6, %xmm6 @@ -81,7 +81,7 @@ movq M, I sarq $3, I jle .L14 - + movsd 0 * SIZE(X), %xmm4 movsd 2 * SIZE(X), %xmm5 movsd 4 * SIZE(X), %xmm6 @@ -181,7 +181,7 @@ sarq $3, I jle .L44 ALIGN_4 - + .L41: movss (X), %xmm4 addq INCX, X diff --git a/kernel/x86_64/qdot.S b/kernel/x86_64/qdot.S index c958fc57d..a48a04fdd 100644 --- a/kernel/x86_64/qdot.S +++ b/kernel/x86_64/qdot.S @@ -41,7 +41,7 @@ #define STACK 12 #define ARGS 0 - + #define STACK_N 4 + STACK + ARGS(%esp) #define STACK_X 8 + STACK + ARGS(%esp) #define STACK_INCX 12 + STACK + ARGS(%esp) diff --git a/kernel/x86_64/qgemm_kernel_2x2.S b/kernel/x86_64/qgemm_kernel_2x2.S index 9db145b9f..99db3961f 100644 --- a/kernel/x86_64/qgemm_kernel_2x2.S +++ b/kernel/x86_64/qgemm_kernel_2x2.S @@ -46,7 +46,7 @@ #define B ARG5 #define C ARG6 #define LDC %r10 - + #define I %r12 #define J %r13 #define AO %r14 @@ -73,7 +73,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) @@ -89,10 +89,10 @@ negq %rax movq %rax, KK #endif - + addq $8 * SIZE, A addq $8 * SIZE, B - + salq $BASE_SHIFT, LDC movq N, %rax @@ -105,7 +105,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq A, AO @@ -128,7 +128,7 @@ salq $BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO -#endif +#endif fldz fldz @@ -148,7 +148,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -174,7 +174,7 @@ FLD -7 * SIZE(BO) fmul %st, %st(2) - + FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -192,7 +192,7 @@ FLD -5 * SIZE(BO) fmul %st, %st(2) - + FLD -5 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -212,7 +212,7 @@ FLD -3 * SIZE(BO) fmul %st, %st(2) - + FLD -3 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -230,7 +230,7 @@ FLD -1 * SIZE(BO) fmul %st, %st(2) - + FLD -1 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -266,7 +266,7 @@ FLD -7 * SIZE(BO) fmul %st, %st(2) - + FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -347,7 +347,7 @@ salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq ( B, %rax, 2), BO -#endif +#endif fldz fldz @@ -357,7 +357,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -496,13 +496,13 @@ .L30: movq N, %rax - testq $1, %rax + testq $1, %rax je .L999 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq A, AO @@ -524,7 +524,7 @@ salq $BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq ( B, %rax, 1), BO -#endif +#endif fldz fldz @@ -540,7 +540,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -681,7 +681,7 @@ salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq ( B, %rax, 1), BO -#endif +#endif fldz @@ -690,7 +690,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT diff --git a/kernel/x86_64/qgemv_n.S b/kernel/x86_64/qgemv_n.S index 28415ecb1..630d03ffb 100644 --- a/kernel/x86_64/qgemv_n.S +++ b/kernel/x86_64/qgemv_n.S @@ -41,9 +41,9 @@ #include "l2param.h" #define P 32 - + #define STACKSIZE 80 - + #define ALPHA 8 + STACKSIZE(%rsp) #define OLD_INCX 24 + STACKSIZE(%rsp) #define OLD_Y 32 + STACKSIZE(%rsp) @@ -71,7 +71,7 @@ #define XP %r15 /* #define BUFFER %r15 */ #define MIN_N %rbx - + PROLOGUE PROFCODE @@ -175,7 +175,7 @@ ALIGN_2 .L48: - movq A, A1 # a_offset = a + movq A, A1 # a_offset = a fldz addq $4 * SIZE, A # a += 4 fldz @@ -239,7 +239,7 @@ FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(2) # ct1 += at1 - + FLD 1 * SIZE(A1) # at1 = *(a_offset + 1) fmul %st(1), %st # at1 *= bt1 faddp %st, %st(3) # ct2 += at1 diff --git a/kernel/x86_64/qgemv_t.S b/kernel/x86_64/qgemv_t.S index 9402f21a9..d7c9cd2a5 100644 --- a/kernel/x86_64/qgemv_t.S +++ b/kernel/x86_64/qgemv_t.S @@ -42,7 +42,7 @@ #define STACKSIZE 80 #define P 4096 - + #define ALPHA 8 + STACKSIZE(%rsp) #define OLD_INCX 24 + STACKSIZE(%rsp) #define OLD_Y 32 + STACKSIZE(%rsp) @@ -70,7 +70,7 @@ #define X1 %r13 #define Y1 %r14 #define MIN_M %rbx - + PROLOGUE PROFCODE diff --git a/kernel/x86_64/qtrsm_kernel_LN_2x2.S b/kernel/x86_64/qtrsm_kernel_LN_2x2.S index 7093ebae5..536042e65 100644 --- a/kernel/x86_64/qtrsm_kernel_LN_2x2.S +++ b/kernel/x86_64/qtrsm_kernel_LN_2x2.S @@ -46,7 +46,7 @@ #define B ARG5 #define C ARG6 #define LDC %r10 - + #define I %r12 #define J %r13 #define AO %r14 @@ -73,7 +73,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) @@ -89,10 +89,10 @@ negq %rax movq %rax, KK #endif - + addq $8 * SIZE, A addq $8 * SIZE, B - + salq $BASE_SHIFT, LDC #ifdef LN @@ -118,7 +118,7 @@ movq OFFSET, %rax negq %rax movq %rax, KK -#endif +#endif #ifdef RT movq N, %rax @@ -160,7 +160,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -187,7 +187,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif fldz fldz @@ -410,7 +410,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif fldz fldz @@ -447,7 +447,7 @@ FLD -7 * SIZE(BO) fmul %st, %st(2) - + FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -465,7 +465,7 @@ FLD -5 * SIZE(BO) fmul %st, %st(2) - + FLD -5 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -485,7 +485,7 @@ FLD -3 * SIZE(BO) fmul %st, %st(2) - + FLD -3 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -503,7 +503,7 @@ FLD -1 * SIZE(BO) fmul %st, %st(2) - + FLD -1 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -540,7 +540,7 @@ FLD -7 * SIZE(BO) fmul %st, %st(2) - + FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -759,7 +759,7 @@ .L30: movq N, %rax - testq $1, %rax + testq $1, %rax je .L999 #if defined(LT) || defined(RN) @@ -787,7 +787,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -814,7 +814,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif fldz @@ -989,7 +989,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif fldz fldz diff --git a/kernel/x86_64/qtrsm_kernel_LT_2x2.S b/kernel/x86_64/qtrsm_kernel_LT_2x2.S index d2a05a11e..6e94976c5 100644 --- a/kernel/x86_64/qtrsm_kernel_LT_2x2.S +++ b/kernel/x86_64/qtrsm_kernel_LT_2x2.S @@ -46,7 +46,7 @@ #define B ARG5 #define C ARG6 #define LDC %r10 - + #define I %r12 #define J %r13 #define AO %r14 @@ -73,7 +73,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) @@ -89,10 +89,10 @@ negq %rax movq %rax, KK #endif - + addq $8 * SIZE, A addq $8 * SIZE, B - + salq $BASE_SHIFT, LDC #ifdef LN @@ -118,7 +118,7 @@ movq OFFSET, %rax negq %rax movq %rax, KK -#endif +#endif #ifdef RT movq N, %rax @@ -160,7 +160,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -187,7 +187,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif fldz fldz @@ -224,7 +224,7 @@ FLD -7 * SIZE(BO) fmul %st, %st(2) - + FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -242,7 +242,7 @@ FLD -5 * SIZE(BO) fmul %st, %st(2) - + FLD -5 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -262,7 +262,7 @@ FLD -3 * SIZE(BO) fmul %st, %st(2) - + FLD -3 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -280,7 +280,7 @@ FLD -1 * SIZE(BO) fmul %st, %st(2) - + FLD -1 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -317,7 +317,7 @@ FLD -7 * SIZE(BO) fmul %st, %st(2) - + FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -532,7 +532,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif fldz fldz @@ -759,7 +759,7 @@ .L30: movq N, %rax - testq $1, %rax + testq $1, %rax je .L999 #if defined(LT) || defined(RN) @@ -787,7 +787,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -814,7 +814,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif fldz fldz @@ -1047,7 +1047,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif fldz diff --git a/kernel/x86_64/qtrsm_kernel_RT_2x2.S b/kernel/x86_64/qtrsm_kernel_RT_2x2.S index 288aa0778..caa7de14a 100644 --- a/kernel/x86_64/qtrsm_kernel_RT_2x2.S +++ b/kernel/x86_64/qtrsm_kernel_RT_2x2.S @@ -46,7 +46,7 @@ #define B ARG5 #define C ARG6 #define LDC %r10 - + #define I %r12 #define J %r13 #define AO %r14 @@ -73,7 +73,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) @@ -89,10 +89,10 @@ negq %rax movq %rax, KK #endif - + addq $8 * SIZE, A addq $8 * SIZE, B - + salq $BASE_SHIFT, LDC #ifdef LN @@ -118,7 +118,7 @@ movq OFFSET, %rax negq %rax movq %rax, KK -#endif +#endif #ifdef RT movq N, %rax @@ -127,7 +127,7 @@ #endif movq N, %rax - testq $1, %rax + testq $1, %rax je .L30 #if defined(LT) || defined(RN) @@ -155,7 +155,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -182,7 +182,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif fldz fldz @@ -415,7 +415,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif fldz @@ -624,7 +624,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -651,7 +651,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif fldz fldz @@ -688,7 +688,7 @@ FLD -7 * SIZE(BO) fmul %st, %st(2) - + FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -706,7 +706,7 @@ FLD -5 * SIZE(BO) fmul %st, %st(2) - + FLD -5 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -726,7 +726,7 @@ FLD -3 * SIZE(BO) fmul %st, %st(2) - + FLD -3 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -744,7 +744,7 @@ FLD -1 * SIZE(BO) fmul %st, %st(2) - + FLD -1 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -781,7 +781,7 @@ FLD -7 * SIZE(BO) fmul %st, %st(2) - + FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -996,7 +996,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif fldz fldz diff --git a/kernel/x86_64/rot.S b/kernel/x86_64/rot.S index 05e5aebb3..6b2ad7fd8 100644 --- a/kernel/x86_64/rot.S +++ b/kernel/x86_64/rot.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define N ARG1 #define X ARG2 #define INCX ARG3 @@ -80,7 +80,7 @@ sarq $2, I jle .L15 ALIGN_4 - + .L10: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -212,7 +212,7 @@ sarq $2, I jle .L55 ALIGN_4 - + .L51: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) diff --git a/kernel/x86_64/rot_sse.S b/kernel/x86_64/rot_sse.S index cb7e1b317..6e37292ed 100644 --- a/kernel/x86_64/rot_sse.S +++ b/kernel/x86_64/rot_sse.S @@ -65,8 +65,8 @@ SAVEREGISTERS - leaq (, INCX, SIZE), INCX - leaq (, INCY, SIZE), INCY + leaq (, INCX, SIZE), INCX + leaq (, INCY, SIZE), INCY pshufd $0x0, %xmm0, C pshufd $0x0, %xmm1, S @@ -153,7 +153,7 @@ movaps 4 * SIZE(X), %xmm2 movaps 8 * SIZE(X), %xmm8 movaps 12 * SIZE(X), %xmm10 - + decq %rax jle .L12 ALIGN_3 diff --git a/kernel/x86_64/rot_sse2.S b/kernel/x86_64/rot_sse2.S index 502940324..aa5852cac 100644 --- a/kernel/x86_64/rot_sse2.S +++ b/kernel/x86_64/rot_sse2.S @@ -65,8 +65,8 @@ SAVEREGISTERS - leaq (, INCX, SIZE), INCX - leaq (, INCY, SIZE), INCY + leaq (, INCX, SIZE), INCX + leaq (, INCY, SIZE), INCY pshufd $0x44, %xmm0, C pshufd $0x44, %xmm1, S @@ -123,7 +123,7 @@ movaps 2 * SIZE(X), %xmm2 movaps 4 * SIZE(X), %xmm8 movaps 6 * SIZE(X), %xmm10 - + decq %rax jle .L12 ALIGN_3 diff --git a/kernel/x86_64/scal_atom.S b/kernel/x86_64/scal_atom.S index ecc687c02..11350ea19 100644 --- a/kernel/x86_64/scal_atom.S +++ b/kernel/x86_64/scal_atom.S @@ -61,11 +61,11 @@ movq 40(%rsp), X movq 48(%rsp), INCX - movaps %xmm3, %xmm0 + movaps %xmm3, %xmm0 #endif SAVEREGISTERS - + testq M, M jle .L999 @@ -218,7 +218,7 @@ mulsd %xmm0, %xmm3 mulsd %xmm0, %xmm4 - decq I + decq I jle .L112 ALIGN_4 diff --git a/kernel/x86_64/scal_sse.S b/kernel/x86_64/scal_sse.S index 9c8dd9dc2..b92688d9e 100644 --- a/kernel/x86_64/scal_sse.S +++ b/kernel/x86_64/scal_sse.S @@ -61,11 +61,11 @@ movq 40(%rsp), X movq 48(%rsp), INCX - movaps %xmm3, %xmm0 + movaps %xmm3, %xmm0 #endif SAVEREGISTERS - + testq M, M jle .L999 @@ -285,7 +285,7 @@ movaps %xmm0, %xmm8 mulps -4 * SIZE(X), %xmm8 - decq I + decq I jle .L112 ALIGN_4 @@ -341,9 +341,9 @@ movaps %xmm6, -12 * SIZE(X) movaps %xmm7, -8 * SIZE(X) movaps %xmm8, -4 * SIZE(X) - + #else - + movaps -32 * SIZE(X), %xmm1 movaps -28 * SIZE(X), %xmm2 movaps -24 * SIZE(X), %xmm3 @@ -352,7 +352,7 @@ movaps -12 * SIZE(X), %xmm6 movaps -8 * SIZE(X), %xmm7 movaps -4 * SIZE(X), %xmm8 - decq I + decq I jle .L112 ALIGN_4 diff --git a/kernel/x86_64/scal_sse2.S b/kernel/x86_64/scal_sse2.S index 3823b1fc9..20dd7fa2d 100644 --- a/kernel/x86_64/scal_sse2.S +++ b/kernel/x86_64/scal_sse2.S @@ -61,11 +61,11 @@ movq 40(%rsp), X movq 48(%rsp), INCX - movaps %xmm3, %xmm0 + movaps %xmm3, %xmm0 #endif SAVEREGISTERS - + testq M, M jle .L999 @@ -75,7 +75,7 @@ comisd %xmm0, %xmm1 jne .L100 # Alpha != ZERO jp .L100 # For Alpha = NaN - + /* Alpha == ZERO */ cmpq $SIZE, INCX jne .L50 @@ -270,7 +270,7 @@ movaps %xmm0, %xmm8 mulpd -2 * SIZE(X), %xmm8 - decq I + decq I jle .L112 ALIGN_4 @@ -336,7 +336,7 @@ movaps -4 * SIZE(X), %xmm7 movaps -2 * SIZE(X), %xmm8 - decq I + decq I jle .L112 ALIGN_4 diff --git a/kernel/x86_64/sgemm_kernel_8x4_bulldozer.S b/kernel/x86_64/sgemm_kernel_8x4_bulldozer.S index 268d3ae7e..2194fd6c1 100644 --- a/kernel/x86_64/sgemm_kernel_8x4_bulldozer.S +++ b/kernel/x86_64/sgemm_kernel_8x4_bulldozer.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -49,13 +49,13 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define CO2 %r12 - + #ifndef WINDOWS_ABI #define STACKSIZE 64 @@ -226,7 +226,7 @@ vmovups 100 * SIZE(BO, %rax, 8), %xmm3 ;\ vmovaps %xmm0, %xmm2 ;\ addq $16 * SIZE, %rax - + #define KERNEL_SUB1(xx) \ mulps %xmm1, %xmm0 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ @@ -334,7 +334,7 @@ #endif PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -389,13 +389,13 @@ shufps $0, %xmm0, %xmm0 movaps %xmm0, ALPHA - + #ifdef TRMMKERNEL movsd %xmm12, OFFSET movsd %xmm12, KK #ifndef LEFT negq KK -#endif +#endif #endif subq $-32 * SIZE, A @@ -410,22 +410,22 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO - + movq K, %rax sarq $2, %rax jle .L03 ALIGN_4 - + .L02: - prefetcht0 192(B) - prefetcht0 256(B) - prefetcht0 192(BO) - prefetcht0 256(BO) + prefetcht0 192(B) + prefetcht0 256(B) + prefetcht0 192(BO) + prefetcht0 256(BO) movaps 0 * SIZE(B), %xmm3 movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 @@ -506,7 +506,7 @@ decq %rax jne .L04 ALIGN_4 - + .L10: movq C, CO1 leaq (C, LDC, 1), CO2 @@ -530,7 +530,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 movaps -32 * SIZE(BO), %xmm1 @@ -558,7 +558,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -745,7 +745,7 @@ vmulps %xmm7, %xmm8, %xmm8 vmulps %xmm7, %xmm9, %xmm9 - vmulps %xmm7, %xmm10, %xmm10 + vmulps %xmm7, %xmm10, %xmm10 vmulps %xmm7, %xmm11, %xmm11 vmulps %xmm7, %xmm12,%xmm12 @@ -786,7 +786,7 @@ addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $4, M @@ -803,7 +803,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 @@ -823,7 +823,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1024,7 +1024,7 @@ addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L30: testq $2, M @@ -1041,7 +1041,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -24 * SIZE(AO), %xmm10 @@ -1061,7 +1061,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1265,7 +1265,7 @@ addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L40: testq $1, M @@ -1283,7 +1283,7 @@ leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO leaq (BO, %rax, 8), BO -#endif +#endif movss -32 * SIZE(AO), %xmm8 movss -28 * SIZE(AO), %xmm10 @@ -1303,7 +1303,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1505,8 +1505,8 @@ #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif - ALIGN_4 - + ALIGN_4 + .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK @@ -1523,16 +1523,16 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO - + movq K, %rax sarq $2, %rax jle .L53 ALIGN_4 - + .L52: movaps 0 * SIZE(B), %xmm3 @@ -1592,7 +1592,7 @@ decq %rax jne .L54 ALIGN_4 - + .L60: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc @@ -1615,7 +1615,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 @@ -1640,7 +1640,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1845,7 +1845,7 @@ addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L61 - ALIGN_4 + ALIGN_4 .L70: testq $4, M @@ -1863,7 +1863,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 @@ -1883,7 +1883,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2024,7 +2024,7 @@ addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L80: testq $2, M @@ -2041,7 +2041,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -24 * SIZE(AO), %xmm10 @@ -2061,7 +2061,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2208,7 +2208,7 @@ addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L90: testq $1, M @@ -2225,7 +2225,7 @@ leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO -#endif +#endif movss -32 * SIZE(AO), %xmm8 movss -28 * SIZE(AO), %xmm10 @@ -2245,7 +2245,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2388,8 +2388,8 @@ #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif - ALIGN_4 - + ALIGN_4 + .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK @@ -2406,16 +2406,16 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO - + movq K, %rax sarq $3, %rax jle .L103 ALIGN_4 - + .L102: @@ -2469,7 +2469,7 @@ decq %rax jne .L104 ALIGN_4 - + .L110: movq C, CO1 # coffset1 = c movq A, AO # aoffset = a @@ -2491,7 +2491,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 @@ -2515,7 +2515,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2653,7 +2653,7 @@ addq $8 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L111 - ALIGN_4 + ALIGN_4 .L120: testq $4, M @@ -2670,7 +2670,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 @@ -2688,7 +2688,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2792,7 +2792,7 @@ #endif addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 + ALIGN_4 .L130: testq $2, M @@ -2809,7 +2809,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -24 * SIZE(AO), %xmm10 @@ -2827,7 +2827,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2936,7 +2936,7 @@ #endif addq $2 * SIZE, CO1 # coffset += 4 - ALIGN_4 + ALIGN_4 .L140: testq $1, M @@ -2953,7 +2953,7 @@ leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif movss -32 * SIZE(AO), %xmm8 movss -28 * SIZE(AO), %xmm10 @@ -2971,7 +2971,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -3055,8 +3055,8 @@ addss %xmm8, %xmm0 #endif movss %xmm0, 0 * SIZE(CO1) - ALIGN_4 - + ALIGN_4 + .L999: movq %rbx, %rsp movq 0(%rsp), %rbx diff --git a/kernel/x86_64/sgemm_kernel_8x8_sandy.S b/kernel/x86_64/sgemm_kernel_8x8_sandy.S index 20ddcaa8e..fb67dee9c 100644 --- a/kernel/x86_64/sgemm_kernel_8x8_sandy.S +++ b/kernel/x86_64/sgemm_kernel_8x8_sandy.S @@ -13,19 +13,19 @@ notice, this list of conditions and the following disclaimer. notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -3. Neither the name of the ISCAS nor the names of its contributors may -be used to endorse or promote products derived from this software +3. Neither the name of the ISCAS nor the names of its contributors may +be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ @@ -90,7 +90,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define PREFETCH0 prefetcht0 #define PREFETCH1 prefetcht0 #define PREFETCH2 prefetcht2 -#define PRESIZE 80 +#define PRESIZE 80 #define xvec0 %xmm0 #define xvec1 %xmm1 @@ -140,10 +140,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OR orq #define JNE jne #define JMP jmp -#define NOP +#define NOP #define XOR xorpd #undef MOVQ -#define MOVQ movq +#define MOVQ movq #define XOR_SY vxorps #define XOR_SX vxorps @@ -167,7 +167,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define EDUP_SX vmovsldup #define ODUP_SX vmovshdup -#define ADD_SY vaddps +#define ADD_SY vaddps #define ADD_SX vaddps #define ADD1_DY vaddpd @@ -184,7 +184,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VPERMILP_SX vpermilps #define BROAD_SY vbroadcastss -#define BROAD_SX vbroadcastss +#define BROAD_SX vbroadcastss #define MOV_SY vmovaps #define MOV_SX vmovaps @@ -222,7 +222,7 @@ movq %r15, 40(%rsp); movq ARG1, old_bm movq ARG2, old_bn movq ARG3, old_bk - movq OLD_A, ba + movq OLD_A, ba movq OLD_B, bb movq OLD_C, C movq old_ldc, ldc @@ -275,7 +275,7 @@ ALIGN_4; .L1_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; -#else +#else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (, %rax, SIZE), %rax; @@ -317,7 +317,7 @@ MOVQ %rax, kkk; MOVQ kk, %rax; #ifdef LEFT ADDQ $8, %rax; -#else +#else ADDQ $8, %rax; #endif MOVQ %rax, kkk; @@ -328,7 +328,7 @@ ALIGN_4; .L2_bodyB:; # Computing kernel -#### Unroll times 1 #### +#### Unroll times 1 #### PREFETCH0 PRESIZE*SIZE(ptrba); MUL_SY yvec0, yvec2, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; @@ -359,7 +359,7 @@ MUL_SY yvec0, yvec5, yvec7; ADD_SY yvec10, yvec6, yvec10; ADD_SY yvec8, yvec7, yvec8; -#### Unroll times 2 #### +#### Unroll times 2 #### MUL_SY yvec1, yvec2, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; ODUP_SY 8*SIZE(ptrbb), yvec2 @@ -389,7 +389,7 @@ MUL_SY yvec1, yvec5, yvec7; ADD_SY yvec10, yvec6, yvec10; ADD_SY yvec8, yvec7, yvec8; -#### Unroll times 3 #### +#### Unroll times 3 #### PREFETCH0 (PRESIZE+16)*SIZE(ptrba); MUL_SY yvec0, yvec2, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; @@ -421,7 +421,7 @@ MUL_SY yvec0, yvec5, yvec7; ADD_SY yvec10, yvec6, yvec10; ADD_SY yvec8, yvec7, yvec8; -#### Unroll times 4 #### +#### Unroll times 4 #### MUL_SY yvec1, yvec2, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; ODUP_SY 24*SIZE(ptrbb), yvec2 @@ -464,7 +464,7 @@ TEST $2, kkk; JLE .L3_loopE; ALIGN_4 .L3_loobB: -#### Unroll times 1 #### +#### Unroll times 1 #### MUL_SY yvec0, yvec2, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; ODUP_SY 0*SIZE(ptrbb), yvec2 @@ -495,7 +495,7 @@ MUL_SY yvec0, yvec5, yvec7; ADD_SY yvec10, yvec6, yvec10; ADD_SY yvec8, yvec7, yvec8; -#### Unroll times 2 #### +#### Unroll times 2 #### MUL_SY yvec1, yvec2, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; ODUP_SY 8*SIZE(ptrbb), yvec2 @@ -534,7 +534,7 @@ TEST $1, kkk; JLE .L4_loopE; ALIGN_4 .L4_loopB:; -#### Unroll times 1 #### +#### Unroll times 1 #### MUL_SY yvec0, yvec2, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; ODUP_SY 0*SIZE(ptrbb), yvec2 @@ -802,8 +802,8 @@ JLE .L5_loopE; ALIGN_4 .L5_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) -MOVQ bb, ptrbb; -#else +MOVQ bb, ptrbb; +#else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (, %rax, SIZE), %rax; @@ -832,7 +832,7 @@ MOVQ %rax, kkk; MOVQ kk, %rax; #ifdef LEFT ADDQ $4, %rax; -#else +#else ADDQ $8, %rax; #endif MOVQ %rax, kkk; @@ -872,7 +872,7 @@ ODUP_SX 8*SIZE(ptrbb), xvec3; MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec8, xvec8; -#### Unroll time 2 #### +#### Unroll time 2 #### SHUF_SX $0x4e, xvec2, xvec4; MUL_SX xvec1, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; @@ -902,7 +902,7 @@ ODUP_SX 16*SIZE(ptrbb), xvec3; MUL_SX xvec1, xvec5, xvec5; ADD_SX xvec5, xvec8, xvec8; -#### Unroll time 3 #### +#### Unroll time 3 #### SHUF_SX $0x4e, xvec2, xvec4; MUL_SX xvec0, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; @@ -933,7 +933,7 @@ MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec8, xvec8; ADDQ $16*SIZE, ptrba; -#### Unroll time 4 #### +#### Unroll time 4 #### SHUF_SX $0x4e, xvec2, xvec4; MUL_SX xvec1, xvec2, xvec2; ADD_SX xvec2, xvec15, xvec15; @@ -1005,7 +1005,7 @@ ODUP_SX 8*SIZE(ptrbb), xvec3; MUL_SX xvec0, xvec5, xvec5; ADD_SX xvec5, xvec8, xvec8; -#### Unroll time 2 #### +#### Unroll time 2 #### ADDQ $8*SIZE, ptrba; SHUF_SX $0x4e, xvec2, xvec4; MUL_SX xvec1, xvec2, xvec2; @@ -1099,7 +1099,7 @@ REVS_SX $0xe4, xvec7, xvec9, xvec9; MOV_SX xvec10, xvec7; REVS_SX $0xe4, xvec8, xvec10, xvec10; REVS_SX $0xe4, xvec7, xvec8, xvec8; -#### Testing Alignment #### +#### Testing Alignment #### MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; @@ -1200,8 +1200,8 @@ JLE .L6_loopE; ALIGN_4 .L6_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) -MOVQ bb, ptrbb; -#else +MOVQ bb, ptrbb; +#else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (, %rax, SIZE), %rax; @@ -1220,7 +1220,7 @@ MOVQ bk, k; MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; -#else +#else MOVQ kk, %rax; #ifdef LEFT ADDQ $2, %rax; @@ -1419,7 +1419,7 @@ ALIGN_4 .L7_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; -#else +#else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (,%rax, SIZE), %rax; @@ -1440,7 +1440,7 @@ MOVQ %rax, kkk; MOVQ kk, %rax; #ifdef LEFT ADDQ $1, %rax; -#else +#else ADDQ $8, %rax; #endif MOVQ %rax, kkk; @@ -1614,7 +1614,7 @@ ALIGN_4 .L21_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; -#else +#else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (, %rax, SIZE), %rax; @@ -1643,7 +1643,7 @@ MOVQ %rax, kkk; MOVQ kk, %rax; #ifdef LEFT ADDQ $8, %rax; -#else +#else ADDQ $4, %rax; #endif MOVQ %rax, kkk; @@ -1907,7 +1907,7 @@ MUL_SX xvec7, xvec11, xvec11; MUL_SX xvec7, xvec10, xvec10; MUL_SX xvec7, xvec9, xvec9; MUL_SX xvec7, xvec8, xvec8; -#### Writing Back #### +#### Writing Back #### #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec0, xvec0; LDH_SX 2*SIZE(C1), xvec0, xvec0; @@ -1971,8 +1971,8 @@ JLE .L22_loopE; ALIGN_4 .L22_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) -MOVQ bb, ptrbb; -#else +MOVQ bb, ptrbb; +#else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (, %rax, SIZE), %rax; @@ -1994,7 +1994,7 @@ MOVQ %rax, kkk; MOVQ kk, %rax; #ifdef LEFT ADDQ $4, %rax; -#else +#else ADDQ $4, %rax; #endif MOVQ %rax, kkk; @@ -2188,8 +2188,8 @@ JLE .L23_loopE; ALIGN_4 .L23_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) -MOVQ bb, ptrbb; -#else +MOVQ bb, ptrbb; +#else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (, %rax, SIZE), %rax; @@ -2205,7 +2205,7 @@ MOVQ bk, k; MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; -#else +#else MOVQ kk, %rax; #ifdef LEFT ADDQ $2, %rax; @@ -2342,7 +2342,7 @@ ALIGN_4 .L24_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; -#else +#else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (,%rax, SIZE), %rax; @@ -2361,7 +2361,7 @@ MOVQ %rax, kkk; MOVQ kk, %rax; #ifdef LEFT ADDQ $1, %rax; -#else +#else ADDQ $4, %rax; #endif MOVQ %rax, kkk; @@ -2489,7 +2489,7 @@ ALIGN_4 .L31_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; -#else +#else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (, %rax, SIZE), %rax; @@ -2507,11 +2507,11 @@ MOVQ bk, k; MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; -#else +#else MOVQ kk, %rax; #ifdef LEFT ADDQ $8, %rax; -#else +#else ADDQ $2, %rax; #endif MOVQ %rax, kkk; @@ -2721,7 +2721,7 @@ ALIGN_4 .L32_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; -#else +#else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (,%rax, SIZE), %rax; @@ -2737,11 +2737,11 @@ MOVQ bk, k; MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; -#else +#else MOVQ kk, %rax; #ifdef LEFT ADDQ $4, %rax; -#else +#else ADDQ $2, %rax; #endif MOVQ %rax, kkk; @@ -2873,7 +2873,7 @@ ALIGN_4 .L33_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; -#else +#else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (,%rax, SIZE), %rax; @@ -2891,11 +2891,11 @@ MOVQ bk, k; MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; -#else +#else MOVQ kk, %rax; #ifdef LEFT ADDQ $2, %rax; -#else +#else ADDQ $2, %rax; #endif MOVQ %rax, kkk; @@ -3017,7 +3017,7 @@ ALIGN_4 .L34_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; -#else +#else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (, %rax, SIZE), %rax; @@ -3033,13 +3033,13 @@ MOVQ bk, k; MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; -#else +#else MOVQ kk, %rax; #ifdef LEFT ADDQ $1, %rax; -#else +#else ADDQ $2, %rax; -#endif +#endif MOVQ %rax, kkk; #endif SARQ $2, k; @@ -3136,7 +3136,7 @@ addq $1*SIZE, ptrba; addq $2*SIZE, ptrbb .L343_loopE: -#### Writing back #### +#### Writing back #### movss MEMALPHA, xvec7; mulss xvec7, xvec15; mulss xvec7, xvec14; @@ -3186,7 +3186,7 @@ ALIGN_4 .L41_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; -#else +#else MOVQ bb, ptrbb; MOVQ kk, %rax LEAQ (, %rax, SIZE), %rax; @@ -3201,11 +3201,11 @@ MOVQ bk, k; MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; -#else +#else MOVQ kk, %rax; #ifdef LEFT ADDQ $8, %rax; -#else +#else ADDQ $1, %rax; #endif MOVQ %rax, kkk; @@ -3333,11 +3333,11 @@ MOVQ bk, k; MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; -#else +#else MOVQ kk, %rax; #ifdef LEFT ADDQ $4, %rax; -#else +#else ADDQ $1, %rax; #endif MOVQ %rax, kkk @@ -3437,13 +3437,13 @@ ALIGN_4 .L43_bodyB: #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bb, ptrbb; -#else +#else MOVQ bb, ptrbb; MOVQ kk, %rax LEAQ (, %rax, SIZE), %rax LEAQ (ptrba, %rax, 2), ptrba ADDQ %rax, ptrbb; -#endif +#endif XOR_SY yvec15, yvec15, yvec15; XOR_SY yvec14, yvec14, yvec14; #ifndef TRMMKERNEL @@ -3452,7 +3452,7 @@ MOVQ bk, k; MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; -#else +#else MOVQ kk, %rax; #ifdef LEFT ADDQ $2, %rax; @@ -3576,9 +3576,9 @@ TEST $1, bm; JLE .L44_loopE; ALIGN_4 .L44_bodyB: -#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bb, ptrbb; -#else +#else MOVQ bb, ptrbb; MOVQ kk, %rax; LEAQ (, %rax, SIZE), %rax; @@ -3592,7 +3592,7 @@ MOVQ bk, k; MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; -#else +#else MOVQ kk, %rax; #ifdef LEFT ADDQ $1, %rax; diff --git a/kernel/x86_64/sgemv_n.S b/kernel/x86_64/sgemv_n.S index 7641056fb..8f64fe5e2 100644 --- a/kernel/x86_64/sgemv_n.S +++ b/kernel/x86_64/sgemv_n.S @@ -48,7 +48,7 @@ #ifndef WINDOWS_ABI #define STACKSIZE 128 - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_A %rcx @@ -66,7 +66,7 @@ #else #define STACKSIZE 288 - + #define OLD_M %rcx #define OLD_N %rdx #define OLD_A 40 + STACKSIZE(%rsp) @@ -210,9 +210,9 @@ jle .L999 subq $-32 * SIZE, A - + movq BUFFER, Y1 - + pxor %xmm0, %xmm0 movq M, %rax @@ -301,7 +301,7 @@ testq $SIZE, A1 je .L1X - + movss -32 * SIZE(A1), %xmm4 movss -32 * SIZE(A1, LDA, 1), %xmm5 movss -32 * SIZE(A1, LDA, 2), %xmm6 @@ -907,7 +907,7 @@ testq $SIZE, A1 je .L2X - + movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(A1, LDA), %xmm1 movss -32 * SIZE(A2), %xmm2 @@ -1301,7 +1301,7 @@ testq $SIZE, A1 je .L3X - + movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(A1, LDA), %xmm1 movss -32 * SIZE(A2), %xmm2 @@ -1628,7 +1628,7 @@ testq $SIZE, A1 je .L4X - + movss -32 * SIZE(Y1), %xmm8 movss -32 * SIZE(A1), %xmm0 @@ -1885,7 +1885,7 @@ testq $SIZE, A1 je .L5X - + movss -32 * SIZE(Y1), %xmm8 movss -32 * SIZE(A1), %xmm0 @@ -2066,7 +2066,7 @@ .L100: testq $2 * SIZE - 1, LDA jne .L200 - + cmpq $4, N jl .L110 ALIGN_3 @@ -2105,7 +2105,7 @@ testq $SIZE, A1 je .L10X - + movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(A1, LDA), %xmm1 movss -32 * SIZE(A2), %xmm2 @@ -2513,7 +2513,7 @@ testq $SIZE, A1 je .L11X - + movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(A1, LDA), %xmm1 movss -32 * SIZE(A2), %xmm2 @@ -2846,7 +2846,7 @@ testq $SIZE, A1 je .L12X - + movss -32 * SIZE(Y1), %xmm8 movss -32 * SIZE(A1), %xmm0 @@ -3116,7 +3116,7 @@ testq $SIZE, A1 je .L13X - + movss -32 * SIZE(Y1), %xmm8 movss -32 * SIZE(A1), %xmm0 @@ -3332,7 +3332,7 @@ testq $SIZE, A1 je .L20X - + movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(A1, LDA), %xmm1 movss -32 * SIZE(A2), %xmm2 @@ -3776,7 +3776,7 @@ testq $SIZE, A1 je .L21X - + movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(A1, LDA), %xmm1 movss -32 * SIZE(A2), %xmm2 @@ -4139,7 +4139,7 @@ testq $SIZE, A1 je .L22X - + movss -32 * SIZE(Y1), %xmm9 movss -32 * SIZE(A1), %xmm0 @@ -4423,7 +4423,7 @@ testq $SIZE, A1 je .L23X - + movss -32 * SIZE(Y1), %xmm8 movss -32 * SIZE(A1), %xmm0 @@ -4637,7 +4637,7 @@ testq $SIZE, A1 je .L30X - + movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(A1, LDA), %xmm1 movss -32 * SIZE(A2), %xmm2 @@ -5080,7 +5080,7 @@ testq $SIZE, A1 je .L31X - + movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(A1, LDA), %xmm1 movss -32 * SIZE(A2), %xmm2 @@ -5439,7 +5439,7 @@ testq $SIZE, A1 je .L32X - + movss -32 * SIZE(Y1), %xmm9 movss -32 * SIZE(A1), %xmm0 @@ -5722,7 +5722,7 @@ testq $SIZE, A1 je .L33X - + movss -32 * SIZE(Y1), %xmm8 movss -32 * SIZE(A1), %xmm0 diff --git a/kernel/x86_64/sgemv_t.S b/kernel/x86_64/sgemv_t.S index c2cb6b944..33e2fa86c 100644 --- a/kernel/x86_64/sgemv_t.S +++ b/kernel/x86_64/sgemv_t.S @@ -43,11 +43,11 @@ #undef GEMV_UNROLL #define GEMV_UNROLL 4 #endif - + #ifndef WINDOWS_ABI #define STACKSIZE 128 - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_A %rcx @@ -64,7 +64,7 @@ #else #define STACKSIZE 288 - + #define OLD_M %rcx #define OLD_N %rdx #define OLD_A 40 + STACKSIZE(%rsp) @@ -161,7 +161,7 @@ subq M,MMM jge .L00t ALIGN_4 - + movq MMM,%rax addq M,%rax jle .L999x @@ -213,7 +213,7 @@ #endif movq BUFFER, X1 - + movq M, I sarq $3, I jle .L05 @@ -307,7 +307,7 @@ testq $SIZE, A1 je .L1X - + movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 @@ -991,7 +991,7 @@ testq $SIZE, A1 je .L2X - + movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 @@ -1018,26 +1018,26 @@ #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 -#endif +#endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 -#endif +#endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 #ifdef movsd xorps %xmm3, %xmm3 -#endif +#endif movsd -32 * SIZE(A2, LDA), %xmm3 mulps %xmm4, %xmm3 addps %xmm3, %xmm11 @@ -1279,29 +1279,29 @@ #ifdef movsd xorps %xmm0, %xmm0 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 -#endif +#endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 -#endif +#endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 #ifdef movsd xorps %xmm3, %xmm3 -#endif +#endif movsd -32 * SIZE(A2, LDA), %xmm3 mulps %xmm4, %xmm3 addps %xmm3, %xmm11 @@ -1415,7 +1415,7 @@ testq $SIZE, A1 je .L3X - + movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 @@ -1442,26 +1442,26 @@ #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 -#endif +#endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 -#endif +#endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 #ifdef movsd xorps %xmm3, %xmm3 -#endif +#endif movsd -32 * SIZE(A2, LDA), %xmm3 mulps %xmm4, %xmm3 addps %xmm3, %xmm11 @@ -1665,29 +1665,29 @@ #ifdef movsd xorps %xmm0, %xmm0 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 -#endif +#endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 -#endif +#endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 #ifdef movsd xorps %xmm3, %xmm3 -#endif +#endif addq $2 * SIZE, A1 addq $2 * SIZE, A2 @@ -1782,7 +1782,7 @@ testq $SIZE, A1 je .L4X - + movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 @@ -1803,14 +1803,14 @@ #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 -#endif +#endif movsd -32 * SIZE(A2), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 @@ -1972,17 +1972,17 @@ #ifdef movsd xorps %xmm0, %xmm0 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 -#endif +#endif movsd -32 * SIZE(A2), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 @@ -2054,7 +2054,7 @@ testq $SIZE, A1 je .L5X - + movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 @@ -2072,7 +2072,7 @@ #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 @@ -2194,11 +2194,11 @@ #ifdef movsd xorps %xmm0, %xmm0 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 @@ -2270,7 +2270,7 @@ testq $SIZE, A1 je .L10X - + movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 @@ -2297,26 +2297,26 @@ #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 -#endif +#endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 -#endif +#endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 #ifdef movsd xorps %xmm3, %xmm3 -#endif +#endif movsd -32 * SIZE(A2, LDA), %xmm3 mulps %xmm4, %xmm3 addps %xmm3, %xmm11 @@ -2582,29 +2582,29 @@ #ifdef movsd xorps %xmm0, %xmm0 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 -#endif +#endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 -#endif +#endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 #ifdef movsd xorps %xmm3, %xmm3 -#endif +#endif movsd -32 * SIZE(A2, LDA), %xmm3 mulps %xmm4, %xmm3 addps %xmm3, %xmm11 @@ -2715,7 +2715,7 @@ testq $SIZE, A1 je .L11X - + movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 @@ -2739,20 +2739,20 @@ #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 -#endif +#endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 -#endif +#endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 @@ -2964,23 +2964,23 @@ #ifdef movsd xorps %xmm0, %xmm0 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 -#endif +#endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 -#endif +#endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 @@ -3075,7 +3075,7 @@ testq $SIZE, A1 je .L12X - + movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 @@ -3096,14 +3096,14 @@ #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 -#endif +#endif movsd -32 * SIZE(A2), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 @@ -3277,17 +3277,17 @@ #ifdef movsd xorps %xmm0, %xmm0 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 -#endif +#endif movsd -32 * SIZE(A2), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 @@ -3358,7 +3358,7 @@ testq $SIZE, A1 je .L13X - + movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 @@ -3376,7 +3376,7 @@ #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 @@ -3497,11 +3497,11 @@ #ifdef movsd xorps %xmm0, %xmm0 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 @@ -3571,7 +3571,7 @@ testq $SIZE, A1 je .L20X - + movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 @@ -3598,26 +3598,26 @@ #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 -#endif +#endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 -#endif +#endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 #ifdef movsd xorps %xmm3, %xmm3 -#endif +#endif movsd -32 * SIZE(A2, LDA), %xmm3 mulps %xmm4, %xmm3 addps %xmm3, %xmm11 @@ -3927,29 +3927,29 @@ #ifdef movsd xorps %xmm0, %xmm0 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 -#endif +#endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 -#endif +#endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 #ifdef movsd xorps %xmm3, %xmm3 -#endif +#endif movsd -32 * SIZE(A2, LDA), %xmm3 mulps %xmm4, %xmm3 addps %xmm3, %xmm11 @@ -4060,7 +4060,7 @@ testq $SIZE, A1 je .L21X - + movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 @@ -4084,20 +4084,20 @@ #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 -#endif +#endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 -#endif +#endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 @@ -4364,23 +4364,23 @@ #ifdef movsd xorps %xmm0, %xmm0 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 -#endif +#endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 -#endif +#endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 @@ -4478,7 +4478,7 @@ testq $SIZE, A1 je .L22X - + movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 @@ -4499,14 +4499,14 @@ #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 -#endif +#endif movsd -32 * SIZE(A2), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 @@ -4693,17 +4693,17 @@ #ifdef movsd xorps %xmm0, %xmm0 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 -#endif +#endif movsd -32 * SIZE(A2), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 @@ -4791,7 +4791,7 @@ #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 @@ -4912,11 +4912,11 @@ #ifdef movsd xorps %xmm0, %xmm0 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 @@ -4983,7 +4983,7 @@ testq $SIZE, A1 je .L30X - + movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 @@ -5010,26 +5010,26 @@ #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 -#endif +#endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 -#endif +#endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 #ifdef movsd xorps %xmm3, %xmm3 -#endif +#endif movsd -32 * SIZE(A2, LDA), %xmm3 mulps %xmm4, %xmm3 addps %xmm3, %xmm11 @@ -5339,29 +5339,29 @@ #ifdef movsd xorps %xmm0, %xmm0 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 -#endif +#endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 -#endif +#endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 #ifdef movsd xorps %xmm3, %xmm3 -#endif +#endif movsd -32 * SIZE(A2, LDA), %xmm3 mulps %xmm4, %xmm3 addps %xmm3, %xmm11 @@ -5475,7 +5475,7 @@ testq $SIZE, A1 je .L31X - + movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 @@ -5499,20 +5499,20 @@ #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 -#endif +#endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 -#endif +#endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 @@ -5774,23 +5774,23 @@ #ifdef movsd xorps %xmm0, %xmm0 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 -#endif +#endif movsd -32 * SIZE(A1, LDA), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 #ifdef movsd xorps %xmm2, %xmm2 -#endif +#endif movsd -32 * SIZE(A2), %xmm2 mulps %xmm4, %xmm2 addps %xmm2, %xmm10 @@ -5884,7 +5884,7 @@ testq $SIZE, A1 je .L32X - + movss -32 * SIZE(A1), %xmm0 movss -32 * SIZE(X1), %xmm4 mulss %xmm4, %xmm0 @@ -5905,14 +5905,14 @@ #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 -#endif +#endif movsd -32 * SIZE(A2), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 @@ -6098,17 +6098,17 @@ #ifdef movsd xorps %xmm0, %xmm0 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 #ifdef movsd xorps %xmm1, %xmm1 -#endif +#endif movsd -32 * SIZE(A2), %xmm1 mulps %xmm4, %xmm1 addps %xmm1, %xmm9 @@ -6196,7 +6196,7 @@ #ifdef movsd xorps %xmm0, %xmm0 xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 @@ -6317,11 +6317,11 @@ #ifdef movsd xorps %xmm0, %xmm0 -#endif +#endif movsd -32 * SIZE(A1), %xmm0 #ifdef movsd xorps %xmm4, %xmm4 -#endif +#endif movsd -32 * SIZE(X1), %xmm4 mulps %xmm4, %xmm0 addps %xmm0, %xmm8 diff --git a/kernel/x86_64/swap.S b/kernel/x86_64/swap.S index 50a7fb557..9529724ab 100644 --- a/kernel/x86_64/swap.S +++ b/kernel/x86_64/swap.S @@ -60,7 +60,7 @@ PROLOGUE PROFCODE - + #ifndef WINDOWS_ABI #ifndef XDOUBLE movq 8(%rsp), INCY diff --git a/kernel/x86_64/swap_sse.S b/kernel/x86_64/swap_sse.S index 570287051..dc964dab2 100644 --- a/kernel/x86_64/swap_sse.S +++ b/kernel/x86_64/swap_sse.S @@ -81,7 +81,7 @@ subq $-32 * SIZE, X subq $-32 * SIZE, Y - + cmpq $3, M jle .L16 @@ -307,7 +307,7 @@ .L20: movaps -33 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 - + movss %xmm1, -32 * SIZE(X) pshufd $0x39, %xmm1, %xmm3 movlps %xmm3, -31 * SIZE(X) @@ -791,7 +791,7 @@ .L40: movaps -35 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 - + movss %xmm1, -32 * SIZE(X) subq $3, M diff --git a/kernel/x86_64/swap_sse2.S b/kernel/x86_64/swap_sse2.S index 5f164197d..e9260b979 100644 --- a/kernel/x86_64/swap_sse2.S +++ b/kernel/x86_64/swap_sse2.S @@ -97,7 +97,7 @@ .L10: subq $-16 * SIZE, X subq $-16 * SIZE, Y - + testq $SIZE, X jne .L20 diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S index 5083d0b01..cda0b476d 100644 --- a/kernel/x86_64/symv_L_sse.S +++ b/kernel/x86_64/symv_L_sse.S @@ -97,7 +97,7 @@ #ifndef WINDOWS_ABI #define STACKSIZE 80 - + #define OLD_Y 8 + STACKSIZE(%rsp) #define OLD_INCY 16 + STACKSIZE(%rsp) #define OLD_BUFFER 24 + STACKSIZE(%rsp) @@ -105,14 +105,14 @@ #define M ARG1 #define N ARG2 #define A ARG3 -#define LDA ARG4 +#define LDA ARG4 #define X ARG5 -#define INCX ARG6 +#define INCX ARG6 #else #define STACKSIZE 256 - + #define OLD_LDA 40 + STACKSIZE(%rsp) #define OLD_X 48 + STACKSIZE(%rsp) #define OLD_INCX 56 + STACKSIZE(%rsp) diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S index a8bbb1cad..0afc1e8c0 100644 --- a/kernel/x86_64/symv_L_sse2.S +++ b/kernel/x86_64/symv_L_sse2.S @@ -97,7 +97,7 @@ #ifndef WINDOWS_ABI #define STACKSIZE 80 - + #define OLD_Y 8 + STACKSIZE(%rsp) #define OLD_INCY 16 + STACKSIZE(%rsp) #define OLD_BUFFER 24 + STACKSIZE(%rsp) @@ -105,14 +105,14 @@ #define M ARG1 #define N ARG2 #define A ARG3 -#define LDA ARG4 +#define LDA ARG4 #define X ARG5 -#define INCX ARG6 +#define INCX ARG6 #else #define STACKSIZE 256 - + #define OLD_LDA 40 + STACKSIZE(%rsp) #define OLD_X 48 + STACKSIZE(%rsp) #define OLD_INCX 56 + STACKSIZE(%rsp) diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S index 47af7726a..691012cb1 100644 --- a/kernel/x86_64/symv_U_sse.S +++ b/kernel/x86_64/symv_U_sse.S @@ -97,7 +97,7 @@ #ifndef WINDOWS_ABI #define STACKSIZE 80 - + #define OLD_Y 8 + STACKSIZE(%rsp) #define OLD_INCY 16 + STACKSIZE(%rsp) #define OLD_BUFFER 24 + STACKSIZE(%rsp) @@ -105,14 +105,14 @@ #define M ARG1 #define IS ARG2 #define A ARG3 -#define LDA ARG4 +#define LDA ARG4 #define X ARG5 -#define INCX ARG6 +#define INCX ARG6 #else #define STACKSIZE 256 - + #define OLD_LDA 40 + STACKSIZE(%rsp) #define OLD_X 48 + STACKSIZE(%rsp) #define OLD_INCX 56 + STACKSIZE(%rsp) diff --git a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S index 57d8c2a20..8ecbb39e6 100644 --- a/kernel/x86_64/symv_U_sse2.S +++ b/kernel/x86_64/symv_U_sse2.S @@ -97,7 +97,7 @@ #ifndef WINDOWS_ABI #define STACKSIZE 80 - + #define OLD_Y 8 + STACKSIZE(%rsp) #define OLD_INCY 16 + STACKSIZE(%rsp) #define OLD_BUFFER 24 + STACKSIZE(%rsp) @@ -105,14 +105,14 @@ #define M ARG1 #define IS ARG2 #define A ARG3 -#define LDA ARG4 +#define LDA ARG4 #define X ARG5 -#define INCX ARG6 +#define INCX ARG6 #else #define STACKSIZE 256 - + #define OLD_LDA 40 + STACKSIZE(%rsp) #define OLD_X 48 + STACKSIZE(%rsp) #define OLD_INCX 56 + STACKSIZE(%rsp) @@ -213,7 +213,7 @@ movq IS, TEMP imulq LDA, TEMP addq TEMP, A - + unpcklpd ALPHA, ALPHA movq BUFFER, XX diff --git a/kernel/x86_64/trsm_kernel_LN_2x8_nehalem.S b/kernel/x86_64/trsm_kernel_LN_2x8_nehalem.S index d70bede70..8deff20aa 100644 --- a/kernel/x86_64/trsm_kernel_LN_2x8_nehalem.S +++ b/kernel/x86_64/trsm_kernel_LN_2x8_nehalem.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx @@ -51,7 +51,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -94,7 +94,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -181,7 +181,7 @@ movq K, %rax salq $3 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 8), %rax subq %rax, C #endif @@ -196,7 +196,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -226,7 +226,7 @@ leaq (B, %rax, 8), BO #else movq B, BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -246,7 +246,7 @@ jle .L25 ALIGN_3 -.L22: +.L22: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 @@ -681,7 +681,7 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L20: movq M, I @@ -705,7 +705,7 @@ leaq (B, %rax, 8), BO #else movq B, BO -#endif +#endif prefetcht0 -16 * SIZE(BB) subq $-8 * SIZE, BB @@ -747,7 +747,7 @@ jle .L15 ALIGN_3 -.L12: +.L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm12 @@ -1373,7 +1373,7 @@ decq I BRANCH jg .L11 - ALIGN_4 + ALIGN_4 .L29: #ifdef LN @@ -1412,7 +1412,7 @@ movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 4), %rax subq %rax, C #endif @@ -1427,7 +1427,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -1453,7 +1453,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -1473,7 +1473,7 @@ jle .L45 ALIGN_3 -.L42: +.L42: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 @@ -1699,7 +1699,7 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L40: movq M, I @@ -1723,7 +1723,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 @@ -1751,7 +1751,7 @@ jle .L35 ALIGN_3 -.L32: +.L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm8 @@ -2092,7 +2092,7 @@ decq I BRANCH jg .L31 - ALIGN_4 + ALIGN_4 .L49: #ifdef LN @@ -2127,7 +2127,7 @@ movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 2), %rax subq %rax, C #endif @@ -2142,7 +2142,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -2168,7 +2168,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -2186,7 +2186,7 @@ jle .L65 ALIGN_3 -.L62: +.L62: mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 @@ -2342,7 +2342,7 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L60: movq M, I @@ -2366,7 +2366,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 @@ -2390,7 +2390,7 @@ jle .L55 ALIGN_3 -.L52: +.L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm8 @@ -2636,7 +2636,7 @@ movq K, %rax salq $BASE_SHIFT, %rax subq %rax, B - + subq LDC, C #endif @@ -2649,7 +2649,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -2675,7 +2675,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif movsd -16 * SIZE(AO), %xmm0 movhps -15 * SIZE(AO), %xmm0 @@ -2695,7 +2695,7 @@ jle .L85 ALIGN_3 -.L82: +.L82: mulpd %xmm0, %xmm1 movsd -14 * SIZE(AO), %xmm0 movhps -13 * SIZE(AO), %xmm0 @@ -2816,7 +2816,7 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L80: @@ -2841,7 +2841,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 @@ -2864,7 +2864,7 @@ jle .L75 ALIGN_3 -.L72: +.L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm8 @@ -3025,7 +3025,7 @@ decq I BRANCH jg .L71 - ALIGN_4 + ALIGN_4 .L89: #ifdef LN diff --git a/kernel/x86_64/trsm_kernel_LN_4x2_atom.S b/kernel/x86_64/trsm_kernel_LN_4x2_atom.S index 6ba2fc4bd..a19267499 100644 --- a/kernel/x86_64/trsm_kernel_LN_4x2_atom.S +++ b/kernel/x86_64/trsm_kernel_LN_4x2_atom.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M %rdi #define N %rsi #define K %rdx @@ -90,7 +90,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) @@ -146,7 +146,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, %rax @@ -158,7 +158,7 @@ sarq $1, J jle .L40 ALIGN_4 - + .L10: #if defined(LT) || defined(RN) movq A, AO @@ -170,7 +170,7 @@ movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 2), %rax subq %rax, C #endif @@ -185,7 +185,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif movq K, %rax salq $BASE_SHIFT + 1, %rax @@ -214,7 +214,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm7, %xmm7 @@ -411,7 +411,7 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L20: testq $2, M @@ -432,7 +432,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 @@ -738,7 +738,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L30: movq M, I @@ -761,7 +761,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif prefetcht0 0 * SIZE(BB) subq $-8 * SIZE, BB @@ -807,7 +807,7 @@ addsd %xmm7, %xmm14 movsd 3 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 - + addsd %xmm6, %xmm15 PREFETCH (PREFETCHSIZE + 0) * SIZE(BO) movaps %xmm4, %xmm6 @@ -1337,7 +1337,7 @@ movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, B - + subq LDC, C #endif @@ -1350,7 +1350,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -1374,7 +1374,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm5, %xmm5 @@ -1528,7 +1528,7 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L50: testq $2, M @@ -1548,7 +1548,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 @@ -1741,7 +1741,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L60: movq M, I @@ -1764,7 +1764,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm9, %xmm9 @@ -2066,7 +2066,7 @@ decq I # i -- jg .L41 - ALIGN_4 + ALIGN_4 .L69: #ifdef LN @@ -2086,7 +2086,7 @@ subq $1, KK #endif ALIGN_2 - + .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp diff --git a/kernel/x86_64/trsm_kernel_LN_4x4_barcelona.S b/kernel/x86_64/trsm_kernel_LN_4x4_barcelona.S index 4cdaff30b..69278bb5e 100644 --- a/kernel/x86_64/trsm_kernel_LN_4x4_barcelona.S +++ b/kernel/x86_64/trsm_kernel_LN_4x4_barcelona.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -49,7 +49,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -300,7 +300,7 @@ movddup 24 * SIZE(BO, %rax, 4), %xmm5 ;\ movapd %xmm0, %xmm2 ;\ addq $8 * SIZE, %rax - + #define KERNEL_SUB1(xx) \ mulpd %xmm1, %xmm0 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ @@ -405,7 +405,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) @@ -470,7 +470,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, %rax @@ -508,7 +508,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif movq K, %rax salq $BASE_SHIFT + 2, %rax @@ -541,7 +541,7 @@ movq KK, %rax leaq (, %rax, SIZE), %rax leaq (BO, %rax, 4), BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -806,7 +806,7 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L20: testq $2, M @@ -833,7 +833,7 @@ movq KK, %rax leaq (, %rax, SIZE), %rax leaq (BO, %rax, 4), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1170,7 +1170,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L30: movq M, I @@ -1198,7 +1198,7 @@ movq KK, %rax leaq (, %rax, SIZE), %rax leaq (BO, %rax, 4), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 movddup -16 * SIZE(BO), %xmm1 @@ -1741,7 +1741,7 @@ decq I # i -- jg .L11 - ALIGN_4 + ALIGN_4 .L39: #ifdef LN @@ -1794,7 +1794,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #if defined(LT) movq OFFSET, %rax @@ -1825,7 +1825,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 1), BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1999,7 +1999,7 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L60: testq $2, M @@ -2024,7 +2024,7 @@ movq KK, %rax leaq (, %rax, SIZE), %rax leaq (BO, %rax, 2), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -2253,7 +2253,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L70: movq M, I @@ -2281,7 +2281,7 @@ movq KK, %rax leaq (, %rax, SIZE), %rax leaq (BO, %rax, 2), BO -#endif +#endif movddup -16 * SIZE(BO), %xmm1 movddup -15 * SIZE(BO), %xmm5 @@ -2622,8 +2622,8 @@ decq I # i -- jg .L51 - ALIGN_4 - + ALIGN_4 + .L79: #ifdef LN leaq (, K, SIZE), %rax @@ -2670,7 +2670,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -2698,7 +2698,7 @@ #if defined(LN) || defined(RT) movq KK, %rax leaq (BO, %rax, SIZE), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -2833,13 +2833,13 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L100: testq $2, M je .L110 -#ifdef LN +#ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG @@ -2857,7 +2857,7 @@ #if defined(LN) || defined(RT) movq KK, %rax leaq (BO, %rax, SIZE), BO -#endif +#endif movddup -16 * SIZE(BO), %xmm0 pxor %xmm8, %xmm8 @@ -3045,7 +3045,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L110: movq M, I @@ -3072,7 +3072,7 @@ #if defined(LN) || defined(RT) movq KK, %rax leaq (BO, %rax, SIZE), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -3341,7 +3341,7 @@ decq I # i -- jg .L91 - ALIGN_4 + ALIGN_4 .L119: #ifdef LN @@ -3360,7 +3360,7 @@ subq $1, KK #endif ALIGN_4 - + .L999: movq (%rsp), %rbx movq 8(%rsp), %rbp diff --git a/kernel/x86_64/trsm_kernel_LN_4x4_core2.S b/kernel/x86_64/trsm_kernel_LN_4x4_core2.S index fc5284ae5..3a16bfc3f 100644 --- a/kernel/x86_64/trsm_kernel_LN_4x4_core2.S +++ b/kernel/x86_64/trsm_kernel_LN_4x4_core2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -49,7 +49,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -83,7 +83,7 @@ #define AORIG 32(%rsp) #define BORIG 40(%rsp) #define BUFFER 128(%rsp) - + #define PREFETCH_R (8 * 4 + 0) #define PREFETCH_W (PREFETCH_R) @@ -92,7 +92,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) @@ -161,7 +161,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, %rax @@ -180,7 +180,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq 16 * SIZE + BUFFER, BO @@ -196,7 +196,7 @@ leaq (, %rax, SIZE), %rax leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO -#endif +#endif #if defined(LT) movq OFFSET, %rax @@ -212,7 +212,7 @@ sarq $2, %rax jle .L03 ALIGN_4 - + .L02: prefetcht0 (PREFETCH_R + 0) * SIZE(B) movapd -16 * SIZE(B), %xmm0 @@ -241,7 +241,7 @@ unpckhpd %xmm6, %xmm6 movddup %xmm7, %xmm15 unpckhpd %xmm7, %xmm7 - + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) movapd %xmm8, -16 * SIZE(BO) movapd %xmm0, -14 * SIZE(BO) @@ -303,7 +303,7 @@ subq $1, %rax jne .L04 ALIGN_4 - + .L10: leaq (PREFETCH_R + 0) * SIZE(B), BB @@ -348,7 +348,7 @@ movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -647,7 +647,7 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L20: testq $2, M @@ -674,7 +674,7 @@ movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -1040,7 +1040,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L30: movq M, I @@ -1068,7 +1068,7 @@ movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif prefetcht2 0 * SIZE(BB) @@ -1114,7 +1114,7 @@ jle .L15 ALIGN_4 -.L12: +.L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm2, %xmm10 @@ -1739,9 +1739,9 @@ decq I # i -- jg .L11 - ALIGN_4 + ALIGN_4 + - .L39: #ifdef LN leaq (, K, SIZE), %rax @@ -1777,12 +1777,12 @@ .L41: /* Copying to Sub Buffer */ - + #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO @@ -1798,7 +1798,7 @@ leaq (, %rax, SIZE), %rax leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -1814,7 +1814,7 @@ sarq $2, %rax jle .L43 ALIGN_4 - + .L42: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 @@ -1864,7 +1864,7 @@ decq %rax jne .L44 ALIGN_4 - + .L50: #if defined(LT) || defined(RN) movq A, AO @@ -1907,7 +1907,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -2113,8 +2113,8 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L60: testq $2, M je .L70 @@ -2140,7 +2140,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -2385,7 +2385,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L70: movq M, I @@ -2413,7 +2413,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -2797,7 +2797,7 @@ decq I # i -- jg .L51 - ALIGN_4 + ALIGN_4 .L79: #ifdef LN @@ -2833,7 +2833,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO @@ -2849,7 +2849,7 @@ leaq (, %rax, SIZE), %rax leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -2865,7 +2865,7 @@ sarq $3, %rax jle .L83 ALIGN_4 - + .L82: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 @@ -2913,7 +2913,7 @@ subq $1, %rax jne .L84 ALIGN_4 - + .L90: #if defined(LT) || defined(RN) movq A, AO @@ -2954,7 +2954,7 @@ movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -3124,7 +3124,7 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L100: testq $2, M @@ -3151,7 +3151,7 @@ movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -3352,7 +3352,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L110: movq M, I @@ -3380,7 +3380,7 @@ movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 #ifdef LN @@ -3683,7 +3683,7 @@ decq I # i -- jg .L91 - ALIGN_4 + ALIGN_4 .L119: #ifdef LN @@ -3707,7 +3707,7 @@ #endif ALIGN_4 - + .L999: movq %r15, %rsp diff --git a/kernel/x86_64/trsm_kernel_LN_4x4_penryn.S b/kernel/x86_64/trsm_kernel_LN_4x4_penryn.S index 09f91220a..3bc7ae1f8 100644 --- a/kernel/x86_64/trsm_kernel_LN_4x4_penryn.S +++ b/kernel/x86_64/trsm_kernel_LN_4x4_penryn.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx @@ -51,7 +51,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -95,7 +95,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -180,7 +180,7 @@ movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 4), %rax subq %rax, C #endif @@ -195,7 +195,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif movq K, %rax salq $BASE_SHIFT + 2, %rax @@ -225,7 +225,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif movsd -16 * SIZE(AO), %xmm0 movaps -16 * SIZE(BO), %xmm2 @@ -247,7 +247,7 @@ jle .L35 ALIGN_4 -.L32: +.L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) shufps $0x44, %xmm0, %xmm0 @@ -481,7 +481,7 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L20: testq $2, M @@ -503,7 +503,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 pxor %xmm3, %xmm3 @@ -526,7 +526,7 @@ jle .L25 ALIGN_4 -.L22: +.L22: addpd %xmm3, %xmm11 movaps -14 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 @@ -857,7 +857,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L30: movq M, I @@ -881,7 +881,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif prefetcht2 -16 * SIZE(BB) subq $-8 * SIZE, BB @@ -936,7 +936,7 @@ jle .L15 ALIGN_3 -.L12: +.L12: addpd %xmm3, %xmm11 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps -14 * SIZE(BO), %xmm3 @@ -1666,7 +1666,7 @@ decq I # i -- BRANCH jg .L11 - ALIGN_4 + ALIGN_4 .L39: #ifdef LN @@ -1705,7 +1705,7 @@ movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 2), %rax subq %rax, C #endif @@ -1720,7 +1720,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif movq K, %rax salq $BASE_SHIFT + 1, %rax @@ -1730,7 +1730,7 @@ movq OFFSET, %rax movq %rax, KK #endif - + testq $1, M BRANCH jle .L60 @@ -1750,7 +1750,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movsd -16 * SIZE(AO), %xmm0 movaps -16 * SIZE(BO), %xmm2 @@ -1769,7 +1769,7 @@ jle .L75 ALIGN_4 -.L72: +.L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) shufps $0x44, %xmm0, %xmm0 @@ -1935,7 +1935,7 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L60: testq $2, M @@ -1957,7 +1957,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1977,7 +1977,7 @@ jle .L65 ALIGN_4 -.L62: +.L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x4e, %xmm2, %xmm7 @@ -2196,7 +2196,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L70: movq M, I @@ -2220,7 +2220,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif prefetcht2 -16 * SIZE(BB) subq $-4 * SIZE, BB @@ -2257,7 +2257,7 @@ jle .L55 ALIGN_4 -.L52: +.L52: movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 @@ -2596,7 +2596,7 @@ decq I BRANCH jg .L51 - ALIGN_4 + ALIGN_4 .L79: #ifdef LN @@ -2631,7 +2631,7 @@ movq K, %rax salq $BASE_SHIFT, %rax subq %rax, B - + subq LDC, C #endif @@ -2644,7 +2644,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -2654,7 +2654,7 @@ testq $1, M BRANCH jle .L90 - + #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax @@ -2669,7 +2669,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif movsd -16 * SIZE(AO), %xmm0 movsd -16 * SIZE(BO), %xmm2 @@ -2688,7 +2688,7 @@ jle .L115 ALIGN_4 -.L112: +.L112: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulsd %xmm0, %xmm2 @@ -2828,7 +2828,7 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L90: testq $2, M @@ -2850,7 +2850,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -2869,7 +2869,7 @@ jle .L105 ALIGN_4 -.L102: +.L102: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x44, %xmm2, %xmm3 @@ -3049,7 +3049,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L110: movq M, I @@ -3073,7 +3073,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 @@ -3100,7 +3100,7 @@ jle .L95 ALIGN_4 -.L92: +.L92: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x44, %xmm2, %xmm3 @@ -3377,7 +3377,7 @@ decq I BRANCH jg .L91 - ALIGN_4 + ALIGN_4 .L119: #ifdef LN diff --git a/kernel/x86_64/trsm_kernel_LN_4x4_sse2.S b/kernel/x86_64/trsm_kernel_LN_4x4_sse2.S index ca0bfbdc5..c846080a4 100644 --- a/kernel/x86_64/trsm_kernel_LN_4x4_sse2.S +++ b/kernel/x86_64/trsm_kernel_LN_4x4_sse2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M %rdi #define N %rsi #define K %rdx @@ -212,10 +212,10 @@ movapd 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm14, %xmm7 ;\ movapd 22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 - + PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) @@ -284,7 +284,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, %rax @@ -303,7 +303,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO @@ -319,7 +319,7 @@ leaq (, %rax, SIZE), %rax leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -337,7 +337,7 @@ addq %rax, %rax ALIGN_4 - + .L02: PREFETCHNTA 40 * SIZE(B) @@ -406,7 +406,7 @@ decq %rax jne .L04 ALIGN_4 - + .L10: #if defined(LT) || defined(RN) movq A, AO @@ -449,7 +449,7 @@ movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif movsd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -791,8 +791,8 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L20: testq $2, M je .L30 @@ -818,7 +818,7 @@ movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -1255,7 +1255,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L30: @@ -1284,7 +1284,7 @@ movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif movapd 0 * SIZE(BO), %xmm9 movapd 2 * SIZE(BO), %xmm11 @@ -1329,7 +1329,7 @@ andq $-8, %rax salq $4, %rax je .L15 -.L1X: +.L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ -1968,7 +1968,7 @@ decq I # i -- jg .L11 - ALIGN_4 + ALIGN_4 .L39: #ifdef LN @@ -2005,12 +2005,12 @@ .L41: /* Copying to Sub Buffer */ - + #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO @@ -2026,7 +2026,7 @@ leaq (, %rax, SIZE), %rax leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -2042,7 +2042,7 @@ sarq $2, %rax jle .L43 ALIGN_4 - + .L42: PREFETCH 56 * SIZE(B) @@ -2105,7 +2105,7 @@ decq %rax jne .L44 ALIGN_4 - + .L50: #if defined(LT) || defined(RN) movq A, AO @@ -2148,7 +2148,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif movsd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -2379,7 +2379,7 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L60: testq $2, M @@ -2406,7 +2406,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -2688,7 +2688,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L70: movq M, I @@ -2716,7 +2716,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -3163,7 +3163,7 @@ decq I # i -- jg .L51 - ALIGN_4 + ALIGN_4 .L79: #ifdef LN @@ -3199,7 +3199,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO @@ -3215,7 +3215,7 @@ leaq (, %rax, SIZE), %rax leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -3231,7 +3231,7 @@ sarq $3, %rax jle .L83 ALIGN_4 - + .L82: PREFETCH 56 * SIZE(B) @@ -3291,7 +3291,7 @@ decq %rax jne .L84 ALIGN_4 - + .L90: #if defined(LT) || defined(RN) movq A, AO @@ -3332,7 +3332,7 @@ movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif movsd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -3511,7 +3511,7 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L100: testq $2, M @@ -3538,7 +3538,7 @@ movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -3748,7 +3748,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L110: movq M, I @@ -3776,7 +3776,7 @@ movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -4095,7 +4095,7 @@ decq I # i -- jg .L91 - ALIGN_4 + ALIGN_4 .L119: #ifdef LN diff --git a/kernel/x86_64/trsm_kernel_LN_4x4_sse3.S b/kernel/x86_64/trsm_kernel_LN_4x4_sse3.S index 66a5e40d3..fedeb5a58 100644 --- a/kernel/x86_64/trsm_kernel_LN_4x4_sse3.S +++ b/kernel/x86_64/trsm_kernel_LN_4x4_sse3.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M %rdi #define N %rsi #define K %rdx @@ -333,7 +333,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) @@ -390,7 +390,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, %rax @@ -402,7 +402,7 @@ sarq $2, J # j = (n >> 2) jle .L40 ALIGN_4 - + .L10: #if defined(LT) || defined(RN) movq A, AO @@ -414,7 +414,7 @@ movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 4), %rax subq %rax, C #endif @@ -429,7 +429,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif movq K, %rax salq $BASE_SHIFT + 2, %rax @@ -460,7 +460,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif movddup 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -756,8 +756,8 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L30: testq $2, M BRANCH @@ -780,7 +780,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -1159,7 +1159,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L20: movq M, I @@ -1183,7 +1183,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif prefetcht0 0 * SIZE(BB) subq $-8 * SIZE, BB @@ -1233,7 +1233,7 @@ andq $-8, %rax salq $4, %rax je .L15 -.L1X: +.L1X: KERNEL1 (16 * 0) KERNEL2 (16 * 0) KERNEL3 (16 * 0) @@ -2121,7 +2121,7 @@ movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 2), %rax subq %rax, C #endif @@ -2136,7 +2136,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif movq K, %rax salq $BASE_SHIFT + 1, %rax @@ -2167,7 +2167,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movddup 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -2362,9 +2362,9 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 -.L70: +.L70: testq $2, M je .L60 ALIGN_4 @@ -2384,7 +2384,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -2631,7 +2631,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L60: movq M, I @@ -2654,7 +2654,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif prefetcht0 0 * SIZE(BB) subq $-4 * SIZE, BB @@ -3060,9 +3060,9 @@ decq I # i -- jg .L51 - ALIGN_4 + ALIGN_4 + - .L79: #ifdef LN leaq (, K, SIZE), %rax @@ -3097,7 +3097,7 @@ movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, B - + subq LDC, C #endif @@ -3110,7 +3110,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -3136,7 +3136,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif movsd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -3289,7 +3289,7 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L110: testq $2, M @@ -3311,7 +3311,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -3505,7 +3505,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L100: movq M, I @@ -3528,7 +3528,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -3823,7 +3823,7 @@ decq I # i -- jg .L91 - ALIGN_4 + ALIGN_4 .L119: #ifdef LN @@ -3843,7 +3843,7 @@ subq $1, KK #endif ALIGN_2 - + .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp diff --git a/kernel/x86_64/trsm_kernel_LN_4x8_nehalem.S b/kernel/x86_64/trsm_kernel_LN_4x8_nehalem.S index 28c2ca051..8717fc336 100644 --- a/kernel/x86_64/trsm_kernel_LN_4x8_nehalem.S +++ b/kernel/x86_64/trsm_kernel_LN_4x8_nehalem.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx @@ -51,7 +51,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -94,7 +94,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -180,7 +180,7 @@ movq K, %rax salq $3 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 8), %rax subq %rax, C #endif @@ -195,7 +195,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -224,7 +224,7 @@ leaq (B, %rax, 8), BO #else movq B, BO -#endif +#endif xorps %xmm2, %xmm2 movsd -32 * SIZE(AO), %xmm0 @@ -243,7 +243,7 @@ jle .L35 ALIGN_3 -.L32: +.L32: pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movaps -32 * SIZE(BO), %xmm2 @@ -713,7 +713,7 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L20: testq $2, M @@ -734,7 +734,7 @@ leaq (B, %rax, 8), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -759,7 +759,7 @@ jle .L25 ALIGN_3 -.L22: +.L22: addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 @@ -904,7 +904,7 @@ movaps %xmm8, %xmm4 shufps $0x88, %xmm9, %xmm8 shufps $0xdd, %xmm9, %xmm4 - + movaps %xmm10, %xmm5 shufps $0x88, %xmm11, %xmm10 shufps $0xdd, %xmm11, %xmm5 @@ -1327,7 +1327,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L30: movq M, I @@ -1351,7 +1351,7 @@ leaq (B, %rax, 8), BO #else movq B, BO -#endif +#endif prefetchnta -32 * SIZE(BB) subq $-16 * SIZE, BB @@ -1393,7 +1393,7 @@ jle .L15 ALIGN_3 -.L12: +.L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm12 @@ -2195,8 +2195,8 @@ decq I BRANCH jg .L11 - ALIGN_4 - + ALIGN_4 + .L39: #ifdef LN leaq (, K, SIZE), %rax @@ -2233,7 +2233,7 @@ movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 4), %rax subq %rax, C #endif @@ -2248,7 +2248,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -2273,7 +2273,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif xorps %xmm2, %xmm2 movsd -32 * SIZE(AO), %xmm0 @@ -2291,7 +2291,7 @@ jle .L65 ALIGN_3 -.L62: +.L62: pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movaps -32 * SIZE(BO), %xmm2 @@ -2522,7 +2522,7 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L50: testq $2, M @@ -2543,7 +2543,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -2563,7 +2563,7 @@ jle .L55 ALIGN_3 -.L52: +.L52: addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 @@ -2661,7 +2661,7 @@ movaps %xmm8, %xmm4 shufps $0x88, %xmm9, %xmm8 shufps $0xdd, %xmm9, %xmm4 - + movaps -32 * SIZE(BO), %xmm0 movaps -28 * SIZE(BO), %xmm1 @@ -2840,7 +2840,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L60: movq M, I @@ -2864,7 +2864,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 @@ -2892,7 +2892,7 @@ jle .L45 ALIGN_3 -.L42: +.L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 @@ -3301,7 +3301,7 @@ decq I BRANCH jg .L41 - ALIGN_4 + ALIGN_4 .L69: #ifdef LN @@ -3335,7 +3335,7 @@ movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 2), %rax subq %rax, C #endif @@ -3350,7 +3350,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -3375,7 +3375,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif xorps %xmm2, %xmm2 movsd -32 * SIZE(AO), %xmm0 @@ -3393,7 +3393,7 @@ jle .L95 ALIGN_3 -.L92: +.L92: pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movsd -32 * SIZE(BO), %xmm2 @@ -3556,7 +3556,7 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L80: testq $2, M @@ -3577,7 +3577,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -3597,7 +3597,7 @@ jle .L85 ALIGN_3 -.L82: +.L82: addps %xmm1, %xmm8 movsd -32 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 @@ -3677,7 +3677,7 @@ #if defined(LN) || defined(LT) pshufd $0xd8, %xmm8, %xmm8 - + movaps -32 * SIZE(BO), %xmm0 #else movaps -32 * SIZE(AO), %xmm0 @@ -3782,7 +3782,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L90: movq M, I @@ -3806,7 +3806,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 @@ -3829,7 +3829,7 @@ jle .L75 ALIGN_3 -.L72: +.L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 @@ -4110,8 +4110,8 @@ decq I BRANCH jg .L71 - ALIGN_4 - + ALIGN_4 + .L99: #ifdef LN leaq (, K, SIZE), %rax @@ -4144,7 +4144,7 @@ movq K, %rax salq $BASE_SHIFT, %rax subq %rax, B - + subq LDC, C #endif @@ -4157,7 +4157,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -4182,7 +4182,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif xorps %xmm2, %xmm2 movss -32 * SIZE(AO), %xmm0 @@ -4199,7 +4199,7 @@ jle .L125 ALIGN_3 -.L122: +.L122: addss %xmm2, %xmm8 movss -32 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 @@ -4325,7 +4325,7 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L110: testq $2, M @@ -4346,7 +4346,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -4363,7 +4363,7 @@ jle .L115 ALIGN_3 -.L112: +.L112: addps %xmm1, %xmm8 movss -32 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 @@ -4549,7 +4549,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 @@ -4568,7 +4568,7 @@ jle .L105 ALIGN_3 -.L102: +.L102: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 @@ -4798,7 +4798,7 @@ decq I BRANCH jg .L101 - ALIGN_4 + ALIGN_4 .L129: #ifdef LN diff --git a/kernel/x86_64/trsm_kernel_LN_8x4_sse.S b/kernel/x86_64/trsm_kernel_LN_8x4_sse.S index 552dbacdc..a31874763 100644 --- a/kernel/x86_64/trsm_kernel_LN_8x4_sse.S +++ b/kernel/x86_64/trsm_kernel_LN_8x4_sse.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M %rdi #define N %rsi #define K %rdx @@ -107,11 +107,11 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp EMMS - + movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) @@ -179,7 +179,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, %rax @@ -198,10 +198,10 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO - + #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax @@ -214,7 +214,7 @@ salq $2 + BASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -230,7 +230,7 @@ sarq $2, %rax jle .L03 ALIGN_4 - + .L02: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 @@ -312,7 +312,7 @@ decq %rax jne .L04 ALIGN_4 - + .L10: #if defined(LT) || defined(RN) movq A, AO @@ -352,7 +352,7 @@ movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movss 0 * SIZE(AO), %xmm8 movss 4 * SIZE(AO), %xmm10 @@ -731,7 +731,7 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L20: testq $2, M @@ -756,7 +756,7 @@ movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 8 * SIZE(AO), %xmm10 @@ -1179,7 +1179,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L30: testq $4, M @@ -1204,7 +1204,7 @@ movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -1700,7 +1700,7 @@ salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L40: movq M, I @@ -1728,7 +1728,7 @@ movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(BO), %xmm9 movaps 4 * SIZE(BO), %xmm11 @@ -1762,7 +1762,7 @@ sarq $2, %rax je .L15 ALIGN_4 - + .L12: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 @@ -2574,7 +2574,7 @@ decq I # i -- jg .L11 - ALIGN_4 + ALIGN_4 .L49: #ifdef LN @@ -2608,10 +2608,10 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO - + #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax @@ -2624,7 +2624,7 @@ salq $1 + BASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -2640,7 +2640,7 @@ sarq $2, %rax jle .L53 ALIGN_4 - + .L52: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 @@ -2697,7 +2697,7 @@ decq %rax jne .L54 ALIGN_4 - + .L60: #if defined(LT) || defined(RN) movq A, AO @@ -2736,7 +2736,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movss 0 * SIZE(AO), %xmm8 movss 4 * SIZE(AO), %xmm10 @@ -2987,7 +2987,7 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L70: testq $2, M @@ -3012,7 +3012,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 8 * SIZE(AO), %xmm10 @@ -3306,7 +3306,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L80: testq $4, M @@ -3331,7 +3331,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -3695,7 +3695,7 @@ salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L90: movq M, I @@ -3723,7 +3723,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -4427,7 +4427,7 @@ decq I # i -- jg .L61 - ALIGN_4 + ALIGN_4 .L99: #ifdef LN @@ -4459,10 +4459,10 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO - + #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax @@ -4475,7 +4475,7 @@ salq $BASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -4548,7 +4548,7 @@ decq %rax jne .L104 ALIGN_4 - + .L110: #if defined(LT) || defined(RN) movq A, AO @@ -4586,7 +4586,7 @@ movq KK, %rax salq $BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movss 0 * SIZE(AO), %xmm8 movss 4 * SIZE(AO), %xmm10 @@ -4753,7 +4753,7 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L120: testq $2, M @@ -4778,7 +4778,7 @@ movq KK, %rax salq $BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 8 * SIZE(AO), %xmm10 @@ -5001,7 +5001,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L130: testq $4, M @@ -5026,7 +5026,7 @@ movq KK, %rax salq $BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -5311,7 +5311,7 @@ salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L140: movq M, I @@ -5339,7 +5339,7 @@ movq KK, %rax salq $BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -5895,7 +5895,7 @@ decq I # i -- jg .L111 - ALIGN_4 + ALIGN_4 .L149: #ifdef LN diff --git a/kernel/x86_64/trsm_kernel_LT_2x8_nehalem.S b/kernel/x86_64/trsm_kernel_LT_2x8_nehalem.S index b04299ab9..39ed586ce 100644 --- a/kernel/x86_64/trsm_kernel_LT_2x8_nehalem.S +++ b/kernel/x86_64/trsm_kernel_LT_2x8_nehalem.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx @@ -51,7 +51,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -94,7 +94,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -136,7 +136,7 @@ movq OLD_LDC, LDC movq OLD_OFFSET, KK - + leaq (, LDC, SIZE), LDC movq KK, OFFSET @@ -181,7 +181,7 @@ movq K, %rax salq $3 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 8), %rax subq %rax, C #endif @@ -196,7 +196,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -228,7 +228,7 @@ leaq (B, %rax, 8), BO #else movq B, BO -#endif +#endif prefetcht0 -16 * SIZE(BB) subq $-8 * SIZE, BB @@ -270,7 +270,7 @@ jle .L15 ALIGN_3 -.L12: +.L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm12 @@ -898,7 +898,7 @@ decq I BRANCH jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $1, M @@ -920,7 +920,7 @@ leaq (B, %rax, 8), BO #else movq B, BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -940,7 +940,7 @@ jle .L25 ALIGN_3 -.L22: +.L22: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 @@ -1376,7 +1376,7 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L29: #ifdef LN @@ -1415,7 +1415,7 @@ movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 4), %rax subq %rax, C #endif @@ -1430,7 +1430,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -1458,7 +1458,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 @@ -1486,7 +1486,7 @@ jle .L35 ALIGN_3 -.L32: +.L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm8 @@ -1827,7 +1827,7 @@ decq I BRANCH jg .L31 - ALIGN_4 + ALIGN_4 .L40: testq $1, M @@ -1849,7 +1849,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -1869,7 +1869,7 @@ jle .L45 ALIGN_3 -.L42: +.L42: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 @@ -2095,7 +2095,7 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L49: #ifdef LN @@ -2130,7 +2130,7 @@ movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 2), %rax subq %rax, C #endif @@ -2145,7 +2145,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -2173,7 +2173,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 @@ -2197,7 +2197,7 @@ jle .L55 ALIGN_3 -.L52: +.L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm8 @@ -2430,7 +2430,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -2448,7 +2448,7 @@ jle .L65 ALIGN_3 -.L62: +.L62: mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 @@ -2604,7 +2604,7 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L69: #ifdef LN @@ -2639,7 +2639,7 @@ movq K, %rax salq $BASE_SHIFT, %rax subq %rax, B - + subq LDC, C #endif @@ -2652,7 +2652,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -2680,7 +2680,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 @@ -2703,7 +2703,7 @@ jle .L75 ALIGN_3 -.L72: +.L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm8 @@ -2864,7 +2864,7 @@ decq I BRANCH jg .L71 - ALIGN_4 + ALIGN_4 .L80: testq $1, M @@ -2886,7 +2886,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif movsd -16 * SIZE(AO), %xmm0 movhps -15 * SIZE(AO), %xmm0 @@ -2906,7 +2906,7 @@ jle .L85 ALIGN_3 -.L82: +.L82: mulpd %xmm0, %xmm1 movsd -14 * SIZE(AO), %xmm0 movhps -13 * SIZE(AO), %xmm0 @@ -3027,7 +3027,7 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L89: #ifdef LN diff --git a/kernel/x86_64/trsm_kernel_LT_4x2_atom.S b/kernel/x86_64/trsm_kernel_LT_4x2_atom.S index c6ad0a2cc..04b7e2de1 100644 --- a/kernel/x86_64/trsm_kernel_LT_4x2_atom.S +++ b/kernel/x86_64/trsm_kernel_LT_4x2_atom.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M %rdi #define N %rsi #define K %rdx @@ -90,7 +90,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) @@ -146,7 +146,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, %rax @@ -158,7 +158,7 @@ sarq $1, J jle .L40 ALIGN_4 - + .L10: #if defined(LT) || defined(RN) movq A, AO @@ -170,7 +170,7 @@ movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 2), %rax subq %rax, C #endif @@ -185,7 +185,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif movq K, %rax salq $BASE_SHIFT + 1, %rax @@ -216,7 +216,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif prefetcht0 0 * SIZE(BB) subq $-8 * SIZE, BB @@ -262,7 +262,7 @@ addsd %xmm7, %xmm14 movsd 3 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 - + addsd %xmm6, %xmm15 PREFETCH ((PREFETCHSIZE) >> 1 + 0) * SIZE(BO) movaps %xmm4, %xmm6 @@ -775,7 +775,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 @@ -1081,7 +1081,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -1102,7 +1102,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm7, %xmm7 @@ -1299,8 +1299,8 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L39: #ifdef LN leaq (, K, SIZE), %rax @@ -1337,7 +1337,7 @@ movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, B - + subq LDC, C #endif @@ -1350,7 +1350,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -1377,7 +1377,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm9, %xmm9 @@ -1679,7 +1679,7 @@ decq I # i -- jg .L41 - ALIGN_4 + ALIGN_4 .L50: testq $2, M @@ -1699,7 +1699,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 @@ -1892,7 +1892,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L60: testq $1, M @@ -1912,7 +1912,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm5, %xmm5 @@ -2066,7 +2066,7 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L69: #ifdef LN @@ -2086,7 +2086,7 @@ subq $1, KK #endif ALIGN_2 - + .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp diff --git a/kernel/x86_64/trsm_kernel_LT_4x4_barcelona.S b/kernel/x86_64/trsm_kernel_LT_4x4_barcelona.S index b133bcf4b..b3712004c 100644 --- a/kernel/x86_64/trsm_kernel_LT_4x4_barcelona.S +++ b/kernel/x86_64/trsm_kernel_LT_4x4_barcelona.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -49,7 +49,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -300,7 +300,7 @@ movddup 17 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm0, %xmm2 ;\ addq $8 * SIZE, %rax - + #define KERNEL_SUB1(xx) \ mulpd %xmm1, %xmm0 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ @@ -405,7 +405,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) @@ -470,7 +470,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, %rax @@ -508,7 +508,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif movq K, %rax salq $BASE_SHIFT + 2, %rax @@ -544,7 +544,7 @@ movq KK, %rax leaq (, %rax, SIZE), %rax leaq (BO, %rax, 4), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 movddup -16 * SIZE(BO), %xmm1 @@ -1087,7 +1087,7 @@ decq I # i -- jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $3, M @@ -1117,7 +1117,7 @@ movq KK, %rax leaq (, %rax, SIZE), %rax leaq (BO, %rax, 4), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1454,7 +1454,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -1479,7 +1479,7 @@ movq KK, %rax leaq (, %rax, SIZE), %rax leaq (BO, %rax, 4), BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1744,8 +1744,8 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L39: #ifdef LN leaq (, K, SIZE), %rax @@ -1797,7 +1797,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #if defined(LT) movq OFFSET, %rax @@ -1829,7 +1829,7 @@ movq KK, %rax leaq (, %rax, SIZE), %rax leaq (BO, %rax, 2), BO -#endif +#endif movddup -16 * SIZE(BO), %xmm1 movddup -15 * SIZE(BO), %xmm5 @@ -2170,7 +2170,7 @@ decq I # i -- jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $2, M @@ -2195,7 +2195,7 @@ movq KK, %rax leaq (, %rax, SIZE), %rax leaq (BO, %rax, 2), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -2424,7 +2424,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L70: testq $1, M @@ -2451,7 +2451,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 1), BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -2625,8 +2625,8 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L79: #ifdef LN leaq (, K, SIZE), %rax @@ -2673,7 +2673,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -2704,7 +2704,7 @@ #if defined(LN) || defined(RT) movq KK, %rax leaq (BO, %rax, SIZE), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -2973,13 +2973,13 @@ decq I # i -- jg .L91 - ALIGN_4 + ALIGN_4 .L100: testq $2, M je .L110 -#ifdef LN +#ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG @@ -2997,7 +2997,7 @@ #if defined(LN) || defined(RT) movq KK, %rax leaq (BO, %rax, SIZE), BO -#endif +#endif movddup -16 * SIZE(BO), %xmm0 pxor %xmm8, %xmm8 @@ -3185,7 +3185,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L110: testq $1, M @@ -3211,7 +3211,7 @@ #if defined(LN) || defined(RT) movq KK, %rax leaq (BO, %rax, SIZE), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -3346,7 +3346,7 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L119: #ifdef LN @@ -3366,7 +3366,7 @@ #endif ALIGN_4 - + .L999: movq (%rsp), %rbx movq 8(%rsp), %rbp diff --git a/kernel/x86_64/trsm_kernel_LT_4x4_core2.S b/kernel/x86_64/trsm_kernel_LT_4x4_core2.S index 7864ec550..c5fdc5282 100644 --- a/kernel/x86_64/trsm_kernel_LT_4x4_core2.S +++ b/kernel/x86_64/trsm_kernel_LT_4x4_core2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -49,7 +49,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -83,7 +83,7 @@ #define AORIG 32(%rsp) #define BORIG 40(%rsp) #define BUFFER 128(%rsp) - + #define PREFETCH_R (8 * 4 + 0) #define PREFETCH_W (PREFETCH_R) @@ -92,7 +92,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) @@ -161,7 +161,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, %rax @@ -180,7 +180,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq 16 * SIZE + BUFFER, BO @@ -196,7 +196,7 @@ leaq (, %rax, SIZE), %rax leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO -#endif +#endif #if defined(LT) movq OFFSET, %rax @@ -212,7 +212,7 @@ sarq $2, %rax jle .L03 ALIGN_4 - + .L02: prefetcht0 (PREFETCH_R + 0) * SIZE(B) movapd -16 * SIZE(B), %xmm0 @@ -241,7 +241,7 @@ unpckhpd %xmm6, %xmm6 movddup %xmm7, %xmm15 unpckhpd %xmm7, %xmm7 - + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) movapd %xmm8, -16 * SIZE(BO) movapd %xmm0, -14 * SIZE(BO) @@ -303,7 +303,7 @@ subq $1, %rax jne .L04 ALIGN_4 - + .L10: leaq (PREFETCH_R + 0) * SIZE(B), BB @@ -349,7 +349,7 @@ movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif prefetcht2 0 * SIZE(BB) @@ -384,7 +384,7 @@ jle .L15 ALIGN_4 -.L12: +.L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm2, %xmm10 @@ -1009,7 +1009,7 @@ decq I # i -- jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $3, M @@ -1039,7 +1039,7 @@ movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -1405,7 +1405,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -1432,7 +1432,7 @@ movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -1731,8 +1731,8 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L39: #ifdef LN leaq (, K, SIZE), %rax @@ -1768,12 +1768,12 @@ .L41: /* Copying to Sub Buffer */ - + #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO @@ -1789,7 +1789,7 @@ leaq (, %rax, SIZE), %rax leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -1805,7 +1805,7 @@ sarq $2, %rax jle .L43 ALIGN_4 - + .L42: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 @@ -1855,7 +1855,7 @@ decq %rax jne .L44 ALIGN_4 - + .L50: #if defined(LT) || defined(RN) movq A, AO @@ -1899,7 +1899,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -2283,7 +2283,7 @@ decq I # i -- jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $2, M @@ -2310,7 +2310,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -2555,7 +2555,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L70: testq $1, M @@ -2582,7 +2582,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -2788,8 +2788,8 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L79: #ifdef LN leaq (, K, SIZE), %rax @@ -2824,7 +2824,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO @@ -2840,7 +2840,7 @@ leaq (, %rax, SIZE), %rax leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -2856,7 +2856,7 @@ sarq $3, %rax jle .L83 ALIGN_4 - + .L82: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 @@ -2904,7 +2904,7 @@ subq $1, %rax jne .L84 ALIGN_4 - + .L90: #if defined(LT) || defined(RN) movq A, AO @@ -2946,7 +2946,7 @@ movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 #ifdef LN @@ -3249,7 +3249,7 @@ decq I # i -- jg .L91 - ALIGN_4 + ALIGN_4 .L100: testq $2, M @@ -3276,7 +3276,7 @@ movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -3477,7 +3477,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L110: testq $1, M @@ -3504,7 +3504,7 @@ movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -3674,7 +3674,7 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L119: #ifdef LN @@ -3698,7 +3698,7 @@ #endif ALIGN_4 - + .L999: movq %r15, %rsp diff --git a/kernel/x86_64/trsm_kernel_LT_4x4_penryn.S b/kernel/x86_64/trsm_kernel_LT_4x4_penryn.S index 77fc0c5c0..e186b9452 100644 --- a/kernel/x86_64/trsm_kernel_LT_4x4_penryn.S +++ b/kernel/x86_64/trsm_kernel_LT_4x4_penryn.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx @@ -51,7 +51,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -95,7 +95,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -180,7 +180,7 @@ movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 4), %rax subq %rax, C #endif @@ -195,7 +195,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif movq K, %rax salq $BASE_SHIFT + 2, %rax @@ -227,7 +227,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif prefetcht2 -16 * SIZE(BB) subq $-8 * SIZE, BB @@ -282,7 +282,7 @@ jle .L15 ALIGN_3 -.L12: +.L12: addpd %xmm3, %xmm11 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps -14 * SIZE(BO), %xmm3 @@ -1012,7 +1012,7 @@ decq I # i -- BRANCH jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $2, M @@ -1034,7 +1034,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 pxor %xmm3, %xmm3 @@ -1057,7 +1057,7 @@ jle .L25 ALIGN_4 -.L22: +.L22: addpd %xmm3, %xmm11 movaps -14 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 @@ -1388,7 +1388,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -1410,7 +1410,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif movsd -16 * SIZE(AO), %xmm0 movaps -16 * SIZE(BO), %xmm2 @@ -1432,7 +1432,7 @@ jle .L35 ALIGN_4 -.L32: +.L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) shufps $0x44, %xmm0, %xmm0 @@ -1666,7 +1666,7 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L39: #ifdef LN @@ -1705,7 +1705,7 @@ movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 2), %rax subq %rax, C #endif @@ -1720,7 +1720,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif movq K, %rax salq $BASE_SHIFT + 1, %rax @@ -1730,7 +1730,7 @@ movq OFFSET, %rax movq %rax, KK #endif - + movq M, I sarq $2, I # i = (m >> 2) NOBRANCH @@ -1752,7 +1752,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif prefetcht2 -16 * SIZE(BB) subq $-4 * SIZE, BB @@ -1788,7 +1788,7 @@ jle .L55 ALIGN_4 -.L52: +.L52: movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 @@ -2127,7 +2127,7 @@ decq I BRANCH jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $2, M @@ -2149,7 +2149,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -2169,7 +2169,7 @@ jle .L65 ALIGN_4 -.L62: +.L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x4e, %xmm2, %xmm7 @@ -2388,7 +2388,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L70: testq $1, M @@ -2410,7 +2410,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movsd -16 * SIZE(AO), %xmm0 movaps -16 * SIZE(BO), %xmm2 @@ -2429,7 +2429,7 @@ jle .L75 ALIGN_4 -.L72: +.L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) shufps $0x44, %xmm0, %xmm0 @@ -2595,7 +2595,7 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L79: #ifdef LN @@ -2630,7 +2630,7 @@ movq K, %rax salq $BASE_SHIFT, %rax subq %rax, B - + subq LDC, C #endif @@ -2643,7 +2643,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -2671,7 +2671,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 @@ -2698,7 +2698,7 @@ jle .L95 ALIGN_4 -.L92: +.L92: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x44, %xmm2, %xmm3 @@ -2975,7 +2975,7 @@ decq I BRANCH jg .L91 - ALIGN_4 + ALIGN_4 .L100: testq $2, M @@ -2997,7 +2997,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -3016,7 +3016,7 @@ jle .L105 ALIGN_4 -.L102: +.L102: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x44, %xmm2, %xmm3 @@ -3196,13 +3196,13 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L110: testq $1, M BRANCH jle .L119 - + #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax @@ -3217,7 +3217,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif movsd -16 * SIZE(AO), %xmm0 movsd -16 * SIZE(BO), %xmm2 @@ -3236,7 +3236,7 @@ jle .L115 ALIGN_4 -.L112: +.L112: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulsd %xmm0, %xmm2 @@ -3376,7 +3376,7 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L119: #ifdef LN diff --git a/kernel/x86_64/trsm_kernel_LT_4x4_sse2.S b/kernel/x86_64/trsm_kernel_LT_4x4_sse2.S index d50c8d501..583fb47a5 100644 --- a/kernel/x86_64/trsm_kernel_LT_4x4_sse2.S +++ b/kernel/x86_64/trsm_kernel_LT_4x4_sse2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M %rdi #define N %rsi #define K %rdx @@ -99,7 +99,7 @@ #define PREFETCHSIZE (8 * 4 + 4) #endif -#ifdef OPTERON +#ifdef OPTERON #define movsd movlpd #endif @@ -216,10 +216,10 @@ movapd 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm14, %xmm7 ;\ movapd 22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 - + PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) @@ -288,7 +288,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, %rax @@ -307,7 +307,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO @@ -323,7 +323,7 @@ leaq (, %rax, SIZE), %rax leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO -#endif +#endif #if defined(LT) movq OFFSET, %rax @@ -341,7 +341,7 @@ addq %rax, %rax ALIGN_4 - + .L02: PREFETCHNTA 40 * SIZE(B) @@ -410,7 +410,7 @@ decq %rax jne .L04 ALIGN_4 - + .L10: #if defined(LT) || defined(RN) movq A, AO @@ -454,7 +454,7 @@ movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif movapd 0 * SIZE(BO), %xmm9 movapd 2 * SIZE(BO), %xmm11 @@ -490,7 +490,7 @@ andq $-8, %rax salq $4, %rax je .L15 -.L1X: +.L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ -576,7 +576,7 @@ sarq $3, %rax je .L15 -.L12: +.L12: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ -1159,7 +1159,7 @@ decq I # i -- jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $3, M @@ -1189,7 +1189,7 @@ movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -1626,7 +1626,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -1653,7 +1653,7 @@ movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif movsd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -1995,8 +1995,8 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L39: #ifdef LN leaq (, K, SIZE), %rax @@ -2032,12 +2032,12 @@ .L41: /* Copying to Sub Buffer */ - + #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO @@ -2053,7 +2053,7 @@ leaq (, %rax, SIZE), %rax leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -2069,7 +2069,7 @@ sarq $2, %rax jle .L43 ALIGN_4 - + .L42: PREFETCH 56 * SIZE(B) @@ -2132,7 +2132,7 @@ decq %rax jne .L44 ALIGN_4 - + .L50: #if defined(LT) || defined(RN) movq A, AO @@ -2176,7 +2176,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -2618,7 +2618,7 @@ decq I # i -- jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $2, M @@ -2645,7 +2645,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -2927,7 +2927,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L70: testq $1, M @@ -2954,7 +2954,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif movsd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -3185,8 +3185,8 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L79: #ifdef LN leaq (, K, SIZE), %rax @@ -3221,7 +3221,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO @@ -3237,7 +3237,7 @@ leaq (, %rax, SIZE), %rax leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -3253,7 +3253,7 @@ sarq $3, %rax jle .L83 ALIGN_4 - + .L82: PREFETCH 56 * SIZE(B) @@ -3313,7 +3313,7 @@ decq %rax jne .L84 ALIGN_4 - + .L90: #if defined(LT) || defined(RN) movq A, AO @@ -3355,7 +3355,7 @@ movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -3670,7 +3670,7 @@ decq I # i -- jg .L91 - ALIGN_4 + ALIGN_4 .L100: testq $2, M @@ -3697,7 +3697,7 @@ movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -3907,7 +3907,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L110: testq $1, M @@ -3934,7 +3934,7 @@ movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif movsd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -4113,7 +4113,7 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L119: #ifdef LN @@ -4137,7 +4137,7 @@ #endif ALIGN_4 - + .L999: movq %rbx, %rsp diff --git a/kernel/x86_64/trsm_kernel_LT_4x4_sse3.S b/kernel/x86_64/trsm_kernel_LT_4x4_sse3.S index 266f44243..3856c7209 100644 --- a/kernel/x86_64/trsm_kernel_LT_4x4_sse3.S +++ b/kernel/x86_64/trsm_kernel_LT_4x4_sse3.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M %rdi #define N %rsi #define K %rdx @@ -333,7 +333,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) @@ -389,7 +389,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, %rax @@ -401,7 +401,7 @@ sarq $2, J # j = (n >> 2) jle .L40 ALIGN_4 - + .L10: #if defined(LT) || defined(RN) movq A, AO @@ -413,7 +413,7 @@ movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 4), %rax subq %rax, C #endif @@ -428,7 +428,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif movq K, %rax salq $BASE_SHIFT + 2, %rax @@ -459,7 +459,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif prefetcht0 0 * SIZE(BB) subq $-8 * SIZE, BB @@ -498,7 +498,7 @@ andq $-8, %rax salq $4, %rax je .L15 -.L1X: +.L1X: KERNEL1 (16 * 0) KERNEL2 (16 * 0) KERNEL3 (16 * 0) @@ -1369,7 +1369,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -1748,7 +1748,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -1771,7 +1771,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif movddup 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -2067,8 +2067,8 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L39: #ifdef LN leaq (, K, SIZE), %rax @@ -2105,7 +2105,7 @@ movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 2), %rax subq %rax, C #endif @@ -2120,7 +2120,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif movq K, %rax salq $BASE_SHIFT + 1, %rax @@ -2151,7 +2151,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif prefetcht0 0 * SIZE(BB) subq $-4 * SIZE, BB @@ -2557,7 +2557,7 @@ decq I # i -- jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $2, M @@ -2579,7 +2579,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -2826,7 +2826,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L70: testq $1, M @@ -2848,7 +2848,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movddup 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -3043,8 +3043,8 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L79: #ifdef LN leaq (, K, SIZE), %rax @@ -3079,7 +3079,7 @@ movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, B - + subq LDC, C #endif @@ -3093,7 +3093,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -3120,7 +3120,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -3415,7 +3415,7 @@ decq I # i -- jg .L91 - ALIGN_4 + ALIGN_4 .L100: testq $2, M @@ -3437,7 +3437,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -3631,7 +3631,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L110: testq $1, M @@ -3653,7 +3653,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif movapd 0 * SIZE(AO), %xmm9 pxor %xmm0, %xmm0 @@ -3806,7 +3806,7 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L119: #ifdef LN @@ -3826,7 +3826,7 @@ subq $1, KK #endif ALIGN_2 - + .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp diff --git a/kernel/x86_64/trsm_kernel_LT_4x8_nehalem.S b/kernel/x86_64/trsm_kernel_LT_4x8_nehalem.S index 917f8f9a5..28f38bd61 100644 --- a/kernel/x86_64/trsm_kernel_LT_4x8_nehalem.S +++ b/kernel/x86_64/trsm_kernel_LT_4x8_nehalem.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx @@ -51,7 +51,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -94,7 +94,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -180,7 +180,7 @@ movq K, %rax salq $3 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 8), %rax subq %rax, C #endif @@ -195,7 +195,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -227,7 +227,7 @@ leaq (B, %rax, 8), BO #else movq B, BO -#endif +#endif prefetchnta -32 * SIZE(BB) subq $-16 * SIZE, BB @@ -269,7 +269,7 @@ jle .L15 ALIGN_3 -.L12: +.L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm12 @@ -1071,7 +1071,7 @@ decq I BRANCH jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $2, M @@ -1092,7 +1092,7 @@ leaq (B, %rax, 8), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -1117,7 +1117,7 @@ jle .L25 ALIGN_3 -.L22: +.L22: addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 @@ -1262,7 +1262,7 @@ movaps %xmm8, %xmm4 shufps $0x88, %xmm9, %xmm8 shufps $0xdd, %xmm9, %xmm4 - + movaps %xmm10, %xmm5 shufps $0x88, %xmm11, %xmm10 shufps $0xdd, %xmm11, %xmm5 @@ -1685,7 +1685,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -1706,7 +1706,7 @@ leaq (B, %rax, 8), BO #else movq B, BO -#endif +#endif xorps %xmm2, %xmm2 movsd -32 * SIZE(AO), %xmm0 @@ -1725,7 +1725,7 @@ jle .L35 ALIGN_3 -.L32: +.L32: pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movaps -32 * SIZE(BO), %xmm2 @@ -2195,8 +2195,8 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L39: #ifdef LN leaq (, K, SIZE), %rax @@ -2233,7 +2233,7 @@ movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 4), %rax subq %rax, C #endif @@ -2248,7 +2248,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -2276,7 +2276,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 @@ -2304,7 +2304,7 @@ jle .L45 ALIGN_3 -.L42: +.L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 @@ -2713,7 +2713,7 @@ decq I BRANCH jg .L41 - ALIGN_4 + ALIGN_4 .L50: testq $2, M @@ -2734,7 +2734,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -2754,7 +2754,7 @@ jle .L55 ALIGN_3 -.L52: +.L52: addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 @@ -2852,7 +2852,7 @@ movaps %xmm8, %xmm4 shufps $0x88, %xmm9, %xmm8 shufps $0xdd, %xmm9, %xmm4 - + movaps -32 * SIZE(BO), %xmm0 movaps -28 * SIZE(BO), %xmm1 @@ -3031,7 +3031,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L60: testq $1, M @@ -3052,7 +3052,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif xorps %xmm2, %xmm2 movsd -32 * SIZE(AO), %xmm0 @@ -3070,7 +3070,7 @@ jle .L65 ALIGN_3 -.L62: +.L62: pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movaps -32 * SIZE(BO), %xmm2 @@ -3301,8 +3301,8 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L69: #ifdef LN leaq (, K, SIZE), %rax @@ -3335,7 +3335,7 @@ movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 2), %rax subq %rax, C #endif @@ -3350,7 +3350,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -3378,7 +3378,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 @@ -3401,7 +3401,7 @@ jle .L75 ALIGN_3 -.L72: +.L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 @@ -3682,7 +3682,7 @@ decq I BRANCH jg .L71 - ALIGN_4 + ALIGN_4 .L80: testq $2, M @@ -3703,7 +3703,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -3723,7 +3723,7 @@ jle .L85 ALIGN_3 -.L82: +.L82: addps %xmm1, %xmm8 movsd -32 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 @@ -3803,7 +3803,7 @@ #if defined(LN) || defined(LT) pshufd $0xd8, %xmm8, %xmm8 - + movaps -32 * SIZE(BO), %xmm0 #else movaps -32 * SIZE(AO), %xmm0 @@ -3908,7 +3908,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L90: testq $1, M @@ -3929,7 +3929,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif xorps %xmm2, %xmm2 movsd -32 * SIZE(AO), %xmm0 @@ -3947,7 +3947,7 @@ jle .L95 ALIGN_3 -.L92: +.L92: pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movsd -32 * SIZE(BO), %xmm2 @@ -4110,8 +4110,8 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L99: #ifdef LN leaq (, K, SIZE), %rax @@ -4144,7 +4144,7 @@ movq K, %rax salq $BASE_SHIFT, %rax subq %rax, B - + subq LDC, C #endif @@ -4157,7 +4157,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -4185,7 +4185,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 @@ -4204,7 +4204,7 @@ jle .L105 ALIGN_3 -.L102: +.L102: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 @@ -4434,7 +4434,7 @@ decq I BRANCH jg .L101 - ALIGN_4 + ALIGN_4 .L110: testq $2, M @@ -4455,7 +4455,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -4472,7 +4472,7 @@ jle .L115 ALIGN_3 -.L112: +.L112: addps %xmm1, %xmm8 movss -32 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 @@ -4655,7 +4655,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif xorps %xmm2, %xmm2 movss -32 * SIZE(AO), %xmm0 @@ -4672,7 +4672,7 @@ jle .L125 ALIGN_3 -.L122: +.L122: addss %xmm2, %xmm8 movss -32 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 @@ -4798,8 +4798,8 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L129: #ifdef LN leaq (, K, SIZE), %rax diff --git a/kernel/x86_64/trsm_kernel_LT_8x4_sse.S b/kernel/x86_64/trsm_kernel_LT_8x4_sse.S index 7727fd591..887c071ba 100644 --- a/kernel/x86_64/trsm_kernel_LT_8x4_sse.S +++ b/kernel/x86_64/trsm_kernel_LT_8x4_sse.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M %rdi #define N %rsi #define K %rdx @@ -53,7 +53,7 @@ #define BO %r14 #define CO1 %r15 #define CO2 %rbp - + #ifndef WINDOWS_ABI #define STACKSIZE 64 @@ -107,11 +107,11 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp EMMS - + movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) @@ -179,7 +179,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, %rax @@ -198,10 +198,10 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO - + #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax @@ -214,7 +214,7 @@ salq $2 + BASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -230,7 +230,7 @@ sarq $2, %rax jle .L03 ALIGN_4 - + .L02: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 @@ -312,7 +312,7 @@ decq %rax jne .L04 ALIGN_4 - + .L10: #if defined(LT) || defined(RN) movq A, AO @@ -356,7 +356,7 @@ movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(BO), %xmm9 movaps 4 * SIZE(BO), %xmm11 @@ -390,7 +390,7 @@ sarq $2, %rax je .L15 ALIGN_4 - + .L12: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 @@ -1202,7 +1202,7 @@ decq I # i -- jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $4, M @@ -1227,7 +1227,7 @@ movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -1723,7 +1723,7 @@ salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L30: testq $2, M @@ -1748,7 +1748,7 @@ movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 8 * SIZE(AO), %xmm10 @@ -2171,7 +2171,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L40: testq $1, M @@ -2195,7 +2195,7 @@ movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movss 0 * SIZE(AO), %xmm8 movss 4 * SIZE(AO), %xmm10 @@ -2574,8 +2574,8 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L49: #ifdef LN leaq (, K, SIZE), %rax @@ -2608,10 +2608,10 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO - + #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax @@ -2624,7 +2624,7 @@ salq $1 + BASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -2640,7 +2640,7 @@ sarq $2, %rax jle .L53 ALIGN_4 - + .L52: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 @@ -2697,7 +2697,7 @@ decq %rax jne .L54 ALIGN_4 - + .L60: #if defined(LT) || defined(RN) movq A, AO @@ -2740,7 +2740,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -3444,7 +3444,7 @@ decq I # i -- jg .L61 - ALIGN_4 + ALIGN_4 .L70: testq $4, M @@ -3469,7 +3469,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -3833,7 +3833,7 @@ salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L80: testq $2, M @@ -3858,7 +3858,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 8 * SIZE(AO), %xmm10 @@ -4152,7 +4152,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L90: testq $1, M @@ -4176,7 +4176,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movss 0 * SIZE(AO), %xmm8 movss 4 * SIZE(AO), %xmm10 @@ -4426,8 +4426,8 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L99: #ifdef LN leaq (, K, SIZE), %rax @@ -4458,10 +4458,10 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO - + #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax @@ -4474,7 +4474,7 @@ salq $BASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -4547,7 +4547,7 @@ decq %rax jne .L104 ALIGN_4 - + .L110: #if defined(LT) || defined(RN) movq A, AO @@ -4589,7 +4589,7 @@ movq KK, %rax salq $BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -5145,7 +5145,7 @@ decq I # i -- jg .L111 - ALIGN_4 + ALIGN_4 .L120: testq $4, M @@ -5170,7 +5170,7 @@ movq KK, %rax salq $BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -5455,7 +5455,7 @@ salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L130: testq $2, M @@ -5480,7 +5480,7 @@ movq KK, %rax salq $BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 8 * SIZE(AO), %xmm10 @@ -5703,7 +5703,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L140: testq $1, M @@ -5727,7 +5727,7 @@ movq KK, %rax salq $BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movss 0 * SIZE(AO), %xmm8 movss 4 * SIZE(AO), %xmm10 @@ -5894,7 +5894,7 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L149: #ifdef LN diff --git a/kernel/x86_64/trsm_kernel_RT_2x8_nehalem.S b/kernel/x86_64/trsm_kernel_RT_2x8_nehalem.S index 8c7f92fbd..b8e75e798 100644 --- a/kernel/x86_64/trsm_kernel_RT_2x8_nehalem.S +++ b/kernel/x86_64/trsm_kernel_RT_2x8_nehalem.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx @@ -51,7 +51,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -94,7 +94,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -178,7 +178,7 @@ movq K, %rax salq $BASE_SHIFT, %rax subq %rax, B - + subq LDC, C #endif @@ -191,7 +191,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -219,7 +219,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 @@ -242,7 +242,7 @@ jle .L75 ALIGN_3 -.L72: +.L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm8 @@ -403,7 +403,7 @@ decq I BRANCH jg .L71 - ALIGN_4 + ALIGN_4 .L80: testq $1, M @@ -425,7 +425,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif movsd -16 * SIZE(AO), %xmm0 movhps -15 * SIZE(AO), %xmm0 @@ -445,7 +445,7 @@ jle .L85 ALIGN_3 -.L82: +.L82: mulpd %xmm0, %xmm1 movsd -14 * SIZE(AO), %xmm0 movhps -13 * SIZE(AO), %xmm0 @@ -566,7 +566,7 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L89: #ifdef LN @@ -601,7 +601,7 @@ movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 2), %rax subq %rax, C #endif @@ -616,7 +616,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -644,7 +644,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 @@ -668,7 +668,7 @@ jle .L55 ALIGN_3 -.L52: +.L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm8 @@ -901,7 +901,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -919,7 +919,7 @@ jle .L65 ALIGN_3 -.L62: +.L62: mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 @@ -1075,7 +1075,7 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L69: #ifdef LN @@ -1110,7 +1110,7 @@ movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 4), %rax subq %rax, C #endif @@ -1125,7 +1125,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -1153,7 +1153,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 @@ -1181,7 +1181,7 @@ jle .L35 ALIGN_3 -.L32: +.L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm8 @@ -1522,7 +1522,7 @@ decq I BRANCH jg .L31 - ALIGN_4 + ALIGN_4 .L40: testq $1, M @@ -1544,7 +1544,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -1564,7 +1564,7 @@ jle .L45 ALIGN_3 -.L42: +.L42: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 @@ -1790,7 +1790,7 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L49: #ifdef LN @@ -1828,7 +1828,7 @@ movq K, %rax salq $3 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 8), %rax subq %rax, C #endif @@ -1843,7 +1843,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -1875,7 +1875,7 @@ leaq (B, %rax, 8), BO #else movq B, BO -#endif +#endif prefetcht0 -16 * SIZE(BB) subq $-8 * SIZE, BB @@ -1918,7 +1918,7 @@ jle .L15 ALIGN_3 -.L12: +.L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm12 @@ -2546,7 +2546,7 @@ decq I BRANCH jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $1, M @@ -2568,7 +2568,7 @@ leaq (B, %rax, 8), BO #else movq B, BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -2588,7 +2588,7 @@ jle .L25 ALIGN_3 -.L22: +.L22: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 @@ -3024,7 +3024,7 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L29: #ifdef LN diff --git a/kernel/x86_64/trsm_kernel_RT_4x2_atom.S b/kernel/x86_64/trsm_kernel_RT_4x2_atom.S index ae49c3837..9b5a93717 100644 --- a/kernel/x86_64/trsm_kernel_RT_4x2_atom.S +++ b/kernel/x86_64/trsm_kernel_RT_4x2_atom.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M %rdi #define N %rsi #define K %rdx @@ -90,7 +90,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) @@ -146,7 +146,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, %rax @@ -168,7 +168,7 @@ movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, B - + subq LDC, C #endif @@ -181,7 +181,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -208,7 +208,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm9, %xmm9 @@ -510,7 +510,7 @@ decq I # i -- jg .L41 - ALIGN_4 + ALIGN_4 .L50: testq $2, M @@ -530,7 +530,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 @@ -723,7 +723,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L60: testq $1, M @@ -743,7 +743,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm5, %xmm5 @@ -897,7 +897,7 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L69: #ifdef LN @@ -923,7 +923,7 @@ sarq $1, J jle .L999 ALIGN_4 - + .L10: #if defined(LT) || defined(RN) movq A, AO @@ -935,7 +935,7 @@ movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 2), %rax subq %rax, C #endif @@ -950,7 +950,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif movq K, %rax salq $BASE_SHIFT + 1, %rax @@ -981,7 +981,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif prefetcht0 0 * SIZE(BB) subq $-8 * SIZE, BB @@ -1027,7 +1027,7 @@ addsd %xmm7, %xmm14 movsd 3 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 - + addsd %xmm6, %xmm15 PREFETCH (PREFETCHSIZE + 0) * SIZE(BO) movaps %xmm4, %xmm6 @@ -1540,7 +1540,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 @@ -1846,7 +1846,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -1867,7 +1867,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm7, %xmm7 @@ -2064,8 +2064,8 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L39: #ifdef LN leaq (, K, SIZE), %rax diff --git a/kernel/x86_64/trsm_kernel_RT_4x4_barcelona.S b/kernel/x86_64/trsm_kernel_RT_4x4_barcelona.S index 400f60ecb..08e92dc30 100644 --- a/kernel/x86_64/trsm_kernel_RT_4x4_barcelona.S +++ b/kernel/x86_64/trsm_kernel_RT_4x4_barcelona.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -49,7 +49,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -300,7 +300,7 @@ movddup 24 * SIZE(BO, %rax, 4), %xmm5 ;\ movapd %xmm0, %xmm2 ;\ addq $8 * SIZE, %rax - + #define KERNEL_SUB1(xx) \ mulpd %xmm1, %xmm0 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ @@ -405,7 +405,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) @@ -470,7 +470,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, %rax @@ -504,7 +504,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -535,7 +535,7 @@ #if defined(LN) || defined(RT) movq KK, %rax leaq (BO, %rax, SIZE), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -804,13 +804,13 @@ decq I # i -- jg .L91 - ALIGN_4 + ALIGN_4 .L100: testq $2, M je .L110 -#ifdef LN +#ifdef LN movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, AORIG @@ -828,7 +828,7 @@ #if defined(LN) || defined(RT) movq KK, %rax leaq (BO, %rax, SIZE), BO -#endif +#endif movddup -16 * SIZE(BO), %xmm0 pxor %xmm8, %xmm8 @@ -1016,7 +1016,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L110: testq $1, M @@ -1040,7 +1040,7 @@ #if defined(LN) || defined(RT) movq KK, %rax leaq (BO, %rax, SIZE), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1175,7 +1175,7 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L119: #ifdef LN @@ -1195,7 +1195,7 @@ #endif ALIGN_4 -.L40: +.L40: testq $2, N je .L80 @@ -1224,7 +1224,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #if defined(LT) movq OFFSET, %rax @@ -1256,7 +1256,7 @@ movq KK, %rax leaq (, %rax, SIZE), %rax leaq (BO, %rax, 2), BO -#endif +#endif movddup -16 * SIZE(BO), %xmm1 movddup -15 * SIZE(BO), %xmm5 @@ -1597,7 +1597,7 @@ decq I # i -- jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $2, M @@ -1622,7 +1622,7 @@ movq KK, %rax leaq (, %rax, SIZE), %rax leaq (BO, %rax, 2), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1851,7 +1851,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L70: testq $1, M @@ -1878,7 +1878,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 1), BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -2052,8 +2052,8 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L79: #ifdef LN leaq (, K, SIZE), %rax @@ -2104,7 +2104,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif movq K, %rax salq $BASE_SHIFT + 2, %rax @@ -2141,7 +2141,7 @@ movq KK, %rax leaq (, %rax, SIZE), %rax leaq (BO, %rax, 4), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 movddup -16 * SIZE(BO), %xmm1 @@ -2683,7 +2683,7 @@ decq I # i -- jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $3, M @@ -2713,7 +2713,7 @@ movq KK, %rax leaq (, %rax, SIZE), %rax leaq (BO, %rax, 4), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -3050,7 +3050,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -3075,7 +3075,7 @@ movq KK, %rax leaq (, %rax, SIZE), %rax leaq (BO, %rax, 4), BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -3340,8 +3340,8 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L39: #ifdef LN leaq (, K, SIZE), %rax diff --git a/kernel/x86_64/trsm_kernel_RT_4x4_core2.S b/kernel/x86_64/trsm_kernel_RT_4x4_core2.S index 89d07cef5..64e0342de 100644 --- a/kernel/x86_64/trsm_kernel_RT_4x4_core2.S +++ b/kernel/x86_64/trsm_kernel_RT_4x4_core2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -49,7 +49,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -83,7 +83,7 @@ #define AORIG 32(%rsp) #define BORIG 40(%rsp) #define BUFFER 128(%rsp) - + #define PREFETCH_R (8 * 4 + 0) #define PREFETCH_W (PREFETCH_R) @@ -92,7 +92,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) @@ -161,7 +161,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, %rax @@ -180,7 +180,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO @@ -196,7 +196,7 @@ leaq (, %rax, SIZE), %rax leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -212,7 +212,7 @@ sarq $3, %rax jle .L83 ALIGN_4 - + .L82: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 @@ -260,7 +260,7 @@ subq $1, %rax jne .L84 ALIGN_4 - + .L90: #if defined(LT) || defined(RN) movq A, AO @@ -302,7 +302,7 @@ movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 #ifdef LN @@ -605,7 +605,7 @@ decq I # i -- jg .L91 - ALIGN_4 + ALIGN_4 .L100: testq $2, M @@ -632,7 +632,7 @@ movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -833,7 +833,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L110: testq $1, M @@ -860,7 +860,7 @@ movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -1030,7 +1030,7 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L119: #ifdef LN @@ -1061,12 +1061,12 @@ .L41: /* Copying to Sub Buffer */ - + #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO @@ -1082,7 +1082,7 @@ leaq (, %rax, SIZE), %rax leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -1098,7 +1098,7 @@ sarq $2, %rax jle .L43 ALIGN_4 - + .L42: movddup -16 * SIZE(B), %xmm0 movddup -15 * SIZE(B), %xmm1 @@ -1148,7 +1148,7 @@ decq %rax jne .L44 ALIGN_4 - + .L50: #if defined(LT) || defined(RN) movq A, AO @@ -1192,7 +1192,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -1576,7 +1576,7 @@ decq I # i -- jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $2, M @@ -1603,7 +1603,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -1848,7 +1848,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L70: testq $1, M @@ -1875,7 +1875,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -2081,8 +2081,8 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L79: #ifdef LN leaq (, K, SIZE), %rax @@ -2117,7 +2117,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq 16 * SIZE + BUFFER, BO @@ -2133,7 +2133,7 @@ leaq (, %rax, SIZE), %rax leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO -#endif +#endif #if defined(LT) movq OFFSET, %rax @@ -2149,7 +2149,7 @@ sarq $2, %rax jle .L03 ALIGN_4 - + .L02: prefetcht0 (PREFETCH_R + 0) * SIZE(B) movapd -16 * SIZE(B), %xmm0 @@ -2178,7 +2178,7 @@ unpckhpd %xmm6, %xmm6 movddup %xmm7, %xmm15 unpckhpd %xmm7, %xmm7 - + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) movapd %xmm8, -16 * SIZE(BO) movapd %xmm0, -14 * SIZE(BO) @@ -2240,7 +2240,7 @@ subq $1, %rax jne .L04 ALIGN_4 - + .L10: leaq (PREFETCH_R + 0) * SIZE(B), BB @@ -2286,7 +2286,7 @@ movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -2332,7 +2332,7 @@ jle .L15 ALIGN_4 -.L12: +.L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm2, %xmm10 @@ -2957,7 +2957,7 @@ decq I # i -- jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $3, M @@ -2987,7 +2987,7 @@ movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -3353,7 +3353,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -3380,7 +3380,7 @@ movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -3679,8 +3679,8 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L39: #ifdef LN leaq (, K, SIZE), %rax diff --git a/kernel/x86_64/trsm_kernel_RT_4x4_penryn.S b/kernel/x86_64/trsm_kernel_RT_4x4_penryn.S index a575d4cb1..f95200ae5 100644 --- a/kernel/x86_64/trsm_kernel_RT_4x4_penryn.S +++ b/kernel/x86_64/trsm_kernel_RT_4x4_penryn.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx @@ -51,7 +51,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -95,7 +95,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -177,7 +177,7 @@ movq K, %rax salq $BASE_SHIFT, %rax subq %rax, B - + subq LDC, C #endif @@ -190,7 +190,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -218,7 +218,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 @@ -245,7 +245,7 @@ jle .L95 ALIGN_4 -.L92: +.L92: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x44, %xmm2, %xmm3 @@ -522,7 +522,7 @@ decq I BRANCH jg .L91 - ALIGN_4 + ALIGN_4 .L100: testq $2, M @@ -544,7 +544,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -563,7 +563,7 @@ jle .L105 ALIGN_4 -.L102: +.L102: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x44, %xmm2, %xmm3 @@ -743,13 +743,13 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L110: testq $1, M BRANCH jle .L119 - + #ifdef LN movq K, %rax salq $BASE_SHIFT, %rax @@ -764,7 +764,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif movsd -16 * SIZE(AO), %xmm0 movsd -16 * SIZE(BO), %xmm2 @@ -783,7 +783,7 @@ jle .L115 ALIGN_4 -.L112: +.L112: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulsd %xmm0, %xmm2 @@ -923,7 +923,7 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L119: #ifdef LN @@ -957,7 +957,7 @@ movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 2), %rax subq %rax, C #endif @@ -972,7 +972,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif movq K, %rax salq $BASE_SHIFT + 1, %rax @@ -983,7 +983,7 @@ movq OFFSET, %rax movq %rax, KK #endif - + movq M, I sarq $2, I # i = (m >> 2) NOBRANCH @@ -1005,7 +1005,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif prefetcht2 -16 * SIZE(BB) subq $-4 * SIZE, BB @@ -1041,7 +1041,7 @@ jle .L55 ALIGN_4 -.L52: +.L52: movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 @@ -1380,7 +1380,7 @@ decq I BRANCH jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $2, M @@ -1402,7 +1402,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1422,7 +1422,7 @@ jle .L65 ALIGN_4 -.L62: +.L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x4e, %xmm2, %xmm7 @@ -1641,7 +1641,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L70: testq $1, M @@ -1663,7 +1663,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movsd -16 * SIZE(AO), %xmm0 movaps -16 * SIZE(BO), %xmm2 @@ -1682,7 +1682,7 @@ jle .L75 ALIGN_4 -.L72: +.L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) shufps $0x44, %xmm0, %xmm0 @@ -1848,7 +1848,7 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L79: #ifdef LN @@ -1886,7 +1886,7 @@ movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 4), %rax subq %rax, C #endif @@ -1901,7 +1901,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif movq K, %rax salq $BASE_SHIFT + 2, %rax @@ -1934,7 +1934,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif prefetcht2 -16 * SIZE(BB) subq $-8 * SIZE, BB @@ -1989,7 +1989,7 @@ jle .L15 ALIGN_3 -.L12: +.L12: addpd %xmm3, %xmm11 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps -14 * SIZE(BO), %xmm3 @@ -2719,7 +2719,7 @@ decq I # i -- BRANCH jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $2, M @@ -2741,7 +2741,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 pxor %xmm3, %xmm3 @@ -2764,7 +2764,7 @@ jle .L25 ALIGN_4 -.L22: +.L22: addpd %xmm3, %xmm11 movaps -14 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 @@ -3095,7 +3095,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -3117,7 +3117,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif movsd -16 * SIZE(AO), %xmm0 movaps -16 * SIZE(BO), %xmm2 @@ -3139,7 +3139,7 @@ jle .L35 ALIGN_4 -.L32: +.L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) shufps $0x44, %xmm0, %xmm0 @@ -3373,7 +3373,7 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L39: #ifdef LN diff --git a/kernel/x86_64/trsm_kernel_RT_4x4_sse2.S b/kernel/x86_64/trsm_kernel_RT_4x4_sse2.S index 07c978ee9..49a5fe6c2 100644 --- a/kernel/x86_64/trsm_kernel_RT_4x4_sse2.S +++ b/kernel/x86_64/trsm_kernel_RT_4x4_sse2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M %rdi #define N %rsi #define K %rdx @@ -99,7 +99,7 @@ #define PREFETCHSIZE (8 * 4 + 4) #endif -#ifdef OPTERON +#ifdef OPTERON #define movsd movlpd #endif @@ -216,10 +216,10 @@ movapd 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm14, %xmm7 ;\ movapd 22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 - + PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) @@ -288,7 +288,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, %rax @@ -307,7 +307,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO @@ -323,7 +323,7 @@ leaq (, %rax, SIZE), %rax leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -339,7 +339,7 @@ sarq $3, %rax jle .L83 ALIGN_4 - + .L82: PREFETCH 56 * SIZE(B) @@ -399,7 +399,7 @@ decq %rax jne .L84 ALIGN_4 - + .L90: #if defined(LT) || defined(RN) movq A, AO @@ -441,7 +441,7 @@ movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -756,7 +756,7 @@ decq I # i -- jg .L91 - ALIGN_4 + ALIGN_4 .L100: testq $2, M @@ -783,7 +783,7 @@ movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -993,7 +993,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L110: testq $1, M @@ -1020,7 +1020,7 @@ movq KK, %rax salq $0 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif movsd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -1199,7 +1199,7 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L119: #ifdef LN @@ -1231,12 +1231,12 @@ .L41: /* Copying to Sub Buffer */ - + #ifdef LN movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO @@ -1252,7 +1252,7 @@ leaq (, %rax, SIZE), %rax leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -1268,7 +1268,7 @@ sarq $2, %rax jle .L43 ALIGN_4 - + .L42: PREFETCH 56 * SIZE(B) @@ -1331,7 +1331,7 @@ decq %rax jne .L44 ALIGN_4 - + .L50: #if defined(LT) || defined(RN) movq A, AO @@ -1375,7 +1375,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -1817,7 +1817,7 @@ decq I # i -- jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $2, M @@ -1844,7 +1844,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -2126,7 +2126,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L70: testq $1, M @@ -2153,7 +2153,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif movsd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -2384,8 +2384,8 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L79: #ifdef LN leaq (, K, SIZE), %rax @@ -2420,7 +2420,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO @@ -2436,7 +2436,7 @@ leaq (, %rax, SIZE), %rax leaq (B, %rax, 4), B leaq (BO, %rax, 8), BO -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -2454,7 +2454,7 @@ addq %rax, %rax ALIGN_4 - + .L02: PREFETCHNTA 40 * SIZE(B) @@ -2523,7 +2523,7 @@ decq %rax jne .L04 ALIGN_4 - + .L10: #if defined(LT) || defined(RN) movq A, AO @@ -2567,7 +2567,7 @@ movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif movapd 0 * SIZE(BO), %xmm9 movapd 2 * SIZE(BO), %xmm11 @@ -2601,7 +2601,7 @@ andq $-8, %rax salq $4, %rax je .L15 -.L1X: +.L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ -3240,7 +3240,7 @@ decq I # i -- jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $3, M @@ -3270,7 +3270,7 @@ movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -3707,7 +3707,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -3734,7 +3734,7 @@ movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif movsd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -4076,8 +4076,8 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L39: #ifdef LN leaq (, K, SIZE), %rax diff --git a/kernel/x86_64/trsm_kernel_RT_4x4_sse3.S b/kernel/x86_64/trsm_kernel_RT_4x4_sse3.S index f0e8bf9a3..b6c56e056 100644 --- a/kernel/x86_64/trsm_kernel_RT_4x4_sse3.S +++ b/kernel/x86_64/trsm_kernel_RT_4x4_sse3.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M %rdi #define N %rsi #define K %rdx @@ -331,7 +331,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) @@ -387,7 +387,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, %rax @@ -409,7 +409,7 @@ movq K, %rax salq $0 + BASE_SHIFT, %rax subq %rax, B - + subq LDC, C #endif @@ -423,7 +423,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -450,7 +450,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -745,7 +745,7 @@ decq I # i -- jg .L91 - ALIGN_4 + ALIGN_4 .L100: testq $2, M @@ -767,7 +767,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -961,7 +961,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L110: testq $1, M @@ -983,7 +983,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif movsd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -1136,7 +1136,7 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L119: #ifdef LN @@ -1172,7 +1172,7 @@ movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 2), %rax subq %rax, C #endif @@ -1187,7 +1187,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -1214,7 +1214,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -1617,7 +1617,7 @@ decq I # i -- jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $2, M @@ -1639,7 +1639,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -1886,7 +1886,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L70: testq $1, M @@ -1908,7 +1908,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movddup 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -2103,8 +2103,8 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L79: #ifdef LN leaq (, K, SIZE), %rax @@ -2129,7 +2129,7 @@ sarq $2, J # j = (n >> 2) jle .L999 ALIGN_4 - + .L10: #if defined(LT) || defined(RN) movq A, AO @@ -2141,7 +2141,7 @@ movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 4), %rax subq %rax, C #endif @@ -2156,7 +2156,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -2184,7 +2184,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -2220,7 +2220,7 @@ andq $-8, %rax salq $4, %rax je .L15 -.L1X: +.L1X: KERNEL1 (16 * 0) KERNEL2 (16 * 0) KERNEL3 (16 * 0) @@ -3093,7 +3093,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -3472,7 +3472,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -3495,7 +3495,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif movddup 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -3791,8 +3791,8 @@ salq $0 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L39: #ifdef LN leaq (, K, SIZE), %rax @@ -3814,7 +3814,7 @@ jg .L10 ALIGN_4 - + .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp diff --git a/kernel/x86_64/trsm_kernel_RT_4x8_nehalem.S b/kernel/x86_64/trsm_kernel_RT_4x8_nehalem.S index ffac798e3..4942f4671 100644 --- a/kernel/x86_64/trsm_kernel_RT_4x8_nehalem.S +++ b/kernel/x86_64/trsm_kernel_RT_4x8_nehalem.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx @@ -51,7 +51,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -94,7 +94,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -176,7 +176,7 @@ movq K, %rax salq $BASE_SHIFT, %rax subq %rax, B - + subq LDC, C #endif @@ -189,7 +189,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -217,7 +217,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 @@ -236,7 +236,7 @@ jle .L105 ALIGN_3 -.L102: +.L102: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 @@ -466,7 +466,7 @@ decq I BRANCH jg .L101 - ALIGN_4 + ALIGN_4 .L110: testq $2, M @@ -487,7 +487,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -504,7 +504,7 @@ jle .L115 ALIGN_3 -.L112: +.L112: addps %xmm1, %xmm8 movss -32 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 @@ -687,7 +687,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif xorps %xmm2, %xmm2 movss -32 * SIZE(AO), %xmm0 @@ -704,7 +704,7 @@ jle .L125 ALIGN_3 -.L122: +.L122: addss %xmm2, %xmm8 movss -32 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 @@ -830,8 +830,8 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L129: #ifdef LN leaq (, K, SIZE), %rax @@ -864,7 +864,7 @@ movq K, %rax salq $1 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 2), %rax subq %rax, C #endif @@ -879,7 +879,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -907,7 +907,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 @@ -930,7 +930,7 @@ jle .L75 ALIGN_3 -.L72: +.L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 @@ -1211,7 +1211,7 @@ decq I BRANCH jg .L71 - ALIGN_4 + ALIGN_4 .L80: testq $2, M @@ -1232,7 +1232,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -1252,7 +1252,7 @@ jle .L85 ALIGN_3 -.L82: +.L82: addps %xmm1, %xmm8 movsd -32 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 @@ -1332,7 +1332,7 @@ #if defined(LN) || defined(LT) pshufd $0xd8, %xmm8, %xmm8 - + movaps -32 * SIZE(BO), %xmm0 #else movaps -32 * SIZE(AO), %xmm0 @@ -1437,7 +1437,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L90: testq $1, M @@ -1458,7 +1458,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif xorps %xmm2, %xmm2 movsd -32 * SIZE(AO), %xmm0 @@ -1476,7 +1476,7 @@ jle .L95 ALIGN_3 -.L92: +.L92: pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movsd -32 * SIZE(BO), %xmm2 @@ -1639,8 +1639,8 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L99: #ifdef LN leaq (, K, SIZE), %rax @@ -1673,7 +1673,7 @@ movq K, %rax salq $2 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 4), %rax subq %rax, C #endif @@ -1688,7 +1688,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -1716,7 +1716,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 @@ -1744,7 +1744,7 @@ jle .L45 ALIGN_3 -.L42: +.L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 @@ -2153,7 +2153,7 @@ decq I BRANCH jg .L41 - ALIGN_4 + ALIGN_4 .L50: testq $2, M @@ -2174,7 +2174,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -2194,7 +2194,7 @@ jle .L55 ALIGN_3 -.L52: +.L52: addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 @@ -2292,7 +2292,7 @@ movaps %xmm8, %xmm4 shufps $0x88, %xmm9, %xmm8 shufps $0xdd, %xmm9, %xmm4 - + movaps -32 * SIZE(BO), %xmm0 movaps -28 * SIZE(BO), %xmm1 @@ -2471,7 +2471,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L60: testq $1, M @@ -2492,7 +2492,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif xorps %xmm2, %xmm2 movsd -32 * SIZE(AO), %xmm0 @@ -2510,7 +2510,7 @@ jle .L65 ALIGN_3 -.L62: +.L62: pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movaps -32 * SIZE(BO), %xmm2 @@ -2741,8 +2741,8 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L69: #ifdef LN leaq (, K, SIZE), %rax @@ -2779,7 +2779,7 @@ movq K, %rax salq $3 + BASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 8), %rax subq %rax, C #endif @@ -2794,7 +2794,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -2826,7 +2826,7 @@ leaq (B, %rax, 8), BO #else movq B, BO -#endif +#endif prefetchnta -32 * SIZE(BB) subq $-16 * SIZE, BB @@ -2868,7 +2868,7 @@ jle .L15 ALIGN_3 -.L12: +.L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm12 @@ -3670,7 +3670,7 @@ decq I BRANCH jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $2, M @@ -3691,7 +3691,7 @@ leaq (B, %rax, 8), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -3716,7 +3716,7 @@ jle .L25 ALIGN_3 -.L22: +.L22: addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 @@ -3861,7 +3861,7 @@ movaps %xmm8, %xmm4 shufps $0x88, %xmm9, %xmm8 shufps $0xdd, %xmm9, %xmm4 - + movaps %xmm10, %xmm5 shufps $0x88, %xmm11, %xmm10 shufps $0xdd, %xmm11, %xmm5 @@ -4284,7 +4284,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -4305,7 +4305,7 @@ leaq (B, %rax, 8), BO #else movq B, BO -#endif +#endif xorps %xmm2, %xmm2 movsd -32 * SIZE(AO), %xmm0 @@ -4324,7 +4324,7 @@ jle .L35 ALIGN_3 -.L32: +.L32: pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movaps -32 * SIZE(BO), %xmm2 @@ -4794,8 +4794,8 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L39: #ifdef LN leaq (, K, SIZE), %rax diff --git a/kernel/x86_64/trsm_kernel_RT_8x4_sse.S b/kernel/x86_64/trsm_kernel_RT_8x4_sse.S index 699364941..c854b9308 100644 --- a/kernel/x86_64/trsm_kernel_RT_8x4_sse.S +++ b/kernel/x86_64/trsm_kernel_RT_8x4_sse.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M %rdi #define N %rsi #define K %rdx @@ -53,7 +53,7 @@ #define BO %r14 #define CO1 %r15 #define CO2 %rbp - + #ifndef WINDOWS_ABI #define STACKSIZE 64 @@ -107,7 +107,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -177,7 +177,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, %rax @@ -192,10 +192,10 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO - + #ifdef RT movq K, %rax salq $BASE_SHIFT, %rax @@ -208,7 +208,7 @@ salq $BASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -281,7 +281,7 @@ decq %rax jne .L104 ALIGN_4 - + .L110: #if defined(LT) || defined(RN) movq A, AO @@ -323,7 +323,7 @@ movq KK, %rax salq $BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -879,7 +879,7 @@ decq I # i -- jg .L111 - ALIGN_4 + ALIGN_4 .L120: testq $4, M @@ -904,7 +904,7 @@ movq KK, %rax salq $BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -1189,7 +1189,7 @@ salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L130: testq $2, M @@ -1214,7 +1214,7 @@ movq KK, %rax salq $BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movsd 0 * SIZE(AO), %xmm8 movhps 2 * SIZE(AO), %xmm8 @@ -1439,7 +1439,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L140: testq $1, M @@ -1463,7 +1463,7 @@ movq KK, %rax salq $BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movss 0 * SIZE(AO), %xmm8 movss 4 * SIZE(AO), %xmm10 @@ -1630,7 +1630,7 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L149: #ifdef LN @@ -1662,10 +1662,10 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO - + #ifdef RT movq K, %rax salq $1 + BASE_SHIFT, %rax @@ -1678,7 +1678,7 @@ salq $1 + BASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -1694,7 +1694,7 @@ sarq $2, %rax jle .L53 ALIGN_4 - + .L52: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 @@ -1751,7 +1751,7 @@ decq %rax jne .L54 ALIGN_4 - + .L60: #if defined(LT) || defined(RN) movq A, AO @@ -1794,7 +1794,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -2498,7 +2498,7 @@ decq I # i -- jg .L61 - ALIGN_4 + ALIGN_4 .L70: testq $4, M @@ -2523,7 +2523,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -2887,7 +2887,7 @@ salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L80: testq $2, M @@ -2912,7 +2912,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif #ifdef movsd xorps %xmm8, %xmm8 @@ -3224,7 +3224,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L90: testq $1, M @@ -3248,7 +3248,7 @@ movq KK, %rax salq $1 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movss 0 * SIZE(AO), %xmm8 movss 4 * SIZE(AO), %xmm10 @@ -3498,8 +3498,8 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L99: #ifdef LN leaq (, K, SIZE), %rax @@ -3534,10 +3534,10 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO - + #ifdef RT movq K, %rax salq $2 + BASE_SHIFT, %rax @@ -3550,7 +3550,7 @@ salq $2 + BASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -3566,7 +3566,7 @@ sarq $2, %rax jle .L03 ALIGN_4 - + .L02: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 @@ -3648,7 +3648,7 @@ decq %rax jne .L04 ALIGN_4 - + .L10: #if defined(LT) || defined(RN) movq A, AO @@ -3692,7 +3692,7 @@ movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(BO), %xmm9 movaps 4 * SIZE(BO), %xmm11 @@ -3726,7 +3726,7 @@ sarq $2, %rax je .L15 ALIGN_4 - + .L12: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 @@ -4538,7 +4538,7 @@ decq I # i -- jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $4, M @@ -4563,7 +4563,7 @@ movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -5059,7 +5059,7 @@ salq $2 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L30: testq $2, M @@ -5084,7 +5084,7 @@ movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif #ifdef movsd xorps %xmm8, %xmm8 @@ -5513,7 +5513,7 @@ salq $1 + BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L40: testq $1, M @@ -5537,7 +5537,7 @@ movq KK, %rax salq $2 + BASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movss 0 * SIZE(AO), %xmm8 movss 4 * SIZE(AO), %xmm10 @@ -5916,8 +5916,8 @@ salq $BASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L49: #ifdef LN leaq (, K, SIZE), %rax diff --git a/kernel/x86_64/xdot.S b/kernel/x86_64/xdot.S index 966b49960..ea97164b2 100644 --- a/kernel/x86_64/xdot.S +++ b/kernel/x86_64/xdot.S @@ -41,7 +41,7 @@ #define STACK 12 #define ARGS 0 - + #define RESULT 4 + STACK + ARGS(%esp) #define STACK_N 8 + STACK + ARGS(%esp) #define STACK_X 12 + STACK + ARGS(%esp) diff --git a/kernel/x86_64/xgemm3m_kernel_2x2.S b/kernel/x86_64/xgemm3m_kernel_2x2.S index 6d116a1d7..843fc243a 100644 --- a/kernel/x86_64/xgemm3m_kernel_2x2.S +++ b/kernel/x86_64/xgemm3m_kernel_2x2.S @@ -46,7 +46,7 @@ #define B ARG5 #define C ARG6 #define LDC %r10 - + #define I %r12 #define J %r13 #define AO %r14 @@ -77,7 +77,7 @@ #endif PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) @@ -93,10 +93,10 @@ negq %rax movq %rax, KK #endif - + addq $8 * SIZE, A addq $8 * SIZE, B - + salq $ZBASE_SHIFT, LDC movq N, %rax @@ -109,7 +109,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq A, AO @@ -132,7 +132,7 @@ salq $BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO -#endif +#endif fldz fldz @@ -152,7 +152,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -178,7 +178,7 @@ FLD -7 * SIZE(BO) fmul %st, %st(2) - + FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -196,7 +196,7 @@ FLD -5 * SIZE(BO) fmul %st, %st(2) - + FLD -5 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -216,7 +216,7 @@ FLD -3 * SIZE(BO) fmul %st, %st(2) - + FLD -3 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -234,7 +234,7 @@ FLD -1 * SIZE(BO) fmul %st, %st(2) - + FLD -1 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -270,7 +270,7 @@ FLD -7 * SIZE(BO) fmul %st, %st(2) - + FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -379,7 +379,7 @@ salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq ( B, %rax, 2), BO -#endif +#endif fldz fldz @@ -389,7 +389,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -542,13 +542,13 @@ .L30: movq N, %rax - testq $1, %rax + testq $1, %rax je .L999 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq A, AO @@ -570,7 +570,7 @@ salq $BASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq ( B, %rax, 1), BO -#endif +#endif fldz fldz @@ -586,7 +586,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -741,7 +741,7 @@ salq $BASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq ( B, %rax, 1), BO -#endif +#endif fldz @@ -750,7 +750,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT diff --git a/kernel/x86_64/xgemm_kernel_1x1.S b/kernel/x86_64/xgemm_kernel_1x1.S index 164e618a4..e0cd1f1df 100644 --- a/kernel/x86_64/xgemm_kernel_1x1.S +++ b/kernel/x86_64/xgemm_kernel_1x1.S @@ -46,7 +46,7 @@ #define B ARG5 #define C ARG6 #define LDC %r10 - + #define I %r12 #define J %r13 #define AO %r14 @@ -96,7 +96,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) @@ -112,15 +112,15 @@ negq %rax movq %rax, KK #endif - + addq $8 * SIZE, A addq $8 * SIZE, B - + salq $ZBASE_SHIFT, LDC cmpq $0, M jle .L999 - + movq N, %rax movq %rax, J testq %rax, %rax @@ -131,7 +131,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq A, AO @@ -151,7 +151,7 @@ salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO -#endif +#endif fldz fldz @@ -169,7 +169,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -195,7 +195,7 @@ FLD -7 * SIZE(BO) fmul %st, %st(2) - + FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -213,7 +213,7 @@ FLD -5 * SIZE(BO) fmul %st, %st(2) - + FLD -5 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -233,7 +233,7 @@ FLD -3 * SIZE(BO) fmul %st, %st(2) - + FLD -3 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -251,7 +251,7 @@ FLD -1 * SIZE(BO) fmul %st, %st(2) - + FLD -1 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -287,7 +287,7 @@ FLD -7 * SIZE(BO) fmul %st, %st(2) - + FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -332,7 +332,7 @@ FST 1 * SIZE(CO) FST 0 * SIZE(CO) #endif - + #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax diff --git a/kernel/x86_64/xgemv_n.S b/kernel/x86_64/xgemv_n.S index db6d80a98..cbde6402d 100644 --- a/kernel/x86_64/xgemv_n.S +++ b/kernel/x86_64/xgemv_n.S @@ -41,9 +41,9 @@ #include "l2param.h" #define P 32 - + #define STACKSIZE 80 - + #define ALPHA_R 8 + STACKSIZE(%rsp) #define ALPHA_I 24 + STACKSIZE(%rsp) #define OLD_INCX 40 + STACKSIZE(%rsp) @@ -71,7 +71,7 @@ #define Y1 %r14 #define XP %r15 #define MIN_N %rbx - + PROLOGUE PROFCODE diff --git a/kernel/x86_64/xgemv_t.S b/kernel/x86_64/xgemv_t.S index c09dcf064..31320f651 100644 --- a/kernel/x86_64/xgemv_t.S +++ b/kernel/x86_64/xgemv_t.S @@ -42,7 +42,7 @@ #define STACKSIZE 80 #define P 4096 - + #define ALPHA_R 8 + STACKSIZE(%rsp) #define ALPHA_I 24 + STACKSIZE(%rsp) #define OLD_INCX 40 + STACKSIZE(%rsp) @@ -70,7 +70,7 @@ #define X1 %r13 #define Y1 %r14 #define MIN_M %rbx - + PROLOGUE PROFCODE diff --git a/kernel/x86_64/xtrsm_kernel_LT_1x1.S b/kernel/x86_64/xtrsm_kernel_LT_1x1.S index 86d4a748b..a61a240fd 100644 --- a/kernel/x86_64/xtrsm_kernel_LT_1x1.S +++ b/kernel/x86_64/xtrsm_kernel_LT_1x1.S @@ -46,7 +46,7 @@ #define B ARG5 #define C ARG6 #define LDC %r10 - + #define I %r12 #define J %r13 #define AO %r14 @@ -59,7 +59,7 @@ #define KK %r11 #define AORIG 48(%rsp) - + #ifdef OPTERON #define PREFETCH prefetch #define PREFETCHW prefetchw @@ -89,7 +89,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) @@ -104,7 +104,7 @@ addq $8 * SIZE, A addq $8 * SIZE, B - + #ifdef LN movq M, %rax salq $ZBASE_SHIFT, %rax @@ -128,7 +128,7 @@ movq OFFSET, %rax negq %rax movq %rax, KK -#endif +#endif #ifdef RT movq N, %rax @@ -138,7 +138,7 @@ cmpq $0, M jle .L999 - + movq N, %rax movq %rax, J testq %rax, %rax @@ -170,7 +170,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif #ifdef LT movq OFFSET, %rax @@ -194,7 +194,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif fldz fldz @@ -229,7 +229,7 @@ FLD -7 * SIZE(BO) fmul %st, %st(2) - + FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -247,7 +247,7 @@ FLD -5 * SIZE(BO) fmul %st, %st(2) - + FLD -5 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -267,7 +267,7 @@ FLD -3 * SIZE(BO) fmul %st, %st(2) - + FLD -3 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -285,7 +285,7 @@ FLD -1 * SIZE(BO) fmul %st, %st(2) - + FLD -1 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -322,7 +322,7 @@ FLD -7 * SIZE(BO) fmul %st, %st(2) - + FLD -7 * SIZE(AO) fmul %st, %st(2) fmulp %st, %st(1) @@ -341,7 +341,7 @@ .L18: faddp %st, %st(3) faddp %st, %st(1) - + fxch %st(1) #if defined(LN) || defined(RT) @@ -421,7 +421,7 @@ FST 0 * SIZE(CO) FST 1 * SIZE(CO) - + #ifndef LN addq $2 * SIZE, CO #endif diff --git a/kernel/x86_64/zamax.S b/kernel/x86_64/zamax.S index 21d96b640..74e127e6c 100644 --- a/kernel/x86_64/zamax.S +++ b/kernel/x86_64/zamax.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M ARG1 #define X ARG2 #define INCX ARG3 @@ -67,9 +67,9 @@ ffreep %st FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs faddp %st, %st(1) addq INCX, X decq M @@ -82,16 +82,16 @@ sarq $2, I jle .L20 ALIGN_4 - + .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs faddp %st, %st(1) fcomi FMOV %st(1), %st(0) @@ -99,9 +99,9 @@ ffreep %st FLD 2 * SIZE(X) - fabs + fabs FLD 3 * SIZE(X) - fabs + fabs faddp %st, %st(1) fcomi FMOV %st(1), %st(0) @@ -109,9 +109,9 @@ ffreep %st FLD 4 * SIZE(X) - fabs + fabs FLD 5 * SIZE(X) - fabs + fabs faddp %st, %st(1) fcomi FMOV %st(1), %st(0) @@ -119,9 +119,9 @@ ffreep %st FLD 6 * SIZE(X) - fabs + fabs FLD 7 * SIZE(X) - fabs + fabs faddp %st, %st(1) fcomi FMOV %st(1), %st(0) @@ -143,9 +143,9 @@ .L21: FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs faddp %st, %st(1) fcomi FMOV %st(1), %st(0) @@ -163,12 +163,12 @@ sarq $2, I jle .L60 ALIGN_4 - + .L50: FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs addq INCX, X faddp %st, %st(1) fcomi @@ -177,9 +177,9 @@ ffreep %st FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs addq INCX, X faddp %st, %st(1) fcomi @@ -188,9 +188,9 @@ ffreep %st FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs addq INCX, X faddp %st, %st(1) fcomi @@ -199,9 +199,9 @@ ffreep %st FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs addq INCX, X faddp %st, %st(1) fcomi @@ -221,9 +221,9 @@ .L61: FLD 0 * SIZE(X) - fabs + fabs FLD 1 * SIZE(X) - fabs + fabs faddp %st, %st(1) fcomi FMOV %st(1), %st(0) diff --git a/kernel/x86_64/zamax_atom.S b/kernel/x86_64/zamax_atom.S index 3f6757410..8b4e144f0 100644 --- a/kernel/x86_64/zamax_atom.S +++ b/kernel/x86_64/zamax_atom.S @@ -38,13 +38,13 @@ #define ASSEMBLER #include "common.h" - + #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define I %rax - + #ifdef USE_MIN #define maxsd minsd #endif @@ -103,7 +103,7 @@ decq I jle .L13 ALIGN_4 - + .L12: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -183,7 +183,7 @@ maxsd %xmm5, %xmm0 maxsd %xmm7, %xmm1 - ALIGN_3 + ALIGN_3 .L17: testq $1, M @@ -225,7 +225,7 @@ decq I jle .L23 ALIGN_4 - + .L22: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -307,7 +307,7 @@ maxsd %xmm5, %xmm0 maxsd %xmm7, %xmm1 - ALIGN_3 + ALIGN_3 .L27: testq $1, M diff --git a/kernel/x86_64/zamax_sse.S b/kernel/x86_64/zamax_sse.S index 5566a35a3..5f8a1f1c6 100644 --- a/kernel/x86_64/zamax_sse.S +++ b/kernel/x86_64/zamax_sse.S @@ -38,18 +38,18 @@ #define ASSEMBLER #include "common.h" - + #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define I %rax - + #ifdef USE_MIN #define maxps minps #define maxss minss #endif - + #include "l1param.h" PROLOGUE @@ -83,7 +83,7 @@ sarq $3, I jle .L35 ALIGN_4 - + .L31: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -164,7 +164,7 @@ maxss %xmm4, %xmm0 maxss %xmm6, %xmm1 addq $4 * SIZE, X - ALIGN_3 + ALIGN_3 .L37: testq $1, M @@ -185,7 +185,7 @@ sarq $3, I jle .L45 ALIGN_4 - + .L41: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -256,7 +256,7 @@ andps %xmm15, %xmm6 addps %xmm6, %xmm4 maxps %xmm4, %xmm0 - ALIGN_3 + ALIGN_3 .L46: testq $2, M @@ -277,7 +277,7 @@ maxss %xmm4, %xmm0 maxss %xmm6, %xmm1 ALIGN_3 - + .L47: testq $1, M je .L998 diff --git a/kernel/x86_64/zamax_sse2.S b/kernel/x86_64/zamax_sse2.S index eb8fd4379..bde290be9 100644 --- a/kernel/x86_64/zamax_sse2.S +++ b/kernel/x86_64/zamax_sse2.S @@ -38,13 +38,13 @@ #define ASSEMBLER #include "common.h" - + #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define I %rax - + #ifdef USE_MIN #define maxpd minpd #define maxsd minsd @@ -184,7 +184,7 @@ andpd %xmm15, %xmm5 addpd %xmm5, %xmm4 maxpd %xmm4, %xmm0 - ALIGN_3 + ALIGN_3 .L37: testq $1, M @@ -205,7 +205,7 @@ sarq $3, I jle .L45 ALIGN_4 - + .L41: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -309,7 +309,7 @@ andpd %xmm15, %xmm5 addpd %xmm5, %xmm4 maxpd %xmm4, %xmm2 - ALIGN_3 + ALIGN_3 .L47: testq $1, M diff --git a/kernel/x86_64/zasum.S b/kernel/x86_64/zasum.S index b94e49bf0..c372fc5dd 100644 --- a/kernel/x86_64/zasum.S +++ b/kernel/x86_64/zasum.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M ARG1 #define X ARG2 #define INCX ARG3 @@ -68,7 +68,7 @@ sarq $2, I jle .L20 ALIGN_4 - + .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -132,7 +132,7 @@ sarq $2, I jle .L60 ALIGN_4 - + .L50: FLD 0 * SIZE(X) fabs diff --git a/kernel/x86_64/zasum_atom.S b/kernel/x86_64/zasum_atom.S index ab83809d2..888dbbb80 100644 --- a/kernel/x86_64/zasum_atom.S +++ b/kernel/x86_64/zasum_atom.S @@ -38,20 +38,20 @@ #define ASSEMBLER #include "common.h" - + #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ #define I %rax - + #include "l1param.h" PROLOGUE PROFCODE SAVEREGISTERS - + xorps %xmm0, %xmm0 testq M, M @@ -102,7 +102,7 @@ decq I jle .L11 ALIGN_4 - + .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -257,7 +257,7 @@ pshufd $0x4e, %xmm5, %xmm13 addsd %xmm5, %xmm2 addsd %xmm13, %xmm3 - ALIGN_3 + ALIGN_3 .L14: testq $2, M @@ -270,8 +270,8 @@ pshufd $0x4e, %xmm4, %xmm5 addsd %xmm4, %xmm2 addsd %xmm5, %xmm3 - ALIGN_3 - + ALIGN_3 + .L15: testq $1, M je .L998 @@ -303,7 +303,7 @@ decq I jle .L23 ALIGN_4 - + .L22: andps %xmm15, %xmm4 addq INCX, X @@ -379,7 +379,7 @@ addsd %xmm6, %xmm2 andps %xmm15, %xmm7 addsd %xmm7, %xmm3 - ALIGN_3 + ALIGN_3 .L26: testq $1, M @@ -404,7 +404,7 @@ .L999: RESTOREREGISTERS - + ret EPILOGUE diff --git a/kernel/x86_64/zasum_sse.S b/kernel/x86_64/zasum_sse.S index 7f3d3d12d..44d6da561 100644 --- a/kernel/x86_64/zasum_sse.S +++ b/kernel/x86_64/zasum_sse.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ @@ -51,7 +51,7 @@ PROFCODE SAVEREGISTERS - + pxor %xmm0, %xmm0 testq M, M jle .L999 @@ -64,7 +64,7 @@ pcmpeqb %xmm15, %xmm15 psrld $1, %xmm15 - + salq $ZBASE_SHIFT, INCX cmpq $2 * SIZE, INCX @@ -116,7 +116,7 @@ decq I jle .L12 ALIGN_3 - + .L11: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -267,7 +267,7 @@ sarq $2, I jle .L105 ALIGN_4 - + .L101: movsd (X), %xmm4 addq INCX, X @@ -314,19 +314,19 @@ #ifndef HAVE_SSE3 movhlps %xmm0, %xmm1 addps %xmm1, %xmm0 - + movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 addss %xmm1, %xmm0 #else haddps %xmm0, %xmm0 haddps %xmm0, %xmm0 -#endif +#endif ALIGN_4 .L999: RESTOREREGISTERS - + ret EPILOGUE diff --git a/kernel/x86_64/zasum_sse2.S b/kernel/x86_64/zasum_sse2.S index 9d0ec2e48..d1e076c31 100644 --- a/kernel/x86_64/zasum_sse2.S +++ b/kernel/x86_64/zasum_sse2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ @@ -51,7 +51,7 @@ PROFCODE SAVEREGISTERS - + xorps %xmm0, %xmm0 testq M, M jle .L999 @@ -105,7 +105,7 @@ decq I jle .L11 ALIGN_4 - + .L10: #ifdef PREFETCH PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -213,7 +213,7 @@ addpd %xmm5, %xmm1 addq $4 * SIZE, X - ALIGN_3 + ALIGN_3 .L22: testq $2, M @@ -223,7 +223,7 @@ andps %xmm15, %xmm6 addpd %xmm6, %xmm3 addq $2 * SIZE, X - + .L23: testq $1, M je .L998 @@ -243,7 +243,7 @@ sarq $2, I jle .L60 ALIGN_4 - + .L50: #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) prefetcht0 PREFETCHSIZE * SIZE(X) @@ -312,7 +312,7 @@ .L999: RESTOREREGISTERS - + ret EPILOGUE diff --git a/kernel/x86_64/zaxpy.S b/kernel/x86_64/zaxpy.S index 266c1477d..1758ca943 100644 --- a/kernel/x86_64/zaxpy.S +++ b/kernel/x86_64/zaxpy.S @@ -68,10 +68,10 @@ salq $ZBASE_SHIFT, INCX salq $ZBASE_SHIFT, INCY - + testq M, M jle .L40 - + cmpq $2 * SIZE, INCX jne .L14 cmpq $2 * SIZE, INCY @@ -332,5 +332,5 @@ ffreep %st(0) ffreep %st(0) ret - + EPILOGUE diff --git a/kernel/x86_64/zaxpy_atom.S b/kernel/x86_64/zaxpy_atom.S index e623326f5..2fe2756fb 100644 --- a/kernel/x86_64/zaxpy_atom.S +++ b/kernel/x86_64/zaxpy_atom.S @@ -70,7 +70,7 @@ #endif #else movaps %xmm3, %xmm0 - movsd 40(%rsp), %xmm1 + movsd 40(%rsp), %xmm1 movq 48(%rsp), X movq 56(%rsp), INCX @@ -79,7 +79,7 @@ #endif SAVEREGISTERS - + #ifndef CONJ #define ADD1 subsd #define ADD2 addsd @@ -95,7 +95,7 @@ testq M, M jle .L999 - + cmpq $2 * SIZE, INCX jne .L20 cmpq $2 * SIZE, INCY diff --git a/kernel/x86_64/zaxpy_sse.S b/kernel/x86_64/zaxpy_sse.S index 42b920cfb..0a12e244a 100644 --- a/kernel/x86_64/zaxpy_sse.S +++ b/kernel/x86_64/zaxpy_sse.S @@ -67,7 +67,7 @@ movq 8(%rsp), INCY #else movaps %xmm3, %xmm0 - movss 40(%rsp), %xmm1 + movss 40(%rsp), %xmm1 movq 48(%rsp), X movq 56(%rsp), INCX @@ -76,13 +76,13 @@ #endif SAVEREGISTERS - + salq $ZBASE_SHIFT, INCX salq $ZBASE_SHIFT, INCY testq M, M jle .L999 - + cmpq $2 * SIZE, INCX jne .L100 cmpq $2 * SIZE, INCY @@ -3113,7 +3113,7 @@ movsd %xmm8, (Y) jmp .L999 ALIGN_3 - + .L200: movq M, %rax cmpq $0, %rax @@ -3142,11 +3142,11 @@ movsd %xmm8, (Y) addq INCY, Y - + decq %rax jg .L201 ALIGN_3 - + .L999: xorq %rax, %rax diff --git a/kernel/x86_64/zaxpy_sse2.S b/kernel/x86_64/zaxpy_sse2.S index 1b7e3a563..a7dd054fb 100644 --- a/kernel/x86_64/zaxpy_sse2.S +++ b/kernel/x86_64/zaxpy_sse2.S @@ -76,7 +76,7 @@ movq 8(%rsp), INCY #else movaps %xmm3, %xmm0 - movsd 40(%rsp), %xmm1 + movsd 40(%rsp), %xmm1 movq 48(%rsp), X movq 56(%rsp), INCX @@ -85,18 +85,18 @@ #endif SAVEREGISTERS - + salq $ZBASE_SHIFT, INCX salq $ZBASE_SHIFT, INCY testq M, M jle .L999 - + cmpq $2 * SIZE, INCX jne .L50 cmpq $2 * SIZE, INCY jne .L50 - + subq $-16 * SIZE, X subq $-16 * SIZE, Y @@ -112,10 +112,10 @@ #endif #ifndef CONJ - shufps $0x0c, %xmm7, %xmm7 + shufps $0x0c, %xmm7, %xmm7 xorpd %xmm7, ALPHA_I #else - shufps $0xc0, %xmm7, %xmm7 + shufps $0xc0, %xmm7, %xmm7 xorpd %xmm7, ALPHA_R #endif @@ -1421,7 +1421,7 @@ je .L58 cmpq $0, INCY je .L58 - + sarq $3, %rax jle .L55 @@ -1775,7 +1775,7 @@ andq $1, %rax jle .L999 -.L58: +.L58: MOVDDUP( 0 * SIZE, X, %xmm0) MOVDDUP( 1 * SIZE, X, %xmm1) @@ -1788,7 +1788,7 @@ movlpd %xmm8, 0 * SIZE(YY) movhpd %xmm8, 1 * SIZE(YY) - + decq %rax jg .L58 ALIGN_3 diff --git a/kernel/x86_64/zcopy.S b/kernel/x86_64/zcopy.S index d76426b66..3cc4e185b 100644 --- a/kernel/x86_64/zcopy.S +++ b/kernel/x86_64/zcopy.S @@ -50,7 +50,7 @@ #define INCY %r10 #define FLAG %r11 #endif - + #include "l1param.h" PROLOGUE diff --git a/kernel/x86_64/zcopy_sse.S b/kernel/x86_64/zcopy_sse.S index 91f283aaf..018a56ff2 100644 --- a/kernel/x86_64/zcopy_sse.S +++ b/kernel/x86_64/zcopy_sse.S @@ -65,7 +65,7 @@ #endif SAVEREGISTERS - + salq $ZBASE_SHIFT, INCX salq $ZBASE_SHIFT, INCY @@ -80,7 +80,7 @@ subq $-32 * SIZE, X subq $-32 * SIZE, Y addq M, M - + testq $SIZE, Y je .L05 diff --git a/kernel/x86_64/zdot.S b/kernel/x86_64/zdot.S index f96834708..607b9b93a 100644 --- a/kernel/x86_64/zdot.S +++ b/kernel/x86_64/zdot.S @@ -53,7 +53,7 @@ PROLOGUE PROFCODE - + #ifdef WINDOWS_ABI movq 40(%rsp), INCY #endif diff --git a/kernel/x86_64/zdot_sse.S b/kernel/x86_64/zdot_sse.S index e2f153ab3..f53e04cfe 100644 --- a/kernel/x86_64/zdot_sse.S +++ b/kernel/x86_64/zdot_sse.S @@ -92,7 +92,7 @@ movsd -32 * SIZE(X), %xmm4 movsd -32 * SIZE(Y), %xmm0 - pshufd $0xb1, %xmm0, %xmm1 + pshufd $0xb1, %xmm0, %xmm1 mulps %xmm4, %xmm0 mulps %xmm4, %xmm1 addq $2 * SIZE, X @@ -126,7 +126,7 @@ PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif - pshufd $0xb1, %xmm8, %xmm12 + pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 movaps -16 * SIZE(Y), %xmm8 @@ -134,7 +134,7 @@ movaps -16 * SIZE(X), %xmm4 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm9, %xmm12 + pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 movaps -12 * SIZE(Y), %xmm9 @@ -146,7 +146,7 @@ PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif - pshufd $0xb1, %xmm10, %xmm12 + pshufd $0xb1, %xmm10, %xmm12 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 movaps -8 * SIZE(Y), %xmm10 @@ -154,7 +154,7 @@ movaps -8 * SIZE(X), %xmm6 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm11, %xmm12 + pshufd $0xb1, %xmm11, %xmm12 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 movaps -4 * SIZE(Y), %xmm11 @@ -166,7 +166,7 @@ PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif - pshufd $0xb1, %xmm8, %xmm12 + pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 movaps 0 * SIZE(Y), %xmm8 @@ -174,7 +174,7 @@ movaps 0 * SIZE(X), %xmm4 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm9, %xmm12 + pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 movaps 4 * SIZE(Y), %xmm9 @@ -186,7 +186,7 @@ PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif - pshufd $0xb1, %xmm10, %xmm12 + pshufd $0xb1, %xmm10, %xmm12 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 movaps 8 * SIZE(Y), %xmm10 @@ -194,7 +194,7 @@ movaps 8 * SIZE(X), %xmm6 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm11, %xmm12 + pshufd $0xb1, %xmm11, %xmm12 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 movaps 12 * SIZE(Y), %xmm11 @@ -210,7 +210,7 @@ ALIGN_3 .L12: - pshufd $0xb1, %xmm8, %xmm12 + pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 movaps -16 * SIZE(Y), %xmm8 @@ -218,7 +218,7 @@ movaps -16 * SIZE(X), %xmm4 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm9, %xmm12 + pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 movaps -12 * SIZE(Y), %xmm9 @@ -226,7 +226,7 @@ movaps -12 * SIZE(X), %xmm5 addps %xmm12, %xmm3 - pshufd $0xb1, %xmm10, %xmm12 + pshufd $0xb1, %xmm10, %xmm12 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 movaps -8 * SIZE(Y), %xmm10 @@ -234,7 +234,7 @@ movaps -8 * SIZE(X), %xmm6 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm11, %xmm12 + pshufd $0xb1, %xmm11, %xmm12 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 movaps -4 * SIZE(Y), %xmm11 @@ -242,25 +242,25 @@ movaps -4 * SIZE(X), %xmm7 addps %xmm12, %xmm3 - pshufd $0xb1, %xmm8, %xmm12 + pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm9, %xmm12 + pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 mulps %xmm5, %xmm12 addps %xmm12, %xmm3 - pshufd $0xb1, %xmm10, %xmm12 + pshufd $0xb1, %xmm10, %xmm12 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 mulps %xmm6, %xmm12 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm11, %xmm12 + pshufd $0xb1, %xmm11, %xmm12 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 mulps %xmm7, %xmm12 @@ -277,7 +277,7 @@ movaps -32 * SIZE(X), %xmm4 movaps -32 * SIZE(Y), %xmm8 - pshufd $0xb1, %xmm8, %xmm12 + pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 @@ -286,7 +286,7 @@ movaps -28 * SIZE(X), %xmm5 movaps -28 * SIZE(Y), %xmm9 - pshufd $0xb1, %xmm9, %xmm12 + pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 mulps %xmm5, %xmm12 @@ -295,7 +295,7 @@ movaps -24 * SIZE(X), %xmm6 movaps -24 * SIZE(Y), %xmm10 - pshufd $0xb1, %xmm10, %xmm12 + pshufd $0xb1, %xmm10, %xmm12 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 mulps %xmm6, %xmm12 @@ -304,7 +304,7 @@ movaps -20 * SIZE(X), %xmm7 movaps -20 * SIZE(Y), %xmm11 - pshufd $0xb1, %xmm11, %xmm12 + pshufd $0xb1, %xmm11, %xmm12 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 mulps %xmm7, %xmm12 @@ -323,13 +323,13 @@ movaps -28 * SIZE(X), %xmm5 movaps -28 * SIZE(Y), %xmm9 - pshufd $0xb1, %xmm8, %xmm12 + pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm9, %xmm12 + pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 mulps %xmm5, %xmm12 @@ -346,7 +346,7 @@ movaps -32 * SIZE(X), %xmm4 movaps -32 * SIZE(Y), %xmm8 - pshufd $0xb1, %xmm8, %xmm12 + pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 @@ -369,7 +369,7 @@ #endif movsd -32 * SIZE(Y), %xmm8 - pshufd $0xb1, %xmm8, %xmm12 + pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 @@ -410,7 +410,7 @@ #endif movss %xmm9, %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x39, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -420,7 +420,7 @@ addps %xmm12, %xmm1 movss %xmm10, %xmm9 - pshufd $0xb1, %xmm5, %xmm12 + pshufd $0xb1, %xmm5, %xmm12 shufps $0x39, %xmm9, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 @@ -434,7 +434,7 @@ #endif movss %xmm11, %xmm10 - pshufd $0xb1, %xmm6, %xmm12 + pshufd $0xb1, %xmm6, %xmm12 shufps $0x39, %xmm10, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 @@ -444,7 +444,7 @@ addps %xmm12, %xmm1 movss %xmm8, %xmm11 - pshufd $0xb1, %xmm7, %xmm12 + pshufd $0xb1, %xmm7, %xmm12 shufps $0x39, %xmm11, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 @@ -458,7 +458,7 @@ #endif movss %xmm9, %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x39, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -468,7 +468,7 @@ addps %xmm12, %xmm1 movss %xmm10, %xmm9 - pshufd $0xb1, %xmm5, %xmm12 + pshufd $0xb1, %xmm5, %xmm12 shufps $0x39, %xmm9, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 @@ -482,7 +482,7 @@ #endif movss %xmm11, %xmm10 - pshufd $0xb1, %xmm6, %xmm12 + pshufd $0xb1, %xmm6, %xmm12 shufps $0x39, %xmm10, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 @@ -492,7 +492,7 @@ addps %xmm12, %xmm1 movss %xmm8, %xmm11 - pshufd $0xb1, %xmm7, %xmm12 + pshufd $0xb1, %xmm7, %xmm12 shufps $0x39, %xmm11, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 @@ -510,7 +510,7 @@ .L22: movss %xmm9, %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x39, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -520,7 +520,7 @@ addps %xmm12, %xmm1 movss %xmm10, %xmm9 - pshufd $0xb1, %xmm5, %xmm12 + pshufd $0xb1, %xmm5, %xmm12 shufps $0x39, %xmm9, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 @@ -530,7 +530,7 @@ addps %xmm12, %xmm1 movss %xmm11, %xmm10 - pshufd $0xb1, %xmm6, %xmm12 + pshufd $0xb1, %xmm6, %xmm12 shufps $0x39, %xmm10, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 @@ -540,7 +540,7 @@ addps %xmm12, %xmm1 movss %xmm8, %xmm11 - pshufd $0xb1, %xmm7, %xmm12 + pshufd $0xb1, %xmm7, %xmm12 shufps $0x39, %xmm11, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 @@ -550,7 +550,7 @@ addps %xmm12, %xmm1 movss %xmm9, %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x39, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -559,7 +559,7 @@ addps %xmm12, %xmm1 movss %xmm10, %xmm9 - pshufd $0xb1, %xmm5, %xmm12 + pshufd $0xb1, %xmm5, %xmm12 shufps $0x39, %xmm9, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 @@ -567,7 +567,7 @@ addps %xmm12, %xmm1 movss %xmm11, %xmm10 - pshufd $0xb1, %xmm6, %xmm12 + pshufd $0xb1, %xmm6, %xmm12 shufps $0x39, %xmm10, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 @@ -575,7 +575,7 @@ addps %xmm12, %xmm1 movss %xmm8, %xmm11 - pshufd $0xb1, %xmm7, %xmm12 + pshufd $0xb1, %xmm7, %xmm12 shufps $0x39, %xmm11, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 @@ -596,7 +596,7 @@ movaps -28 * SIZE(Y), %xmm10 movss %xmm9, %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x39, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -607,7 +607,7 @@ movaps -24 * SIZE(Y), %xmm11 movss %xmm10, %xmm9 - pshufd $0xb1, %xmm5, %xmm12 + pshufd $0xb1, %xmm5, %xmm12 shufps $0x39, %xmm9, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 @@ -618,7 +618,7 @@ movaps -20 * SIZE(Y), %xmm8 movss %xmm11, %xmm10 - pshufd $0xb1, %xmm6, %xmm12 + pshufd $0xb1, %xmm6, %xmm12 shufps $0x39, %xmm10, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 @@ -626,7 +626,7 @@ addps %xmm12, %xmm1 movss %xmm8, %xmm11 - pshufd $0xb1, %xmm7, %xmm12 + pshufd $0xb1, %xmm7, %xmm12 shufps $0x39, %xmm11, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 @@ -645,7 +645,7 @@ movaps -32 * SIZE(Y), %xmm9 movss %xmm9, %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x39, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -656,7 +656,7 @@ movaps -28 * SIZE(Y), %xmm10 movss %xmm10, %xmm9 - pshufd $0xb1, %xmm5, %xmm12 + pshufd $0xb1, %xmm5, %xmm12 shufps $0x39, %xmm9, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 @@ -677,7 +677,7 @@ movaps -32 * SIZE(Y), %xmm9 movss %xmm9, %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x39, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -698,7 +698,7 @@ #endif movsd -32 * SIZE(X), %xmm4 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x59, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -745,7 +745,7 @@ PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif - pshufd $0xb1, %xmm8, %xmm12 + pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 movsd -16 * SIZE(Y), %xmm8 @@ -754,7 +754,7 @@ movaps -16 * SIZE(X), %xmm4 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm9, %xmm12 + pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 movsd -12 * SIZE(Y), %xmm9 @@ -767,7 +767,7 @@ PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif - pshufd $0xb1, %xmm10, %xmm12 + pshufd $0xb1, %xmm10, %xmm12 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 movsd -8 * SIZE(Y), %xmm10 @@ -776,7 +776,7 @@ movaps -8 * SIZE(X), %xmm6 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm11, %xmm12 + pshufd $0xb1, %xmm11, %xmm12 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 movsd -4 * SIZE(Y), %xmm11 @@ -789,7 +789,7 @@ PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif - pshufd $0xb1, %xmm8, %xmm12 + pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 movsd 0 * SIZE(Y), %xmm8 @@ -798,7 +798,7 @@ movaps 0 * SIZE(X), %xmm4 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm9, %xmm12 + pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 movsd 4 * SIZE(Y), %xmm9 @@ -811,7 +811,7 @@ PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif - pshufd $0xb1, %xmm10, %xmm12 + pshufd $0xb1, %xmm10, %xmm12 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 movsd 8 * SIZE(Y), %xmm10 @@ -820,7 +820,7 @@ movaps 8 * SIZE(X), %xmm6 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm11, %xmm12 + pshufd $0xb1, %xmm11, %xmm12 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 movsd 12 * SIZE(Y), %xmm11 @@ -837,7 +837,7 @@ ALIGN_3 .L32: - pshufd $0xb1, %xmm8, %xmm12 + pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 movsd -16 * SIZE(Y), %xmm8 @@ -846,7 +846,7 @@ movaps -16 * SIZE(X), %xmm4 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm9, %xmm12 + pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 movsd -12 * SIZE(Y), %xmm9 @@ -855,7 +855,7 @@ movaps -12 * SIZE(X), %xmm5 addps %xmm12, %xmm3 - pshufd $0xb1, %xmm10, %xmm12 + pshufd $0xb1, %xmm10, %xmm12 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 movsd -8 * SIZE(Y), %xmm10 @@ -864,7 +864,7 @@ movaps -8 * SIZE(X), %xmm6 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm11, %xmm12 + pshufd $0xb1, %xmm11, %xmm12 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 movsd -4 * SIZE(Y), %xmm11 @@ -873,25 +873,25 @@ movaps -4 * SIZE(X), %xmm7 addps %xmm12, %xmm3 - pshufd $0xb1, %xmm8, %xmm12 + pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm9, %xmm12 + pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 mulps %xmm5, %xmm12 addps %xmm12, %xmm3 - pshufd $0xb1, %xmm10, %xmm12 + pshufd $0xb1, %xmm10, %xmm12 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 mulps %xmm6, %xmm12 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm11, %xmm12 + pshufd $0xb1, %xmm11, %xmm12 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 mulps %xmm7, %xmm12 @@ -909,7 +909,7 @@ movsd -32 * SIZE(Y), %xmm8 movhps -30 * SIZE(Y), %xmm8 - pshufd $0xb1, %xmm8, %xmm12 + pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 @@ -919,7 +919,7 @@ movsd -28 * SIZE(Y), %xmm9 movhps -26 * SIZE(Y), %xmm9 - pshufd $0xb1, %xmm9, %xmm12 + pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 mulps %xmm5, %xmm12 @@ -929,7 +929,7 @@ movsd -24 * SIZE(Y), %xmm10 movhps -22 * SIZE(Y), %xmm10 - pshufd $0xb1, %xmm10, %xmm12 + pshufd $0xb1, %xmm10, %xmm12 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 mulps %xmm6, %xmm12 @@ -939,7 +939,7 @@ movsd -20 * SIZE(Y), %xmm11 movhps -18 * SIZE(Y), %xmm11 - pshufd $0xb1, %xmm11, %xmm12 + pshufd $0xb1, %xmm11, %xmm12 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 mulps %xmm7, %xmm12 @@ -957,7 +957,7 @@ movsd -32 * SIZE(Y), %xmm8 movhps -30 * SIZE(Y), %xmm8 - pshufd $0xb1, %xmm8, %xmm12 + pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 @@ -967,7 +967,7 @@ movsd -28 * SIZE(Y), %xmm9 movhps -26 * SIZE(Y), %xmm9 - pshufd $0xb1, %xmm9, %xmm12 + pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 mulps %xmm5, %xmm12 @@ -985,7 +985,7 @@ movsd -32 * SIZE(Y), %xmm8 movhps -30 * SIZE(Y), %xmm8 - pshufd $0xb1, %xmm8, %xmm12 + pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 @@ -1008,7 +1008,7 @@ #endif movsd -32 * SIZE(Y), %xmm8 - pshufd $0xb1, %xmm8, %xmm12 + pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 @@ -1045,7 +1045,7 @@ #endif movss %xmm9, %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x93, %xmm9, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -1055,7 +1055,7 @@ addps %xmm12, %xmm1 movss %xmm10, %xmm9 - pshufd $0xb1, %xmm5, %xmm12 + pshufd $0xb1, %xmm5, %xmm12 shufps $0x93, %xmm10, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 @@ -1069,7 +1069,7 @@ #endif movss %xmm11, %xmm10 - pshufd $0xb1, %xmm6, %xmm12 + pshufd $0xb1, %xmm6, %xmm12 shufps $0x93, %xmm11, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 @@ -1079,7 +1079,7 @@ addps %xmm12, %xmm1 movss %xmm8, %xmm11 - pshufd $0xb1, %xmm7, %xmm12 + pshufd $0xb1, %xmm7, %xmm12 shufps $0x93, %xmm8, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 @@ -1093,7 +1093,7 @@ #endif movss %xmm9, %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x93, %xmm9, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -1103,7 +1103,7 @@ addps %xmm12, %xmm1 movss %xmm10, %xmm9 - pshufd $0xb1, %xmm5, %xmm12 + pshufd $0xb1, %xmm5, %xmm12 shufps $0x93, %xmm10, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 @@ -1117,7 +1117,7 @@ #endif movss %xmm11, %xmm10 - pshufd $0xb1, %xmm6, %xmm12 + pshufd $0xb1, %xmm6, %xmm12 shufps $0x93, %xmm11, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 @@ -1127,7 +1127,7 @@ addps %xmm12, %xmm1 movss %xmm8, %xmm11 - pshufd $0xb1, %xmm7, %xmm12 + pshufd $0xb1, %xmm7, %xmm12 shufps $0x93, %xmm8, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 @@ -1145,7 +1145,7 @@ .L42: movss %xmm9, %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x93, %xmm9, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -1155,7 +1155,7 @@ addps %xmm12, %xmm1 movss %xmm10, %xmm9 - pshufd $0xb1, %xmm5, %xmm12 + pshufd $0xb1, %xmm5, %xmm12 shufps $0x93, %xmm10, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 @@ -1165,7 +1165,7 @@ addps %xmm12, %xmm1 movss %xmm11, %xmm10 - pshufd $0xb1, %xmm6, %xmm12 + pshufd $0xb1, %xmm6, %xmm12 shufps $0x93, %xmm11, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 @@ -1175,7 +1175,7 @@ addps %xmm12, %xmm1 movss %xmm8, %xmm11 - pshufd $0xb1, %xmm7, %xmm12 + pshufd $0xb1, %xmm7, %xmm12 shufps $0x93, %xmm8, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 @@ -1185,7 +1185,7 @@ addps %xmm12, %xmm1 movss %xmm9, %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x93, %xmm9, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -1194,7 +1194,7 @@ addps %xmm12, %xmm1 movss %xmm10, %xmm9 - pshufd $0xb1, %xmm5, %xmm12 + pshufd $0xb1, %xmm5, %xmm12 shufps $0x93, %xmm10, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 @@ -1202,7 +1202,7 @@ addps %xmm12, %xmm1 movss %xmm11, %xmm10 - pshufd $0xb1, %xmm6, %xmm12 + pshufd $0xb1, %xmm6, %xmm12 shufps $0x93, %xmm11, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 @@ -1210,7 +1210,7 @@ addps %xmm12, %xmm1 movss %xmm8, %xmm11 - pshufd $0xb1, %xmm7, %xmm12 + pshufd $0xb1, %xmm7, %xmm12 shufps $0x93, %xmm8, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 @@ -1231,7 +1231,7 @@ movaps -28 * SIZE(Y), %xmm10 movss %xmm9, %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x93, %xmm9, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -1242,7 +1242,7 @@ movaps -24 * SIZE(Y), %xmm11 movss %xmm10, %xmm9 - pshufd $0xb1, %xmm5, %xmm12 + pshufd $0xb1, %xmm5, %xmm12 shufps $0x93, %xmm10, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 @@ -1253,7 +1253,7 @@ movaps -20 * SIZE(Y), %xmm8 movss %xmm11, %xmm10 - pshufd $0xb1, %xmm6, %xmm12 + pshufd $0xb1, %xmm6, %xmm12 shufps $0x93, %xmm11, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 @@ -1261,7 +1261,7 @@ addps %xmm12, %xmm1 movss %xmm8, %xmm11 - pshufd $0xb1, %xmm7, %xmm12 + pshufd $0xb1, %xmm7, %xmm12 shufps $0x93, %xmm8, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 @@ -1280,7 +1280,7 @@ movaps -32 * SIZE(Y), %xmm9 movss %xmm9, %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x93, %xmm9, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -1291,7 +1291,7 @@ movaps -28 * SIZE(Y), %xmm10 movss %xmm10, %xmm9 - pshufd $0xb1, %xmm5, %xmm12 + pshufd $0xb1, %xmm5, %xmm12 shufps $0x93, %xmm10, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 @@ -1312,7 +1312,7 @@ movaps -32 * SIZE(Y), %xmm9 movss %xmm9, %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x93, %xmm9, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -1335,7 +1335,7 @@ movss -32 * SIZE(Y), %xmm9 movss %xmm9, %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x03, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -1368,7 +1368,7 @@ #endif movsd -32 * SIZE(Y), %xmm4 - pshufd $0xb1, %xmm0, %xmm1 + pshufd $0xb1, %xmm0, %xmm1 mulps %xmm4, %xmm0 mulps %xmm4, %xmm1 addq $2 * SIZE, X @@ -1408,7 +1408,7 @@ #endif movss %xmm9, %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x39, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -1418,7 +1418,7 @@ addps %xmm12, %xmm1 movss %xmm10, %xmm9 - pshufd $0xb1, %xmm5, %xmm12 + pshufd $0xb1, %xmm5, %xmm12 shufps $0x39, %xmm9, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 @@ -1432,7 +1432,7 @@ #endif movss %xmm11, %xmm10 - pshufd $0xb1, %xmm6, %xmm12 + pshufd $0xb1, %xmm6, %xmm12 shufps $0x39, %xmm10, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 @@ -1442,7 +1442,7 @@ addps %xmm12, %xmm1 movss %xmm8, %xmm11 - pshufd $0xb1, %xmm7, %xmm12 + pshufd $0xb1, %xmm7, %xmm12 shufps $0x39, %xmm11, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 @@ -1456,7 +1456,7 @@ #endif movss %xmm9, %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x39, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -1466,7 +1466,7 @@ addps %xmm12, %xmm1 movss %xmm10, %xmm9 - pshufd $0xb1, %xmm5, %xmm12 + pshufd $0xb1, %xmm5, %xmm12 shufps $0x39, %xmm9, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 @@ -1480,7 +1480,7 @@ #endif movss %xmm11, %xmm10 - pshufd $0xb1, %xmm6, %xmm12 + pshufd $0xb1, %xmm6, %xmm12 shufps $0x39, %xmm10, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 @@ -1490,7 +1490,7 @@ addps %xmm12, %xmm1 movss %xmm8, %xmm11 - pshufd $0xb1, %xmm7, %xmm12 + pshufd $0xb1, %xmm7, %xmm12 shufps $0x39, %xmm11, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 @@ -1508,7 +1508,7 @@ .L52: movss %xmm9, %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x39, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -1518,7 +1518,7 @@ addps %xmm12, %xmm1 movss %xmm10, %xmm9 - pshufd $0xb1, %xmm5, %xmm12 + pshufd $0xb1, %xmm5, %xmm12 shufps $0x39, %xmm9, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 @@ -1528,7 +1528,7 @@ addps %xmm12, %xmm1 movss %xmm11, %xmm10 - pshufd $0xb1, %xmm6, %xmm12 + pshufd $0xb1, %xmm6, %xmm12 shufps $0x39, %xmm10, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 @@ -1538,7 +1538,7 @@ addps %xmm12, %xmm1 movss %xmm8, %xmm11 - pshufd $0xb1, %xmm7, %xmm12 + pshufd $0xb1, %xmm7, %xmm12 shufps $0x39, %xmm11, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 @@ -1548,7 +1548,7 @@ addps %xmm12, %xmm1 movss %xmm9, %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x39, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -1557,7 +1557,7 @@ addps %xmm12, %xmm1 movss %xmm10, %xmm9 - pshufd $0xb1, %xmm5, %xmm12 + pshufd $0xb1, %xmm5, %xmm12 shufps $0x39, %xmm9, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 @@ -1565,7 +1565,7 @@ addps %xmm12, %xmm1 movss %xmm11, %xmm10 - pshufd $0xb1, %xmm6, %xmm12 + pshufd $0xb1, %xmm6, %xmm12 shufps $0x39, %xmm10, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 @@ -1573,7 +1573,7 @@ addps %xmm12, %xmm1 movss %xmm8, %xmm11 - pshufd $0xb1, %xmm7, %xmm12 + pshufd $0xb1, %xmm7, %xmm12 shufps $0x39, %xmm11, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 @@ -1594,7 +1594,7 @@ movaps -28 * SIZE(X), %xmm10 movss %xmm9, %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x39, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -1605,7 +1605,7 @@ movaps -24 * SIZE(X), %xmm11 movss %xmm10, %xmm9 - pshufd $0xb1, %xmm5, %xmm12 + pshufd $0xb1, %xmm5, %xmm12 shufps $0x39, %xmm9, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 @@ -1616,7 +1616,7 @@ movaps -20 * SIZE(X), %xmm8 movss %xmm11, %xmm10 - pshufd $0xb1, %xmm6, %xmm12 + pshufd $0xb1, %xmm6, %xmm12 shufps $0x39, %xmm10, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 @@ -1624,7 +1624,7 @@ addps %xmm12, %xmm1 movss %xmm8, %xmm11 - pshufd $0xb1, %xmm7, %xmm12 + pshufd $0xb1, %xmm7, %xmm12 shufps $0x39, %xmm11, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 @@ -1643,7 +1643,7 @@ movaps -32 * SIZE(X), %xmm9 movss %xmm9, %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x39, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -1654,7 +1654,7 @@ movaps -28 * SIZE(X), %xmm10 movss %xmm10, %xmm9 - pshufd $0xb1, %xmm5, %xmm12 + pshufd $0xb1, %xmm5, %xmm12 shufps $0x39, %xmm9, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 @@ -1675,7 +1675,7 @@ movaps -32 * SIZE(X), %xmm9 movss %xmm9, %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x39, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -1696,7 +1696,7 @@ #endif movsd -32 * SIZE(Y), %xmm4 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0xa9, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -1733,7 +1733,7 @@ #endif movss %xmm9, %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x93, %xmm9, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -1743,7 +1743,7 @@ addps %xmm12, %xmm1 movss %xmm10, %xmm9 - pshufd $0xb1, %xmm5, %xmm12 + pshufd $0xb1, %xmm5, %xmm12 shufps $0x93, %xmm10, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 @@ -1757,7 +1757,7 @@ #endif movss %xmm11, %xmm10 - pshufd $0xb1, %xmm6, %xmm12 + pshufd $0xb1, %xmm6, %xmm12 shufps $0x93, %xmm11, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 @@ -1767,7 +1767,7 @@ addps %xmm12, %xmm1 movss %xmm8, %xmm11 - pshufd $0xb1, %xmm7, %xmm12 + pshufd $0xb1, %xmm7, %xmm12 shufps $0x93, %xmm8, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 @@ -1781,7 +1781,7 @@ #endif movss %xmm9, %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x93, %xmm9, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -1791,7 +1791,7 @@ addps %xmm12, %xmm1 movss %xmm10, %xmm9 - pshufd $0xb1, %xmm5, %xmm12 + pshufd $0xb1, %xmm5, %xmm12 shufps $0x93, %xmm10, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 @@ -1805,7 +1805,7 @@ #endif movss %xmm11, %xmm10 - pshufd $0xb1, %xmm6, %xmm12 + pshufd $0xb1, %xmm6, %xmm12 shufps $0x93, %xmm11, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 @@ -1815,7 +1815,7 @@ addps %xmm12, %xmm1 movss %xmm8, %xmm11 - pshufd $0xb1, %xmm7, %xmm12 + pshufd $0xb1, %xmm7, %xmm12 shufps $0x93, %xmm8, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 @@ -1833,7 +1833,7 @@ .L62: movss %xmm9, %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x93, %xmm9, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -1843,7 +1843,7 @@ addps %xmm12, %xmm1 movss %xmm10, %xmm9 - pshufd $0xb1, %xmm5, %xmm12 + pshufd $0xb1, %xmm5, %xmm12 shufps $0x93, %xmm10, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 @@ -1853,7 +1853,7 @@ addps %xmm12, %xmm1 movss %xmm11, %xmm10 - pshufd $0xb1, %xmm6, %xmm12 + pshufd $0xb1, %xmm6, %xmm12 shufps $0x93, %xmm11, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 @@ -1863,7 +1863,7 @@ addps %xmm12, %xmm1 movss %xmm8, %xmm11 - pshufd $0xb1, %xmm7, %xmm12 + pshufd $0xb1, %xmm7, %xmm12 shufps $0x93, %xmm8, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 @@ -1873,7 +1873,7 @@ addps %xmm12, %xmm1 movss %xmm9, %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x93, %xmm9, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -1882,7 +1882,7 @@ addps %xmm12, %xmm1 movss %xmm10, %xmm9 - pshufd $0xb1, %xmm5, %xmm12 + pshufd $0xb1, %xmm5, %xmm12 shufps $0x93, %xmm10, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 @@ -1890,7 +1890,7 @@ addps %xmm12, %xmm1 movss %xmm11, %xmm10 - pshufd $0xb1, %xmm6, %xmm12 + pshufd $0xb1, %xmm6, %xmm12 shufps $0x93, %xmm11, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 @@ -1898,7 +1898,7 @@ addps %xmm12, %xmm1 movss %xmm8, %xmm11 - pshufd $0xb1, %xmm7, %xmm12 + pshufd $0xb1, %xmm7, %xmm12 shufps $0x93, %xmm8, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 @@ -1919,7 +1919,7 @@ movaps -28 * SIZE(X), %xmm10 movss %xmm9, %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x93, %xmm9, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -1930,7 +1930,7 @@ movaps -24 * SIZE(X), %xmm11 movss %xmm10, %xmm9 - pshufd $0xb1, %xmm5, %xmm12 + pshufd $0xb1, %xmm5, %xmm12 shufps $0x93, %xmm10, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 @@ -1941,7 +1941,7 @@ movaps -20 * SIZE(X), %xmm8 movss %xmm11, %xmm10 - pshufd $0xb1, %xmm6, %xmm12 + pshufd $0xb1, %xmm6, %xmm12 shufps $0x93, %xmm11, %xmm10 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 @@ -1949,7 +1949,7 @@ addps %xmm12, %xmm1 movss %xmm8, %xmm11 - pshufd $0xb1, %xmm7, %xmm12 + pshufd $0xb1, %xmm7, %xmm12 shufps $0x93, %xmm8, %xmm11 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 @@ -1968,7 +1968,7 @@ movaps -32 * SIZE(X), %xmm9 movss %xmm9, %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x93, %xmm9, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -1979,7 +1979,7 @@ movaps -28 * SIZE(X), %xmm10 movss %xmm10, %xmm9 - pshufd $0xb1, %xmm5, %xmm12 + pshufd $0xb1, %xmm5, %xmm12 shufps $0x93, %xmm10, %xmm9 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 @@ -2000,7 +2000,7 @@ movaps -32 * SIZE(X), %xmm9 movss %xmm9, %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x93, %xmm9, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -2023,7 +2023,7 @@ movss -32 * SIZE(X), %xmm9 movss %xmm9, %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 shufps $0x03, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 @@ -2046,7 +2046,7 @@ #endif movsd -32 * SIZE(X), %xmm4 - pshufd $0xb1, %xmm0, %xmm1 + pshufd $0xb1, %xmm0, %xmm1 mulps %xmm4, %xmm0 mulps %xmm4, %xmm1 addq $2 * SIZE, X @@ -2095,7 +2095,7 @@ PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 movaps -16 * SIZE(Y), %xmm4 @@ -2104,7 +2104,7 @@ movhps -14 * SIZE(X), %xmm8 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm5, %xmm12 + pshufd $0xb1, %xmm5, %xmm12 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 movaps -12 * SIZE(Y), %xmm5 @@ -2113,7 +2113,7 @@ movhps -10 * SIZE(X), %xmm9 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm6, %xmm12 + pshufd $0xb1, %xmm6, %xmm12 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 movaps -8 * SIZE(Y), %xmm6 @@ -2122,7 +2122,7 @@ movhps -6 * SIZE(X), %xmm10 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm7, %xmm12 + pshufd $0xb1, %xmm7, %xmm12 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 movaps -4 * SIZE(Y), %xmm7 @@ -2131,7 +2131,7 @@ movhps -2 * SIZE(X), %xmm11 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 movaps 0 * SIZE(Y), %xmm4 @@ -2140,7 +2140,7 @@ movhps 2 * SIZE(X), %xmm8 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm5, %xmm12 + pshufd $0xb1, %xmm5, %xmm12 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 movaps 4 * SIZE(Y), %xmm5 @@ -2149,7 +2149,7 @@ movhps 6 * SIZE(X), %xmm9 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm6, %xmm12 + pshufd $0xb1, %xmm6, %xmm12 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 movaps 8 * SIZE(Y), %xmm6 @@ -2158,7 +2158,7 @@ movhps 10 * SIZE(X), %xmm10 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm7, %xmm12 + pshufd $0xb1, %xmm7, %xmm12 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 movaps 12 * SIZE(Y), %xmm7 @@ -2175,7 +2175,7 @@ ALIGN_3 .L52: - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 movaps -16 * SIZE(Y), %xmm4 @@ -2184,7 +2184,7 @@ movhps -14 * SIZE(X), %xmm8 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm5, %xmm12 + pshufd $0xb1, %xmm5, %xmm12 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 movaps -12 * SIZE(Y), %xmm5 @@ -2193,7 +2193,7 @@ movhps -10 * SIZE(X), %xmm9 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm6, %xmm12 + pshufd $0xb1, %xmm6, %xmm12 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 movaps -8 * SIZE(Y), %xmm6 @@ -2202,7 +2202,7 @@ movhps -6 * SIZE(X), %xmm10 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm7, %xmm12 + pshufd $0xb1, %xmm7, %xmm12 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 movaps -4 * SIZE(Y), %xmm7 @@ -2211,25 +2211,25 @@ movhps -2 * SIZE(X), %xmm11 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm5, %xmm12 + pshufd $0xb1, %xmm5, %xmm12 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 mulps %xmm9, %xmm12 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm6, %xmm12 + pshufd $0xb1, %xmm6, %xmm12 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 mulps %xmm10, %xmm12 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm7, %xmm12 + pshufd $0xb1, %xmm7, %xmm12 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 mulps %xmm11, %xmm12 @@ -2251,7 +2251,7 @@ movlps -28 * SIZE(X), %xmm9 movhps -26 * SIZE(X), %xmm9 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 @@ -2261,7 +2261,7 @@ movlps -24 * SIZE(X), %xmm10 movhps -22 * SIZE(X), %xmm10 - pshufd $0xb1, %xmm5, %xmm12 + pshufd $0xb1, %xmm5, %xmm12 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 mulps %xmm9, %xmm12 @@ -2271,13 +2271,13 @@ movlps -20 * SIZE(X), %xmm11 movhps -18 * SIZE(X), %xmm11 - pshufd $0xb1, %xmm6, %xmm12 + pshufd $0xb1, %xmm6, %xmm12 mulps %xmm10, %xmm6 addps %xmm6, %xmm0 mulps %xmm10, %xmm12 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm7, %xmm12 + pshufd $0xb1, %xmm7, %xmm12 mulps %xmm11, %xmm7 addps %xmm7, %xmm0 mulps %xmm11, %xmm12 @@ -2295,7 +2295,7 @@ movlps -32 * SIZE(X), %xmm8 movhps -30 * SIZE(X), %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 @@ -2305,7 +2305,7 @@ movlps -28 * SIZE(X), %xmm9 movhps -26 * SIZE(X), %xmm9 - pshufd $0xb1, %xmm5, %xmm12 + pshufd $0xb1, %xmm5, %xmm12 mulps %xmm9, %xmm5 addps %xmm5, %xmm0 mulps %xmm9, %xmm12 @@ -2323,7 +2323,7 @@ movlps -32 * SIZE(X), %xmm8 movhps -30 * SIZE(X), %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 @@ -2347,7 +2347,7 @@ #endif movsd -32 * SIZE(X), %xmm8 - pshufd $0xb1, %xmm4, %xmm12 + pshufd $0xb1, %xmm4, %xmm12 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 @@ -2409,7 +2409,7 @@ #endif movss %xmm9, %xmm8 - pshufd $0x1b, %xmm8, %xmm12 + pshufd $0x1b, %xmm8, %xmm12 movss %xmm5, %xmm4 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 @@ -2419,7 +2419,7 @@ addps %xmm12, %xmm1 movss %xmm10, %xmm9 - pshufd $0x1b, %xmm9, %xmm12 + pshufd $0x1b, %xmm9, %xmm12 movss %xmm6, %xmm5 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 @@ -2433,7 +2433,7 @@ #endif movss %xmm11, %xmm10 - pshufd $0x1b, %xmm10, %xmm12 + pshufd $0x1b, %xmm10, %xmm12 movss %xmm7, %xmm6 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 @@ -2443,7 +2443,7 @@ addps %xmm12, %xmm1 movss %xmm8, %xmm11 - pshufd $0x1b, %xmm11, %xmm12 + pshufd $0x1b, %xmm11, %xmm12 movss %xmm4, %xmm7 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 @@ -2457,7 +2457,7 @@ #endif movss %xmm9, %xmm8 - pshufd $0x1b, %xmm8, %xmm12 + pshufd $0x1b, %xmm8, %xmm12 movss %xmm5, %xmm4 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 @@ -2467,7 +2467,7 @@ addps %xmm12, %xmm1 movss %xmm10, %xmm9 - pshufd $0x1b, %xmm9, %xmm12 + pshufd $0x1b, %xmm9, %xmm12 movss %xmm6, %xmm5 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 @@ -2481,7 +2481,7 @@ #endif movss %xmm11, %xmm10 - pshufd $0x1b, %xmm10, %xmm12 + pshufd $0x1b, %xmm10, %xmm12 movss %xmm7, %xmm6 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 @@ -2491,7 +2491,7 @@ addps %xmm12, %xmm1 movss %xmm8, %xmm11 - pshufd $0x1b, %xmm11, %xmm12 + pshufd $0x1b, %xmm11, %xmm12 movss %xmm4, %xmm7 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 @@ -2509,7 +2509,7 @@ .L72: movss %xmm9, %xmm8 - pshufd $0x1b, %xmm8, %xmm12 + pshufd $0x1b, %xmm8, %xmm12 movss %xmm5, %xmm4 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 @@ -2519,7 +2519,7 @@ addps %xmm12, %xmm1 movss %xmm10, %xmm9 - pshufd $0x1b, %xmm9, %xmm12 + pshufd $0x1b, %xmm9, %xmm12 movss %xmm6, %xmm5 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 @@ -2529,7 +2529,7 @@ addps %xmm12, %xmm3 movss %xmm11, %xmm10 - pshufd $0x1b, %xmm10, %xmm12 + pshufd $0x1b, %xmm10, %xmm12 movss %xmm7, %xmm6 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 @@ -2539,7 +2539,7 @@ addps %xmm12, %xmm1 movss %xmm8, %xmm11 - pshufd $0x1b, %xmm11, %xmm12 + pshufd $0x1b, %xmm11, %xmm12 movss %xmm4, %xmm7 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 @@ -2549,7 +2549,7 @@ addps %xmm12, %xmm3 movss %xmm9, %xmm8 - pshufd $0x1b, %xmm8, %xmm12 + pshufd $0x1b, %xmm8, %xmm12 movss %xmm5, %xmm4 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 @@ -2559,7 +2559,7 @@ addps %xmm12, %xmm1 movss %xmm10, %xmm9 - pshufd $0x1b, %xmm9, %xmm12 + pshufd $0x1b, %xmm9, %xmm12 movss %xmm6, %xmm5 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 @@ -2567,7 +2567,7 @@ addps %xmm12, %xmm3 movss %xmm11, %xmm10 - pshufd $0x1b, %xmm10, %xmm12 + pshufd $0x1b, %xmm10, %xmm12 movss %xmm7, %xmm6 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 @@ -2575,7 +2575,7 @@ addps %xmm12, %xmm1 movss %xmm8, %xmm11 - pshufd $0x1b, %xmm11, %xmm12 + pshufd $0x1b, %xmm11, %xmm12 movss %xmm4, %xmm7 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 @@ -2594,7 +2594,7 @@ movaps -32 * SIZE(Y), %xmm9 movss %xmm9, %xmm8 - pshufd $0x1b, %xmm8, %xmm12 + pshufd $0x1b, %xmm8, %xmm12 movss %xmm5, %xmm4 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 @@ -2605,7 +2605,7 @@ movaps -28 * SIZE(Y), %xmm10 movss %xmm10, %xmm9 - pshufd $0x1b, %xmm9, %xmm12 + pshufd $0x1b, %xmm9, %xmm12 movss %xmm6, %xmm5 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 @@ -2616,7 +2616,7 @@ movaps -24 * SIZE(Y), %xmm11 movss %xmm11, %xmm10 - pshufd $0x1b, %xmm10, %xmm12 + pshufd $0x1b, %xmm10, %xmm12 movss %xmm7, %xmm6 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 @@ -2627,7 +2627,7 @@ movaps -20 * SIZE(Y), %xmm8 movss %xmm8, %xmm11 - pshufd $0x1b, %xmm11, %xmm12 + pshufd $0x1b, %xmm11, %xmm12 movss %xmm4, %xmm7 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 @@ -2648,7 +2648,7 @@ movaps -28 * SIZE(Y), %xmm10 movss %xmm9, %xmm8 - pshufd $0x1b, %xmm8, %xmm12 + pshufd $0x1b, %xmm8, %xmm12 movss %xmm5, %xmm4 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 @@ -2656,7 +2656,7 @@ addps %xmm12, %xmm1 movss %xmm10, %xmm9 - pshufd $0x1b, %xmm9, %xmm12 + pshufd $0x1b, %xmm9, %xmm12 movss %xmm6, %xmm5 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 @@ -2678,7 +2678,7 @@ movaps -32 * SIZE(Y), %xmm9 movss %xmm9, %xmm8 - pshufd $0x1b, %xmm8, %xmm12 + pshufd $0x1b, %xmm8, %xmm12 movss %xmm5, %xmm4 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 @@ -2698,7 +2698,7 @@ movss %xmm5, %xmm8 shufps $0x24, %xmm4, %xmm4 - pshufd $0x18, %xmm8, %xmm12 + pshufd $0x18, %xmm8, %xmm12 shufps $0x24, %xmm8, %xmm8 mulps %xmm4, %xmm8 @@ -2748,7 +2748,7 @@ #endif movss %xmm9, %xmm8 - pshufd $0x1b, %xmm8, %xmm12 + pshufd $0x1b, %xmm8, %xmm12 movss %xmm5, %xmm4 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 @@ -2759,7 +2759,7 @@ addps %xmm12, %xmm1 movss %xmm10, %xmm9 - pshufd $0x1b, %xmm9, %xmm12 + pshufd $0x1b, %xmm9, %xmm12 movss %xmm6, %xmm5 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 @@ -2774,7 +2774,7 @@ #endif movss %xmm11, %xmm10 - pshufd $0x1b, %xmm10, %xmm12 + pshufd $0x1b, %xmm10, %xmm12 movss %xmm7, %xmm6 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 @@ -2785,7 +2785,7 @@ addps %xmm12, %xmm1 movss %xmm8, %xmm11 - pshufd $0x1b, %xmm11, %xmm12 + pshufd $0x1b, %xmm11, %xmm12 movss %xmm4, %xmm7 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 @@ -2800,7 +2800,7 @@ #endif movss %xmm9, %xmm8 - pshufd $0x1b, %xmm8, %xmm12 + pshufd $0x1b, %xmm8, %xmm12 movss %xmm5, %xmm4 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 @@ -2811,7 +2811,7 @@ addps %xmm12, %xmm1 movss %xmm10, %xmm9 - pshufd $0x1b, %xmm9, %xmm12 + pshufd $0x1b, %xmm9, %xmm12 movss %xmm6, %xmm5 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 @@ -2826,7 +2826,7 @@ #endif movss %xmm11, %xmm10 - pshufd $0x1b, %xmm10, %xmm12 + pshufd $0x1b, %xmm10, %xmm12 movss %xmm7, %xmm6 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 @@ -2837,7 +2837,7 @@ addps %xmm12, %xmm1 movss %xmm8, %xmm11 - pshufd $0x1b, %xmm11, %xmm12 + pshufd $0x1b, %xmm11, %xmm12 movss %xmm4, %xmm7 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 @@ -2856,7 +2856,7 @@ .L82: movss %xmm9, %xmm8 - pshufd $0x1b, %xmm8, %xmm12 + pshufd $0x1b, %xmm8, %xmm12 movss %xmm5, %xmm4 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 @@ -2867,7 +2867,7 @@ addps %xmm12, %xmm1 movss %xmm10, %xmm9 - pshufd $0x1b, %xmm9, %xmm12 + pshufd $0x1b, %xmm9, %xmm12 movss %xmm6, %xmm5 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 @@ -2878,7 +2878,7 @@ addps %xmm12, %xmm3 movss %xmm11, %xmm10 - pshufd $0x1b, %xmm10, %xmm12 + pshufd $0x1b, %xmm10, %xmm12 movss %xmm7, %xmm6 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 @@ -2889,7 +2889,7 @@ addps %xmm12, %xmm1 movss %xmm8, %xmm11 - pshufd $0x1b, %xmm11, %xmm12 + pshufd $0x1b, %xmm11, %xmm12 movss %xmm4, %xmm7 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 @@ -2900,7 +2900,7 @@ addps %xmm12, %xmm3 movss %xmm9, %xmm8 - pshufd $0x1b, %xmm8, %xmm12 + pshufd $0x1b, %xmm8, %xmm12 movss %xmm5, %xmm4 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 @@ -2911,7 +2911,7 @@ addps %xmm12, %xmm1 movss %xmm10, %xmm9 - pshufd $0x1b, %xmm9, %xmm12 + pshufd $0x1b, %xmm9, %xmm12 movss %xmm6, %xmm5 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 @@ -2919,7 +2919,7 @@ addps %xmm12, %xmm3 movss %xmm11, %xmm10 - pshufd $0x1b, %xmm10, %xmm12 + pshufd $0x1b, %xmm10, %xmm12 movss %xmm7, %xmm6 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 @@ -2927,7 +2927,7 @@ addps %xmm12, %xmm1 movss %xmm8, %xmm11 - pshufd $0x1b, %xmm11, %xmm12 + pshufd $0x1b, %xmm11, %xmm12 movss %xmm4, %xmm7 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 @@ -2947,7 +2947,7 @@ movaps -32 * SIZE(Y), %xmm9 movss %xmm9, %xmm8 - pshufd $0x1b, %xmm8, %xmm12 + pshufd $0x1b, %xmm8, %xmm12 movss %xmm5, %xmm4 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 @@ -2959,7 +2959,7 @@ movaps -28 * SIZE(Y), %xmm10 movss %xmm10, %xmm9 - pshufd $0x1b, %xmm9, %xmm12 + pshufd $0x1b, %xmm9, %xmm12 movss %xmm6, %xmm5 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 @@ -2971,7 +2971,7 @@ movaps -24 * SIZE(Y), %xmm11 movss %xmm11, %xmm10 - pshufd $0x1b, %xmm10, %xmm12 + pshufd $0x1b, %xmm10, %xmm12 movss %xmm7, %xmm6 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 @@ -2983,7 +2983,7 @@ movaps -20 * SIZE(Y), %xmm8 movss %xmm8, %xmm11 - pshufd $0x1b, %xmm11, %xmm12 + pshufd $0x1b, %xmm11, %xmm12 movss %xmm4, %xmm7 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 @@ -3003,7 +3003,7 @@ movaps -32 * SIZE(Y), %xmm9 movss %xmm9, %xmm8 - pshufd $0x1b, %xmm8, %xmm12 + pshufd $0x1b, %xmm8, %xmm12 movss %xmm5, %xmm4 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 @@ -3015,7 +3015,7 @@ movaps -28 * SIZE(Y), %xmm10 movss %xmm10, %xmm9 - pshufd $0x1b, %xmm9, %xmm12 + pshufd $0x1b, %xmm9, %xmm12 movss %xmm6, %xmm5 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 @@ -3038,7 +3038,7 @@ movaps -32 * SIZE(Y), %xmm9 movss %xmm9, %xmm8 - pshufd $0x1b, %xmm8, %xmm12 + pshufd $0x1b, %xmm8, %xmm12 movss %xmm5, %xmm4 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 @@ -3058,7 +3058,7 @@ movss %xmm5, %xmm8 shufps $0x24, %xmm4, %xmm4 - pshufd $0x18, %xmm8, %xmm12 + pshufd $0x18, %xmm8, %xmm12 shufps $0x24, %xmm8, %xmm8 mulps %xmm4, %xmm8 @@ -3121,7 +3121,7 @@ ALIGN_3 .L203: - pshufd $0xb1, %xmm8, %xmm12 + pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 movsd (Y), %xmm8 @@ -3135,7 +3135,7 @@ addq INCX, X addps %xmm12, %xmm1 - pshufd $0xb1, %xmm9, %xmm12 + pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 movsd (Y), %xmm9 @@ -3149,7 +3149,7 @@ addq INCX, X addps %xmm12, %xmm3 - pshufd $0xb1, %xmm10, %xmm12 + pshufd $0xb1, %xmm10, %xmm12 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 movsd (Y), %xmm10 @@ -3163,7 +3163,7 @@ addq INCX, X addps %xmm12, %xmm1 - pshufd $0xb1, %xmm11, %xmm12 + pshufd $0xb1, %xmm11, %xmm12 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 movsd (Y), %xmm11 @@ -3177,7 +3177,7 @@ addq INCX, X addps %xmm12, %xmm3 - pshufd $0xb1, %xmm8, %xmm12 + pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 movsd (Y), %xmm8 @@ -3191,7 +3191,7 @@ addq INCX, X addps %xmm12, %xmm1 - pshufd $0xb1, %xmm9, %xmm12 + pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 movsd (Y), %xmm9 @@ -3205,7 +3205,7 @@ addq INCX, X addps %xmm12, %xmm3 - pshufd $0xb1, %xmm10, %xmm12 + pshufd $0xb1, %xmm10, %xmm12 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 movsd (Y), %xmm10 @@ -3219,7 +3219,7 @@ addq INCX, X addps %xmm12, %xmm1 - pshufd $0xb1, %xmm11, %xmm12 + pshufd $0xb1, %xmm11, %xmm12 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 movsd (Y), %xmm11 @@ -3239,7 +3239,7 @@ ALIGN_3 .L204: - pshufd $0xb1, %xmm8, %xmm12 + pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 movsd (Y), %xmm8 @@ -3253,7 +3253,7 @@ addq INCX, X addps %xmm12, %xmm1 - pshufd $0xb1, %xmm9, %xmm12 + pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 movsd (Y), %xmm9 @@ -3267,7 +3267,7 @@ addq INCX, X addps %xmm12, %xmm3 - pshufd $0xb1, %xmm10, %xmm12 + pshufd $0xb1, %xmm10, %xmm12 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 movsd (Y), %xmm10 @@ -3281,7 +3281,7 @@ addq INCX, X addps %xmm12, %xmm1 - pshufd $0xb1, %xmm11, %xmm12 + pshufd $0xb1, %xmm11, %xmm12 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 movsd (Y), %xmm11 @@ -3295,25 +3295,25 @@ addq INCX, X addps %xmm12, %xmm3 - pshufd $0xb1, %xmm8, %xmm12 + pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm9, %xmm12 + pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 mulps %xmm5, %xmm12 addps %xmm12, %xmm3 - pshufd $0xb1, %xmm10, %xmm12 + pshufd $0xb1, %xmm10, %xmm12 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 mulps %xmm6, %xmm12 addps %xmm12, %xmm1 - pshufd $0xb1, %xmm11, %xmm12 + pshufd $0xb1, %xmm11, %xmm12 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 mulps %xmm7, %xmm12 @@ -3333,7 +3333,7 @@ movhps (Y), %xmm8 addq INCY, Y - pshufd $0xb1, %xmm8, %xmm12 + pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 @@ -3348,7 +3348,7 @@ movhps (Y), %xmm9 addq INCY, Y - pshufd $0xb1, %xmm9, %xmm12 + pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 mulps %xmm5, %xmm12 @@ -3363,7 +3363,7 @@ movhps (Y), %xmm10 addq INCY, Y - pshufd $0xb1, %xmm10, %xmm12 + pshufd $0xb1, %xmm10, %xmm12 mulps %xmm6, %xmm10 addps %xmm10, %xmm0 mulps %xmm6, %xmm12 @@ -3378,7 +3378,7 @@ movhps (Y), %xmm11 addq INCY, Y - pshufd $0xb1, %xmm11, %xmm12 + pshufd $0xb1, %xmm11, %xmm12 mulps %xmm7, %xmm11 addps %xmm11, %xmm2 mulps %xmm7, %xmm12 @@ -3398,7 +3398,7 @@ movhps (Y), %xmm8 addq INCY, Y - pshufd $0xb1, %xmm8, %xmm12 + pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 @@ -3413,7 +3413,7 @@ movhps (Y), %xmm9 addq INCY, Y - pshufd $0xb1, %xmm9, %xmm12 + pshufd $0xb1, %xmm9, %xmm12 mulps %xmm5, %xmm9 addps %xmm9, %xmm2 mulps %xmm5, %xmm12 @@ -3433,7 +3433,7 @@ movhps (Y), %xmm8 addq INCY, Y - pshufd $0xb1, %xmm8, %xmm12 + pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 @@ -3453,7 +3453,7 @@ #endif movsd (Y), %xmm8 - pshufd $0xb1, %xmm8, %xmm12 + pshufd $0xb1, %xmm8, %xmm12 mulps %xmm4, %xmm8 addps %xmm8, %xmm0 mulps %xmm4, %xmm12 @@ -3483,7 +3483,7 @@ subss %xmm3, %xmm1 #endif unpcklps %xmm1, %xmm0 - + #ifdef WINDOWS_ABI movq %xmm0, %rax #endif diff --git a/kernel/x86_64/zdot_sse2.S b/kernel/x86_64/zdot_sse2.S index 63acecc08..3ab6f4517 100644 --- a/kernel/x86_64/zdot_sse2.S +++ b/kernel/x86_64/zdot_sse2.S @@ -50,7 +50,7 @@ #define N ARG2 /* rdx */ #define X ARG3 /* r8 */ #define INCX ARG4 /* r9*/ -#define Y %r10 +#define Y %r10 #define INCY %r11 #endif @@ -64,7 +64,7 @@ #define MOVLPS movlps #endif - + PROLOGUE PROFCODE @@ -122,7 +122,7 @@ PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 movaps -8 * SIZE(Y), %xmm8 @@ -130,7 +130,7 @@ movaps -8 * SIZE(X), %xmm4 addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 movaps -6 * SIZE(Y), %xmm9 @@ -142,7 +142,7 @@ PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif - pshufd $0x4e, %xmm10, %xmm12 + pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 movaps -4 * SIZE(Y), %xmm10 @@ -150,7 +150,7 @@ movaps -4 * SIZE(X), %xmm6 addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 movaps -2 * SIZE(Y), %xmm11 @@ -162,7 +162,7 @@ PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 movaps 0 * SIZE(Y), %xmm8 @@ -170,7 +170,7 @@ movaps 0 * SIZE(X), %xmm4 addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 movaps 2 * SIZE(Y), %xmm9 @@ -182,7 +182,7 @@ PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif - pshufd $0x4e, %xmm10, %xmm12 + pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 movaps 4 * SIZE(Y), %xmm10 @@ -190,7 +190,7 @@ movaps 4 * SIZE(X), %xmm6 addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 movaps 6 * SIZE(Y), %xmm11 @@ -206,7 +206,7 @@ ALIGN_3 .L12: - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 movaps -8 * SIZE(Y), %xmm8 @@ -214,7 +214,7 @@ movaps -8 * SIZE(X), %xmm4 addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 movaps -6 * SIZE(Y), %xmm9 @@ -222,7 +222,7 @@ movaps -6 * SIZE(X), %xmm5 addpd %xmm12, %xmm3 - pshufd $0x4e, %xmm10, %xmm12 + pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 movaps -4 * SIZE(Y), %xmm10 @@ -230,7 +230,7 @@ movaps -4 * SIZE(X), %xmm6 addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 movaps -2 * SIZE(Y), %xmm11 @@ -238,25 +238,25 @@ movaps -2 * SIZE(X), %xmm7 addpd %xmm12, %xmm3 - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm3 - pshufd $0x4e, %xmm10, %xmm12 + pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 mulpd %xmm6, %xmm12 addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 mulpd %xmm7, %xmm12 @@ -275,13 +275,13 @@ movaps -14 * SIZE(X), %xmm5 movaps -14 * SIZE(Y), %xmm9 - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 mulpd %xmm5, %xmm12 @@ -292,13 +292,13 @@ movaps -10 * SIZE(X), %xmm7 movaps -10 * SIZE(Y), %xmm11 - pshufd $0x4e, %xmm10, %xmm12 + pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 mulpd %xmm6, %xmm12 addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 mulpd %xmm7, %xmm12 @@ -317,13 +317,13 @@ movaps -14 * SIZE(X), %xmm5 movaps -14 * SIZE(Y), %xmm9 - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 mulpd %xmm5, %xmm12 @@ -340,7 +340,7 @@ movaps -16 * SIZE(X), %xmm4 movaps -16 * SIZE(Y), %xmm8 - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 @@ -375,7 +375,7 @@ PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 movaps -8 * SIZE(Y), %xmm8 @@ -384,7 +384,7 @@ movhps -7 * SIZE(X), %xmm4 addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 movaps -6 * SIZE(Y), %xmm9 @@ -397,7 +397,7 @@ PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif - pshufd $0x4e, %xmm10, %xmm12 + pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 movaps -4 * SIZE(Y), %xmm10 @@ -406,7 +406,7 @@ movhps -3 * SIZE(X), %xmm6 addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 movaps -2 * SIZE(Y), %xmm11 @@ -419,7 +419,7 @@ PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 movaps 0 * SIZE(Y), %xmm8 @@ -428,7 +428,7 @@ movhps 1 * SIZE(X), %xmm4 addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 movaps 2 * SIZE(Y), %xmm9 @@ -441,7 +441,7 @@ PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif - pshufd $0x4e, %xmm10, %xmm12 + pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 movaps 4 * SIZE(Y), %xmm10 @@ -450,7 +450,7 @@ movhps 5 * SIZE(X), %xmm6 addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 movaps 6 * SIZE(Y), %xmm11 @@ -468,7 +468,7 @@ .L22: - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 movaps -8 * SIZE(Y), %xmm8 @@ -477,7 +477,7 @@ movhps -7 * SIZE(X), %xmm4 addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 movaps -6 * SIZE(Y), %xmm9 @@ -486,7 +486,7 @@ movhps -5 * SIZE(X), %xmm5 addpd %xmm12, %xmm3 - pshufd $0x4e, %xmm10, %xmm12 + pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 movaps -4 * SIZE(Y), %xmm10 @@ -495,7 +495,7 @@ movhps -3 * SIZE(X), %xmm6 addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 movaps -2 * SIZE(Y), %xmm11 @@ -504,25 +504,25 @@ movhps -1 * SIZE(X), %xmm7 addpd %xmm12, %xmm3 - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm3 - pshufd $0x4e, %xmm10, %xmm12 + pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 mulpd %xmm6, %xmm12 addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 mulpd %xmm7, %xmm12 @@ -540,7 +540,7 @@ movhps -15 * SIZE(X), %xmm4 movaps -16 * SIZE(Y), %xmm8 - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 @@ -550,7 +550,7 @@ movhps -13 * SIZE(X), %xmm5 movaps -14 * SIZE(Y), %xmm9 - pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 mulpd %xmm5, %xmm12 @@ -560,7 +560,7 @@ movhps -11 * SIZE(X), %xmm6 movaps -12 * SIZE(Y), %xmm10 - pshufd $0x4e, %xmm10, %xmm12 + pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 mulpd %xmm6, %xmm12 @@ -570,7 +570,7 @@ movhps -9 * SIZE(X), %xmm7 movaps -10 * SIZE(Y), %xmm11 - pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 mulpd %xmm7, %xmm12 @@ -588,7 +588,7 @@ movhps -15 * SIZE(X), %xmm4 movaps -16 * SIZE(Y), %xmm8 - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 @@ -598,7 +598,7 @@ movhps -13 * SIZE(X), %xmm5 movaps -14 * SIZE(Y), %xmm9 - pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 mulpd %xmm5, %xmm12 @@ -616,7 +616,7 @@ movhps -15 * SIZE(X), %xmm4 movaps -16 * SIZE(Y), %xmm8 - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 @@ -654,7 +654,7 @@ PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) #endif - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 movaps -8 * SIZE(X), %xmm8 @@ -663,7 +663,7 @@ movhps -7 * SIZE(Y), %xmm4 addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 movaps -6 * SIZE(X), %xmm9 @@ -676,7 +676,7 @@ PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) #endif - pshufd $0x4e, %xmm10, %xmm12 + pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 movaps -4 * SIZE(X), %xmm10 @@ -685,7 +685,7 @@ movhps -3 * SIZE(Y), %xmm6 addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 movaps -2 * SIZE(X), %xmm11 @@ -698,7 +698,7 @@ PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) #endif - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 movaps 0 * SIZE(X), %xmm8 @@ -707,7 +707,7 @@ movhps 1 * SIZE(Y), %xmm4 addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 movaps 2 * SIZE(X), %xmm9 @@ -720,7 +720,7 @@ PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) #endif - pshufd $0x4e, %xmm10, %xmm12 + pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 movaps 4 * SIZE(X), %xmm10 @@ -729,7 +729,7 @@ movhps 5 * SIZE(Y), %xmm6 addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 movaps 6 * SIZE(X), %xmm11 @@ -747,7 +747,7 @@ .L32: - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 movaps -8 * SIZE(X), %xmm8 @@ -756,7 +756,7 @@ movhps -7 * SIZE(Y), %xmm4 addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 movaps -6 * SIZE(X), %xmm9 @@ -765,7 +765,7 @@ movhps -5 * SIZE(Y), %xmm5 addpd %xmm12, %xmm3 - pshufd $0x4e, %xmm10, %xmm12 + pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 movaps -4 * SIZE(X), %xmm10 @@ -774,7 +774,7 @@ movhps -3 * SIZE(Y), %xmm6 addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 movaps -2 * SIZE(X), %xmm11 @@ -783,25 +783,25 @@ movhps -1 * SIZE(Y), %xmm7 addpd %xmm12, %xmm3 - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm3 - pshufd $0x4e, %xmm10, %xmm12 + pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 mulpd %xmm6, %xmm12 addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 mulpd %xmm7, %xmm12 @@ -819,7 +819,7 @@ movhps -15 * SIZE(Y), %xmm4 movaps -16 * SIZE(X), %xmm8 - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 @@ -829,7 +829,7 @@ movhps -13 * SIZE(Y), %xmm5 movaps -14 * SIZE(X), %xmm9 - pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 mulpd %xmm5, %xmm12 @@ -839,7 +839,7 @@ movhps -11 * SIZE(Y), %xmm6 movaps -12 * SIZE(X), %xmm10 - pshufd $0x4e, %xmm10, %xmm12 + pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 mulpd %xmm6, %xmm12 @@ -849,7 +849,7 @@ movhps -9 * SIZE(Y), %xmm7 movaps -10 * SIZE(X), %xmm11 - pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 mulpd %xmm7, %xmm12 @@ -867,7 +867,7 @@ movhps -15 * SIZE(Y), %xmm4 movaps -16 * SIZE(X), %xmm8 - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 @@ -877,7 +877,7 @@ movhps -13 * SIZE(Y), %xmm5 movaps -14 * SIZE(X), %xmm9 - pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 mulpd %xmm5, %xmm12 @@ -898,7 +898,7 @@ movhps -15 * SIZE(Y), %xmm4 movaps -16 * SIZE(X), %xmm8 - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 @@ -933,7 +933,7 @@ #endif movsd %xmm9, %xmm8 - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 @@ -943,7 +943,7 @@ addpd %xmm12, %xmm1 movsd %xmm10, %xmm9 - pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm9, %xmm12 movsd %xmm6, %xmm5 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm0 @@ -957,7 +957,7 @@ #endif movsd %xmm11, %xmm10 - pshufd $0x4e, %xmm10, %xmm12 + pshufd $0x4e, %xmm10, %xmm12 movsd %xmm7, %xmm6 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 @@ -967,7 +967,7 @@ addpd %xmm12, %xmm1 movsd %xmm8, %xmm11 - pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm11, %xmm12 movsd %xmm4, %xmm7 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm0 @@ -981,7 +981,7 @@ #endif movsd %xmm9, %xmm8 - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 @@ -991,7 +991,7 @@ addpd %xmm12, %xmm1 movsd %xmm10, %xmm9 - pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm9, %xmm12 movsd %xmm6, %xmm5 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm0 @@ -1005,7 +1005,7 @@ #endif movsd %xmm11, %xmm10 - pshufd $0x4e, %xmm10, %xmm12 + pshufd $0x4e, %xmm10, %xmm12 movsd %xmm7, %xmm6 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 @@ -1015,7 +1015,7 @@ addpd %xmm12, %xmm1 movsd %xmm8, %xmm11 - pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm11, %xmm12 movsd %xmm4, %xmm7 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm0 @@ -1033,7 +1033,7 @@ .L42: movsd %xmm9, %xmm8 - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 @@ -1043,7 +1043,7 @@ addpd %xmm12, %xmm1 movsd %xmm10, %xmm9 - pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm9, %xmm12 movsd %xmm6, %xmm5 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm0 @@ -1053,7 +1053,7 @@ addpd %xmm12, %xmm1 movsd %xmm11, %xmm10 - pshufd $0x4e, %xmm10, %xmm12 + pshufd $0x4e, %xmm10, %xmm12 movsd %xmm7, %xmm6 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 @@ -1063,7 +1063,7 @@ addpd %xmm12, %xmm1 movsd %xmm8, %xmm11 - pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm11, %xmm12 movsd %xmm4, %xmm7 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm0 @@ -1073,7 +1073,7 @@ addpd %xmm12, %xmm1 movsd %xmm9, %xmm8 - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 @@ -1083,7 +1083,7 @@ addpd %xmm12, %xmm1 movsd %xmm10, %xmm9 - pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm9, %xmm12 movsd %xmm6, %xmm5 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm0 @@ -1091,7 +1091,7 @@ addpd %xmm12, %xmm1 movsd %xmm11, %xmm10 - pshufd $0x4e, %xmm10, %xmm12 + pshufd $0x4e, %xmm10, %xmm12 movsd %xmm7, %xmm6 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 @@ -1099,7 +1099,7 @@ addpd %xmm12, %xmm1 movsd %xmm8, %xmm11 - pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm11, %xmm12 movsd %xmm4, %xmm7 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm0 @@ -1120,7 +1120,7 @@ movaps -14 * SIZE(Y), %xmm10 movsd %xmm9, %xmm8 - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 @@ -1131,7 +1131,7 @@ movaps -12 * SIZE(Y), %xmm11 movsd %xmm10, %xmm9 - pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm9, %xmm12 movsd %xmm6, %xmm5 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm0 @@ -1142,7 +1142,7 @@ movaps -10 * SIZE(Y), %xmm8 movsd %xmm11, %xmm10 - pshufd $0x4e, %xmm10, %xmm12 + pshufd $0x4e, %xmm10, %xmm12 movsd %xmm7, %xmm6 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 @@ -1150,7 +1150,7 @@ addpd %xmm12, %xmm1 movsd %xmm8, %xmm11 - pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm11, %xmm12 movsd %xmm4, %xmm7 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm0 @@ -1169,7 +1169,7 @@ movaps -16 * SIZE(Y), %xmm9 movsd %xmm9, %xmm8 - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 movsd %xmm5, %xmm4 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 @@ -1180,7 +1180,7 @@ movaps -14 * SIZE(Y), %xmm10 movsd %xmm10, %xmm9 - pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm9, %xmm12 movsd %xmm6, %xmm5 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm0 @@ -1201,7 +1201,7 @@ movlps -16 * SIZE(X), %xmm4 movlps -16 * SIZE(Y), %xmm8 - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 @@ -1213,7 +1213,7 @@ SHUFPD_1 %xmm1, %xmm1 SHUFPD_1 %xmm2, %xmm2 SHUFPD_1 %xmm3, %xmm3 - jmp .L98 + jmp .L98 ALIGN_3 .L50: @@ -1254,7 +1254,7 @@ ALIGN_3 .L53: - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 MOVLPS 0 * SIZE(Y), %xmm8 @@ -1266,7 +1266,7 @@ addq INCX, X addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 MOVLPS 0 * SIZE(Y), %xmm9 @@ -1278,7 +1278,7 @@ addq INCX, X addpd %xmm12, %xmm3 - pshufd $0x4e, %xmm10, %xmm12 + pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 MOVLPS 0 * SIZE(Y), %xmm10 @@ -1290,7 +1290,7 @@ addq INCX, X addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 MOVLPS 0 * SIZE(Y), %xmm11 @@ -1302,7 +1302,7 @@ addq INCX, X addpd %xmm12, %xmm3 - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 MOVLPS 0 * SIZE(Y), %xmm8 @@ -1315,7 +1315,7 @@ addq INCX, X addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 MOVLPS 0 * SIZE(Y), %xmm9 @@ -1328,7 +1328,7 @@ addq INCX, X addpd %xmm12, %xmm3 - pshufd $0x4e, %xmm10, %xmm12 + pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 MOVLPS 0 * SIZE(Y), %xmm10 @@ -1340,7 +1340,7 @@ addq INCX, X addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 MOVLPS 0 * SIZE(Y), %xmm11 @@ -1357,7 +1357,7 @@ ALIGN_3 .L54: - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 MOVLPS 0 * SIZE(Y), %xmm8 @@ -1369,7 +1369,7 @@ addq INCX, X addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 MOVLPS 0 * SIZE(Y), %xmm9 @@ -1381,7 +1381,7 @@ addq INCX, X addpd %xmm12, %xmm3 - pshufd $0x4e, %xmm10, %xmm12 + pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 MOVLPS 0 * SIZE(Y), %xmm10 @@ -1393,7 +1393,7 @@ addq INCX, X addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 MOVLPS 0 * SIZE(Y), %xmm11 @@ -1405,25 +1405,25 @@ addq INCX, X addpd %xmm12, %xmm3 - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 mulpd %xmm5, %xmm12 addpd %xmm12, %xmm3 - pshufd $0x4e, %xmm10, %xmm12 + pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 mulpd %xmm6, %xmm12 addpd %xmm12, %xmm1 - pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 mulpd %xmm7, %xmm12 @@ -1441,7 +1441,7 @@ movhps 1 * SIZE(Y), %xmm8 addq INCY, Y - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 @@ -1454,7 +1454,7 @@ movhps 1 * SIZE(Y), %xmm9 addq INCY, Y - pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 mulpd %xmm5, %xmm12 @@ -1467,7 +1467,7 @@ movhps 1 * SIZE(Y), %xmm10 addq INCY, Y - pshufd $0x4e, %xmm10, %xmm12 + pshufd $0x4e, %xmm10, %xmm12 mulpd %xmm6, %xmm10 addpd %xmm10, %xmm0 mulpd %xmm6, %xmm12 @@ -1480,7 +1480,7 @@ movhps 1 * SIZE(Y), %xmm11 addq INCY, Y - pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm11, %xmm12 mulpd %xmm7, %xmm11 addpd %xmm11, %xmm2 mulpd %xmm7, %xmm12 @@ -1498,7 +1498,7 @@ movhps 1 * SIZE(Y), %xmm8 addq INCY, Y - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 @@ -1511,7 +1511,7 @@ movhps 1 * SIZE(Y), %xmm9 addq INCY, Y - pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm9, %xmm12 mulpd %xmm5, %xmm9 addpd %xmm9, %xmm2 mulpd %xmm5, %xmm12 @@ -1527,7 +1527,7 @@ MOVLPS 0 * SIZE(Y), %xmm8 movhps 1 * SIZE(Y), %xmm8 - pshufd $0x4e, %xmm8, %xmm12 + pshufd $0x4e, %xmm8, %xmm12 mulpd %xmm4, %xmm8 addpd %xmm8, %xmm0 mulpd %xmm4, %xmm12 diff --git a/kernel/x86_64/zgemm3m_kernel_2x8_nehalem.S b/kernel/x86_64/zgemm3m_kernel_2x8_nehalem.S index 97eb1ec7a..0069066ad 100644 --- a/kernel/x86_64/zgemm3m_kernel_2x8_nehalem.S +++ b/kernel/x86_64/zgemm3m_kernel_2x8_nehalem.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx @@ -51,7 +51,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -103,7 +103,7 @@ #endif PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -148,7 +148,7 @@ movlps %xmm0, ALPHA_R movlps %xmm1, ALPHA_I - + subq $-16 * SIZE, A subq $-16 * SIZE, B @@ -162,7 +162,7 @@ movq %r11, OFFSET #ifndef LEFT negq %r11 -#endif +#endif movq %r11, KK #endif @@ -205,7 +205,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO -#endif +#endif prefetcht0 -16 * SIZE(BB) subq $-8 * SIZE, BB @@ -241,7 +241,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -256,7 +256,7 @@ jle .L15 ALIGN_3 -.L12: +.L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm12 @@ -606,7 +606,7 @@ decq I BRANCH jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $1, M @@ -626,7 +626,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -640,7 +640,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -655,7 +655,7 @@ jle .L25 ALIGN_3 -.L22: +.L22: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 @@ -830,7 +830,7 @@ movhps %xmm2, 1 * SIZE(CO2, LDC, 2) movlps %xmm3, 0 * SIZE(CO2, %rax) movhps %xmm3, 1 * SIZE(CO2, %rax) - ALIGN_4 + ALIGN_4 .L29: #if defined(TRMMKERNEL) && !defined(LEFT) @@ -879,7 +879,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 @@ -901,7 +901,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -916,7 +916,7 @@ jle .L35 ALIGN_3 -.L32: +.L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm8 @@ -1120,7 +1120,7 @@ decq I BRANCH jg .L31 - ALIGN_4 + ALIGN_4 .L40: testq $1, M @@ -1140,7 +1140,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -1154,7 +1154,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1169,7 +1169,7 @@ jle .L45 ALIGN_3 -.L42: +.L42: mulpd %xmm0, %xmm1 addpd %xmm1, %xmm8 movaps -14 * SIZE(BO), %xmm1 @@ -1279,7 +1279,7 @@ movhps %xmm2, 1 * SIZE(CO2) movlps %xmm3, 0 * SIZE(CO2, LDC) movhps %xmm3, 1 * SIZE(CO2, LDC) - ALIGN_4 + ALIGN_4 .L49: #if defined(TRMMKERNEL) && !defined(LEFT) @@ -1324,7 +1324,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO -#endif +#endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 @@ -1342,7 +1342,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1357,7 +1357,7 @@ jle .L55 ALIGN_3 -.L52: +.L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm8 @@ -1482,7 +1482,7 @@ decq I BRANCH jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $1, M @@ -1502,7 +1502,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -1514,7 +1514,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1529,7 +1529,7 @@ jle .L65 ALIGN_3 -.L62: +.L62: mulpd %xmm0, %xmm1 movddup -15 * SIZE(AO), %xmm0 addpd %xmm1, %xmm8 @@ -1605,7 +1605,7 @@ movhps %xmm0, 1 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO2) movhps %xmm1, 1 * SIZE(CO2) - ALIGN_4 + ALIGN_4 .L69: #if defined(TRMMKERNEL) && !defined(LEFT) @@ -1649,7 +1649,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO -#endif +#endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 @@ -1666,7 +1666,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1681,7 +1681,7 @@ jle .L75 ALIGN_3 -.L72: +.L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm1, %xmm8 @@ -1766,7 +1766,7 @@ decq I BRANCH jg .L71 - ALIGN_4 + ALIGN_4 .L80: testq $1, M @@ -1786,7 +1786,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO -#endif +#endif #ifndef TRMMKERNEL movaps -16 * SIZE(AO), %xmm0 @@ -1822,7 +1822,7 @@ jle .L85 ALIGN_3 -.L82: +.L82: mulpd %xmm0, %xmm1 #ifndef TRMMKERNEL movapd -14 * SIZE(AO), %xmm0 @@ -1902,7 +1902,7 @@ movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) - ALIGN_4 + ALIGN_4 .L999: movq 0(%rsp), %rbx diff --git a/kernel/x86_64/zgemm3m_kernel_4x2_atom.S b/kernel/x86_64/zgemm3m_kernel_4x2_atom.S index 189505dd3..1049c0131 100644 --- a/kernel/x86_64/zgemm3m_kernel_4x2_atom.S +++ b/kernel/x86_64/zgemm3m_kernel_4x2_atom.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M %rdi #define N %rsi #define K %rdx @@ -96,7 +96,7 @@ #endif PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) @@ -136,14 +136,14 @@ movsd %xmm0, ALPHA_R movsd %xmm1, ALPHA_I - + salq $ZBASE_SHIFT, LDC movq N, J sarq $1, J jle .L40 ALIGN_4 - + .L10: movq C, CO1 leaq (C, LDC, 1), CO2 @@ -154,7 +154,7 @@ movq K, %rax salq $BASE_SHIFT + 1, %rax leaq (B, %rax), BB - + movq M, I sarq $2, I jle .L20 @@ -202,7 +202,7 @@ addsd %xmm7, %xmm14 movsd 3 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 - + addsd %xmm6, %xmm15 PREFETCH ((PREFETCHSIZE) >> 1 + 0) * SIZE(BO) movaps %xmm4, %xmm6 @@ -771,7 +771,7 @@ movsd %xmm9, 0 * SIZE(CO2) movsd %xmm11, 1 * SIZE(CO2) ALIGN_4 - + .L39: movq BO, B decq J # j -- @@ -1185,7 +1185,7 @@ movsd %xmm8, 0 * SIZE(CO1) movsd %xmm9, 1 * SIZE(CO1) ALIGN_4 - + .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp diff --git a/kernel/x86_64/zgemm3m_kernel_4x4_barcelona.S b/kernel/x86_64/zgemm3m_kernel_4x4_barcelona.S index 4199bd91e..76ed76d81 100644 --- a/kernel/x86_64/zgemm3m_kernel_4x4_barcelona.S +++ b/kernel/x86_64/zgemm3m_kernel_4x4_barcelona.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define BUFFERED #define OLD_M %rdi @@ -51,7 +51,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -305,7 +305,7 @@ movddup 17 * SIZE(BO, %rax, 4), %xmm3 ;\ movapd %xmm0, %xmm2 ;\ addq $8 * SIZE, %rax - + #define KERNEL_SUB1(xx) \ mulpd %xmm1, %xmm0 ;\ mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ @@ -416,7 +416,7 @@ #endif PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) @@ -475,7 +475,7 @@ movsd %xmm0, 0 + ALPHA movsd %xmm1, 8 + ALPHA - + salq $ZBASE_SHIFT, LDC #ifdef TRMMKERNEL @@ -483,7 +483,7 @@ movsd %xmm12, KK #ifndef LEFT negq KK -#endif +#endif #endif movq N, J sarq $2, J # j = (n >> 2) @@ -496,18 +496,18 @@ #endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc - + #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif #ifdef BUFFERED movq K, %rax sarq $2, %rax jle .L03 ALIGN_3 - + .L02: prefetch (RPREFETCHSIZE + 0) * SIZE(B) @@ -567,7 +567,7 @@ subq $1, %rax jne .L04 ALIGN_4 - + .L10: #endif movq A, AO # aoffset = a @@ -594,7 +594,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO -#endif +#endif prefetch (RPREFETCHSIZE + 0) * SIZE(BB) prefetch (RPREFETCHSIZE + 8) * SIZE(BB) @@ -626,7 +626,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -831,7 +831,7 @@ movhpd %xmm2, 5 * SIZE(CO1) movsd %xmm3, 6 * SIZE(CO1) movhpd %xmm3, 7 * SIZE(CO1) - + movsd 0 * SIZE(CO2), %xmm0 movhpd 1 * SIZE(CO2), %xmm0 movsd 2 * SIZE(CO2), %xmm1 @@ -866,7 +866,7 @@ movhpd %xmm2, 5 * SIZE(CO2) movsd %xmm3, 6 * SIZE(CO2) movhpd %xmm3, 7 * SIZE(CO2) - + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 movhpd 1 * SIZE(CO1, LDC, 2), %xmm0 movsd 2 * SIZE(CO1, LDC, 2), %xmm1 @@ -901,7 +901,7 @@ movhpd %xmm2, 5 * SIZE(CO1, LDC, 2) movsd %xmm3, 6 * SIZE(CO1, LDC, 2) movhpd %xmm3, 7 * SIZE(CO1, LDC, 2) - + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 movsd 2 * SIZE(CO2, LDC, 2), %xmm1 @@ -943,7 +943,7 @@ decq I # i -- BRANCH jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $3, M @@ -969,7 +969,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -986,7 +986,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1155,7 +1155,7 @@ movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) movsd %xmm1, 2 * SIZE(CO1, LDC, 2) movhpd %xmm1, 3 * SIZE(CO1, LDC, 2) - + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 movsd 2 * SIZE(CO2, LDC, 2), %xmm1 @@ -1176,7 +1176,7 @@ addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -1197,7 +1197,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1214,7 +1214,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1328,7 +1328,7 @@ movsd %xmm0, 0 * SIZE(CO1, LDC, 2) movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) - + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 @@ -1339,8 +1339,8 @@ movsd %xmm0, 0 * SIZE(CO2, LDC, 2) movhpd %xmm0, 1 * SIZE(CO2, LDC, 2) - ALIGN_4 - + ALIGN_4 + .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK @@ -1367,18 +1367,18 @@ #ifdef BUFFERED leaq 16 * SIZE + BUFFER, BO #endif - + #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif #ifdef BUFFERED movq K, %rax sarq $2, %rax jle .L43 ALIGN_4 - + .L42: prefetchnta (RPREFETCHSIZE + 0) * SIZE(B) @@ -1419,7 +1419,7 @@ subq $1, %rax jne .L44 ALIGN_4 - + .L50: #endif movq C, CO1 # coffset1 = c @@ -1447,7 +1447,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO -#endif +#endif movddup -16 * SIZE(BO), %xmm1 movddup -15 * SIZE(BO), %xmm5 @@ -1467,7 +1467,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1612,7 +1612,7 @@ movhpd %xmm2, 5 * SIZE(CO1) movsd %xmm3, 6 * SIZE(CO1) movhpd %xmm3, 7 * SIZE(CO1) - + movsd 0 * SIZE(CO2), %xmm0 movhpd 1 * SIZE(CO2), %xmm0 movsd 2 * SIZE(CO2), %xmm1 @@ -1653,7 +1653,7 @@ decq I # i -- jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $2, M @@ -1676,7 +1676,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1692,7 +1692,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1780,7 +1780,7 @@ .L69: addpd %xmm10, %xmm8 addpd %xmm11, %xmm9 - + movsd 0 * SIZE(CO1), %xmm0 movhpd 1 * SIZE(CO1), %xmm0 movsd 2 * SIZE(CO1), %xmm1 @@ -1819,7 +1819,7 @@ addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L70: testq $1, M @@ -1842,7 +1842,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO -#endif +#endif movddup -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1858,7 +1858,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1950,8 +1950,8 @@ movsd %xmm0, 0 * SIZE(CO2) movhpd %xmm0, 1 * SIZE(CO2) - ALIGN_4 - + ALIGN_4 + .L79: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK @@ -1973,18 +1973,18 @@ #ifdef BUFFERED leaq 16 * SIZE + BUFFER, BO #endif - + #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif #ifdef BUFFERED movq K, %rax sarq $3, %rax jle .L83 ALIGN_4 - + .L82: prefetchnta (RPREFETCHSIZE + 0) * SIZE(B) @@ -2025,7 +2025,7 @@ decq %rax jne .L84 ALIGN_4 - + .L90: #endif movq C, CO1 # coffset1 = c @@ -2052,7 +2052,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO -#endif +#endif movapd -8 * SIZE(AO), %xmm2 pxor %xmm8, %xmm8 @@ -2071,7 +2071,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2189,12 +2189,12 @@ movhpd %xmm2, 5 * SIZE(CO1) movsd %xmm3, 6 * SIZE(CO1) movhpd %xmm3, 7 * SIZE(CO1) - + addq $8 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L91 - ALIGN_4 + ALIGN_4 .L100: testq $2, M @@ -2217,7 +2217,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO -#endif +#endif movddup -16 * SIZE(BO), %xmm0 pxor %xmm8, %xmm8 @@ -2233,7 +2233,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2325,7 +2325,7 @@ movhpd %xmm1, 3 * SIZE(CO1) addq $4 * SIZE, CO1 - ALIGN_4 + ALIGN_4 .L110: testq $1, M @@ -2348,7 +2348,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -2360,7 +2360,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT diff --git a/kernel/x86_64/zgemm3m_kernel_4x4_core2.S b/kernel/x86_64/zgemm3m_kernel_4x4_core2.S index 1b466fb19..a78890dea 100644 --- a/kernel/x86_64/zgemm3m_kernel_4x4_core2.S +++ b/kernel/x86_64/zgemm3m_kernel_4x4_core2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -49,7 +49,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -95,7 +95,7 @@ #endif PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) @@ -145,7 +145,7 @@ movsd %xmm0, 0 + ALPHA movsd %xmm1, 8 + ALPHA - + subq $-16 * SIZE, A subq $-16 * SIZE, B @@ -159,7 +159,7 @@ movsd %xmm12, KK #ifndef LEFT negq KK -#endif +#endif #endif movq N, J @@ -171,18 +171,18 @@ .L01: /* Copying to Sub Buffer */ leaq 16 * SIZE + BUFFER, BO - + #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq K, %rax sarq $2, %rax NOBRANCH jle .L05 ALIGN_4 - + .L02: movapd -16 * SIZE(B), %xmm0 prefetchnta (PREFETCH_R + 0) * SIZE(B) @@ -213,7 +213,7 @@ unpckhpd %xmm6, %xmm6 movddup %xmm7, %xmm15 unpckhpd %xmm7, %xmm7 - + prefetcht0 (PREFETCH_W + 8) * SIZE(BO) movapd %xmm8, -16 * SIZE(BO) movapd %xmm0, -14 * SIZE(BO) @@ -271,7 +271,7 @@ BRANCH jne .L06 ALIGN_4 - + .L10: leaq (PREFETCH_R + 0) * SIZE(B), BB @@ -297,7 +297,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 @@ -332,7 +332,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -347,7 +347,7 @@ jle .L15 ALIGN_4 -.L12: +.L12: PADDING; addpd %xmm2, %xmm10 movaps -16 * SIZE(BO), %xmm2 @@ -572,7 +572,7 @@ movhpd %xmm2, 5 * SIZE(CO1) movsd %xmm3, 6 * SIZE(CO1) movhpd %xmm3, 7 * SIZE(CO1) - + movsd 0 * SIZE(CO2), %xmm0 movhpd 1 * SIZE(CO2), %xmm0 movsd 2 * SIZE(CO2), %xmm1 @@ -607,7 +607,7 @@ movhpd %xmm2, 5 * SIZE(CO2) movsd %xmm3, 6 * SIZE(CO2) movhpd %xmm3, 7 * SIZE(CO2) - + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 movhpd 1 * SIZE(CO1, LDC, 2), %xmm0 movsd 2 * SIZE(CO1, LDC, 2), %xmm1 @@ -642,7 +642,7 @@ movhpd %xmm2, 5 * SIZE(CO1, LDC, 2) movsd %xmm3, 6 * SIZE(CO1, LDC, 2) movhpd %xmm3, 7 * SIZE(CO1, LDC, 2) - + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 movsd 2 * SIZE(CO2, LDC, 2), %xmm1 @@ -683,7 +683,7 @@ decq I # i -- BRANCH jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $2, M @@ -701,7 +701,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO -#endif +#endif pxor %xmm8, %xmm8 movapd -16 * SIZE(AO), %xmm0 @@ -719,7 +719,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -734,7 +734,7 @@ jle .L25 ALIGN_4 -.L21: +.L21: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm2, %xmm8 movapd -16 * SIZE(BO), %xmm2 @@ -893,7 +893,7 @@ movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) movsd %xmm1, 2 * SIZE(CO1, LDC, 2) movhpd %xmm1, 3 * SIZE(CO1, LDC, 2) - + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 movsd 2 * SIZE(CO2, LDC, 2), %xmm1 @@ -915,7 +915,7 @@ addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_4 - + .L30: testq $1, M BRANCH @@ -932,7 +932,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO -#endif +#endif pxor %xmm8, %xmm8 movsd -16 * SIZE(AO), %xmm0 @@ -951,7 +951,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -966,7 +966,7 @@ jle .L35 ALIGN_4 -.L31: +.L31: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addsd %xmm2, %xmm8 movsd -16 * SIZE(BO), %xmm2 @@ -1101,7 +1101,7 @@ movsd %xmm0, 0 * SIZE(CO1, LDC, 2) movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) - + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 @@ -1133,11 +1133,11 @@ .L41: /* Copying to Sub Buffer */ leaq BUFFER, BO - + #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq K, %rax sarq $3, %rax @@ -1145,7 +1145,7 @@ addq %rax, %rax ALIGN_4 - + .L42: movddup -16 * SIZE(B), %xmm8 movddup -15 * SIZE(B), %xmm9 @@ -1191,7 +1191,7 @@ subq $1, %rax jne .L44 ALIGN_4 - + .L45: movq C, CO1 leaq (C, LDC, 1), CO2 @@ -1214,7 +1214,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -1238,7 +1238,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1252,7 +1252,7 @@ jle .L55 ALIGN_4 -.L51: +.L51: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm2, %xmm8 @@ -1398,7 +1398,7 @@ movhpd %xmm2, 5 * SIZE(CO1) movsd %xmm3, 6 * SIZE(CO1) movhpd %xmm3, 7 * SIZE(CO1) - + movsd 0 * SIZE(CO2), %xmm0 movhpd 1 * SIZE(CO2), %xmm0 movsd 2 * SIZE(CO2), %xmm1 @@ -1438,7 +1438,7 @@ addq $8 * SIZE, CO2 subq $1, I jg .L50 - ALIGN_4 + ALIGN_4 .L60: testq $2, M @@ -1455,7 +1455,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif pxor %xmm8, %xmm8 movapd -16 * SIZE(AO), %xmm0 @@ -1475,7 +1475,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1489,7 +1489,7 @@ jle .L65 ALIGN_4 -.L61: +.L61: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm2, %xmm8 @@ -1604,7 +1604,7 @@ addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 ALIGN_4 - + .L70: testq $1, M jle .L79 @@ -1620,7 +1620,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif movsd -16 * SIZE(AO), %xmm0 movsd -15 * SIZE(AO), %xmm1 @@ -1639,7 +1639,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1653,7 +1653,7 @@ jle .L75 ALIGN_4 -.L71: +.L71: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addsd %xmm2, %xmm8 @@ -1766,11 +1766,11 @@ .L81: /* Copying to Sub Buffer */ leaq BUFFER, BO - + #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq K, %rax sarq $4, %rax @@ -1778,7 +1778,7 @@ addq %rax, %rax ALIGN_4 - + .L82: movddup -16 * SIZE(B), %xmm8 movddup -15 * SIZE(B), %xmm9 @@ -1821,7 +1821,7 @@ subq $1, %rax jne .L84 ALIGN_4 - + .L85: movq C, CO1 movq A, AO @@ -1843,7 +1843,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 movapd -16 * SIZE(BO), %xmm4 @@ -1865,7 +1865,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1879,7 +1879,7 @@ jle .L95 ALIGN_4 -.L91: +.L91: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm0, %xmm8 @@ -1989,12 +1989,12 @@ movhpd %xmm2, 5 * SIZE(CO1) movsd %xmm3, 6 * SIZE(CO1) movhpd %xmm3, 7 * SIZE(CO1) - + addq $8 * SIZE, CO1 # coffset += 4 subq $1, I jg .L90 - ALIGN_4 + ALIGN_4 .L100: testq $2, M @@ -2011,7 +2011,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 movapd -16 * SIZE(BO), %xmm4 @@ -2032,7 +2032,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2046,7 +2046,7 @@ jle .L105 ALIGN_4 -.L101: +.L101: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm0, %xmm8 @@ -2128,7 +2128,7 @@ addq $4 * SIZE, CO1 ALIGN_4 - + .L110: testq $1, M jle .L999 @@ -2144,7 +2144,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 movsd -16 * SIZE(BO), %xmm4 @@ -2165,7 +2165,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2179,7 +2179,7 @@ jle .L115 ALIGN_4 -.L111: +.L111: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm0, %xmm8 diff --git a/kernel/x86_64/zgemm3m_kernel_4x4_penryn.S b/kernel/x86_64/zgemm3m_kernel_4x4_penryn.S index 7dd2c9155..b000dc567 100644 --- a/kernel/x86_64/zgemm3m_kernel_4x4_penryn.S +++ b/kernel/x86_64/zgemm3m_kernel_4x4_penryn.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx @@ -51,7 +51,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -122,7 +122,7 @@ #endif PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -168,7 +168,7 @@ movlps %xmm0, ALPHA_R movlps %xmm1, ALPHA_I - + subq $-16 * SIZE, A subq $-17 * SIZE, B @@ -217,7 +217,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 xorpd %xmm3, %xmm3 @@ -251,7 +251,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -266,7 +266,7 @@ jle .L15 ALIGN_3 -.L12: +.L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addpd %xmm3, %xmm11 movaps -15 * SIZE(BO), %xmm3 @@ -643,7 +643,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 movaps -17 * SIZE(BO), %xmm2 @@ -664,7 +664,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -679,7 +679,7 @@ jle .L25 ALIGN_4 -.L22: +.L22: addpd %xmm3, %xmm11 movaps -15 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 @@ -871,7 +871,7 @@ addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -891,7 +891,7 @@ leaq (, %rax, SIZE), %rax addq %rax, AO leaq (BO, %rax, 4), BO -#endif +#endif movsd -16 * SIZE(AO), %xmm0 movaps -17 * SIZE(BO), %xmm2 @@ -907,7 +907,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -922,7 +922,7 @@ jle .L35 ALIGN_4 -.L32: +.L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) shufps $0x44, %xmm0, %xmm0 @@ -1043,7 +1043,7 @@ movhps %xmm0, 1 * SIZE(CO1, LDC, 2) movlps %xmm1, 0 * SIZE(CO2, LDC, 2) movhps %xmm1, 1 * SIZE(CO2, LDC, 2) - ALIGN_4 + ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) @@ -1072,7 +1072,7 @@ movq OFFSET, %rax movq %rax, KK #endif - + movq K, %rax salq $BASE_SHIFT + 1, %rax leaq (B, %rax), BB @@ -1096,7 +1096,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO -#endif +#endif PREFETCHB -16 * SIZE(BB) subq $-4 * SIZE, BB @@ -1117,7 +1117,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1132,7 +1132,7 @@ jle .L55 ALIGN_4 -.L52: +.L52: movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 @@ -1336,7 +1336,7 @@ decq I BRANCH jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $2, M @@ -1356,7 +1356,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -1370,7 +1370,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1385,7 +1385,7 @@ jle .L65 ALIGN_4 -.L62: +.L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x4e, %xmm2, %xmm7 @@ -1508,7 +1508,7 @@ addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 - ALIGN_4 + ALIGN_4 .L70: testq $1, M @@ -1528,7 +1528,7 @@ leaq (, %rax, SIZE), %rax addq %rax, AO leaq (BO, %rax, 2), BO -#endif +#endif movsd -16 * SIZE(AO), %xmm0 movaps -17 * SIZE(BO), %xmm2 @@ -1541,7 +1541,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1556,7 +1556,7 @@ jle .L75 ALIGN_4 -.L72: +.L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) shufps $0x44, %xmm0, %xmm0 @@ -1638,7 +1638,7 @@ movhps %xmm0, 1 * SIZE(CO1) movlps %xmm1, 0 * SIZE(CO2) movhps %xmm1, 1 * SIZE(CO2) - ALIGN_4 + ALIGN_4 .L79: #if defined(TRMMKERNEL) && !defined(LEFT) @@ -1681,7 +1681,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO addq %rax, BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 @@ -1698,7 +1698,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1713,7 +1713,7 @@ jle .L95 ALIGN_4 -.L92: +.L92: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x44, %xmm2, %xmm3 @@ -1848,7 +1848,7 @@ decq I BRANCH jg .L91 - ALIGN_4 + ALIGN_4 .L100: testq $2, M @@ -1868,7 +1868,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO addq %rax, BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -1880,7 +1880,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1895,7 +1895,7 @@ jle .L105 ALIGN_4 -.L102: +.L102: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x44, %xmm2, %xmm3 @@ -1984,13 +1984,13 @@ movhps %xmm1, 3 * SIZE(CO1) addq $4 * SIZE, CO1 - ALIGN_4 + ALIGN_4 .L110: testq $1, M BRANCH jle .L999 - + #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) @@ -2003,7 +2003,7 @@ leaq (, %rax, SIZE), %rax addq %rax, AO addq %rax, BO -#endif +#endif movsd -16 * SIZE(AO), %xmm0 movsd -17 * SIZE(BO), %xmm2 @@ -2016,7 +2016,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2031,7 +2031,7 @@ jle .L115 ALIGN_4 -.L112: +.L112: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) mulsd %xmm0, %xmm2 @@ -2100,7 +2100,7 @@ movlps %xmm0, 0 * SIZE(CO1) movhps %xmm0, 1 * SIZE(CO1) - ALIGN_4 + ALIGN_4 .L999: movq 0(%rsp), %rbx diff --git a/kernel/x86_64/zgemm3m_kernel_4x4_sse2.S b/kernel/x86_64/zgemm3m_kernel_4x4_sse2.S index 3b313b381..cb90b4c8f 100644 --- a/kernel/x86_64/zgemm3m_kernel_4x4_sse2.S +++ b/kernel/x86_64/zgemm3m_kernel_4x4_sse2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -329,14 +329,14 @@ movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addpd %xmm6, %xmm15 ;\ movapd 6 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 -#endif - +#endif + #if defined(OS_LINUX) && defined(CORE_BARCELONA) .align 32768 #endif PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) @@ -395,7 +395,7 @@ movsd %xmm0, 0 + ALPHA movsd %xmm1, 8 + ALPHA - + salq $ZBASE_SHIFT, LDC #ifdef TRMMKERNEL @@ -403,7 +403,7 @@ movsd %xmm12, KK #ifndef LEFT negq KK -#endif +#endif #endif movq N, J sarq $2, J # j = (n >> 2) @@ -415,17 +415,17 @@ leaq 16 * SIZE + BUFFER, BO movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc - + #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq K, %rax sarq $2, %rax jle .L03 ALIGN_3 - + #define RPREFETCHSIZE (8 * 7 + 4) #define WPREFETCHSIZE (8 * 8 + 4) @@ -534,7 +534,7 @@ subq $1, %rax jne .L04 ALIGN_3 - + .L10: movq A, AO # aoffset = a leaq (RPREFETCHSIZE + 0) * SIZE(B), BB @@ -560,7 +560,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm1 @@ -589,7 +589,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -916,7 +916,7 @@ movhpd %xmm2, 5 * SIZE(CO1) movsd %xmm3, 6 * SIZE(CO1) movhpd %xmm3, 7 * SIZE(CO1) - + movsd 0 * SIZE(CO2), %xmm0 movhpd 1 * SIZE(CO2), %xmm0 movsd 2 * SIZE(CO2), %xmm1 @@ -951,7 +951,7 @@ movhpd %xmm2, 5 * SIZE(CO2) movsd %xmm3, 6 * SIZE(CO2) movhpd %xmm3, 7 * SIZE(CO2) - + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 movhpd 1 * SIZE(CO1, LDC, 2), %xmm0 movsd 2 * SIZE(CO1, LDC, 2), %xmm1 @@ -986,7 +986,7 @@ movhpd %xmm2, 5 * SIZE(CO1, LDC, 2) movsd %xmm3, 6 * SIZE(CO1, LDC, 2) movhpd %xmm3, 7 * SIZE(CO1, LDC, 2) - + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 movsd 2 * SIZE(CO2, LDC, 2), %xmm1 @@ -1027,7 +1027,7 @@ decq I # i -- BRANCH jg .L11 - ALIGN_3 + ALIGN_3 .L20: testq $3, M @@ -1049,7 +1049,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1068,7 +1068,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1281,7 +1281,7 @@ movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) movsd %xmm1, 2 * SIZE(CO1, LDC, 2) movhpd %xmm1, 3 * SIZE(CO1, LDC, 2) - + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 movsd 2 * SIZE(CO2, LDC, 2), %xmm1 @@ -1302,7 +1302,7 @@ addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 - ALIGN_3 + ALIGN_3 .L30: testq $1, M @@ -1321,7 +1321,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO -#endif +#endif movsd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1340,7 +1340,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1528,7 +1528,7 @@ movsd %xmm0, 0 * SIZE(CO1, LDC, 2) movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) - + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 @@ -1538,8 +1538,8 @@ movsd %xmm0, 0 * SIZE(CO2, LDC, 2) movhpd %xmm0, 1 * SIZE(CO2, LDC, 2) - ALIGN_3 - + ALIGN_3 + .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK @@ -1561,17 +1561,17 @@ .L41: /* Copying to Sub Buffer */ leaq BUFFER, BO - + #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq K, %rax sarq $2, %rax jle .L43 ALIGN_3 - + .L42: PREFETCH 56 * SIZE(B) @@ -1629,7 +1629,7 @@ decq %rax jne .L44 ALIGN_3 - + .L50: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc @@ -1652,7 +1652,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1676,7 +1676,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1872,7 +1872,7 @@ movhpd %xmm2, 5 * SIZE(CO1) movsd %xmm3, 6 * SIZE(CO1) movhpd %xmm3, 7 * SIZE(CO1) - + movsd 0 * SIZE(CO2), %xmm0 movhpd 1 * SIZE(CO2), %xmm0 movsd 2 * SIZE(CO2), %xmm1 @@ -1912,7 +1912,7 @@ addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L51 - ALIGN_3 + ALIGN_3 .L60: testq $2, M @@ -1931,7 +1931,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1950,7 +1950,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2097,7 +2097,7 @@ addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_3 + ALIGN_3 .L70: testq $1, M @@ -2116,7 +2116,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif movsd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -2135,7 +2135,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2262,8 +2262,8 @@ movsd %xmm0, 0 * SIZE(CO2) movhpd %xmm0, 1 * SIZE(CO2) - ALIGN_3 - + ALIGN_3 + .L79: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK @@ -2279,17 +2279,17 @@ .L81: /* Copying to Sub Buffer */ leaq BUFFER, BO - + #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq K, %rax sarq $3, %rax jle .L83 ALIGN_3 - + .L82: PREFETCH 56 * SIZE(B) @@ -2344,7 +2344,7 @@ decq %rax jne .L84 ALIGN_3 - + .L90: movq C, CO1 # coffset1 = c movq A, AO # aoffset = a @@ -2366,7 +2366,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -2387,7 +2387,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2525,11 +2525,11 @@ movhpd %xmm2, 5 * SIZE(CO1) movsd %xmm3, 6 * SIZE(CO1) movhpd %xmm3, 7 * SIZE(CO1) - + addq $8 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L91 - ALIGN_3 + ALIGN_3 .L100: testq $2, M @@ -2548,7 +2548,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -2564,7 +2564,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2662,7 +2662,7 @@ movhpd %xmm1, 3 * SIZE(CO1) addq $4 * SIZE, CO1 - ALIGN_3 + ALIGN_3 .L110: testq $1, M @@ -2681,7 +2681,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO -#endif +#endif movsd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -2697,7 +2697,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2786,7 +2786,7 @@ movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) ALIGN_3 - + .L999: movq %rbx, %rsp diff --git a/kernel/x86_64/zgemm3m_kernel_4x4_sse3.S b/kernel/x86_64/zgemm3m_kernel_4x4_sse3.S index 73f5fcef5..ce46dbdf0 100644 --- a/kernel/x86_64/zgemm3m_kernel_4x4_sse3.S +++ b/kernel/x86_64/zgemm3m_kernel_4x4_sse3.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M %rdi #define N %rsi #define K %rdx @@ -338,7 +338,7 @@ #endif PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) @@ -377,19 +377,19 @@ movsd %xmm0, ALPHA_R movsd %xmm1, ALPHA_I - + salq $ZBASE_SHIFT, LDC movq N, J sarq $2, J # j = (n >> 2) jle .L40 ALIGN_4 - + .L10: #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc @@ -418,7 +418,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -448,7 +448,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -465,7 +465,7 @@ NOBRANCH je .L15 -.L1X: +.L1X: KERNEL1 (16 * 0) KERNEL2 (16 * 0) KERNEL3 (16 * 0) @@ -946,7 +946,7 @@ movhpd %xmm10, 5 * SIZE(CO1) movsd %xmm11, 6 * SIZE(CO1) movhpd %xmm11, 7 * SIZE(CO1) - + movsd 0 * SIZE(CO2), %xmm8 movhpd 1 * SIZE(CO2), %xmm8 movsd 2 * SIZE(CO2), %xmm9 @@ -1016,7 +1016,7 @@ movhpd %xmm10, 5 * SIZE(CO1, LDC, 2) movsd %xmm11, 6 * SIZE(CO1, LDC, 2) movhpd %xmm11, 7 * SIZE(CO1, LDC, 2) - + movsd 0 * SIZE(CO2, LDC, 2), %xmm8 movhpd 1 * SIZE(CO2, LDC, 2), %xmm8 movsd 2 * SIZE(CO2, LDC, 2), %xmm9 @@ -1077,7 +1077,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -1093,7 +1093,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1308,7 +1308,7 @@ movhpd %xmm8, 1 * SIZE(CO1, LDC, 2) movsd %xmm9, 2 * SIZE(CO1, LDC, 2) movhpd %xmm9, 3 * SIZE(CO1, LDC, 2) - + movsd 0 * SIZE(CO2, LDC, 2), %xmm8 movhpd 1 * SIZE(CO2, LDC, 2), %xmm8 movsd 2 * SIZE(CO2, LDC, 2), %xmm9 @@ -1329,7 +1329,7 @@ addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -1347,7 +1347,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 4), BO -#endif +#endif movddup 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -1363,7 +1363,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1503,7 +1503,7 @@ movsd %xmm8, 0 * SIZE(CO1, LDC, 2) movhpd %xmm8, 1 * SIZE(CO1, LDC, 2) - + movsd 0 * SIZE(CO2, LDC, 2), %xmm8 movhpd 1 * SIZE(CO2, LDC, 2), %xmm8 @@ -1514,8 +1514,8 @@ movsd %xmm8, 0 * SIZE(CO2, LDC, 2) movhpd %xmm8, 1 * SIZE(CO2, LDC, 2) - ALIGN_4 - + ALIGN_4 + .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK @@ -1535,7 +1535,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc @@ -1557,7 +1557,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -1581,7 +1581,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1787,7 +1787,7 @@ movhpd %xmm10, 5 * SIZE(CO1) movsd %xmm11, 6 * SIZE(CO1) movhpd %xmm11, 7 * SIZE(CO1) - + movsd 0 * SIZE(CO2), %xmm8 movhpd 1 * SIZE(CO2), %xmm8 movsd 2 * SIZE(CO2), %xmm9 @@ -1828,7 +1828,7 @@ decq I # i -- jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $2, M @@ -1846,7 +1846,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -1862,7 +1862,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2011,7 +2011,7 @@ addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 - ALIGN_4 + ALIGN_4 .L70: testq $1, M @@ -2029,7 +2029,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 2), BO -#endif +#endif movddup 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -2045,7 +2045,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2144,8 +2144,8 @@ movsd %xmm8, 0 * SIZE(CO2) movhpd %xmm8, 1 * SIZE(CO2) - ALIGN_4 - + ALIGN_4 + .L79: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK @@ -2162,7 +2162,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq C, CO1 movq A, AO @@ -2183,7 +2183,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 1), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -2205,7 +2205,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2342,11 +2342,11 @@ movhpd %xmm10, 5 * SIZE(CO1) movsd %xmm11, 6 * SIZE(CO1) movhpd %xmm11, 7 * SIZE(CO1) - + addq $8 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L91 - ALIGN_4 + ALIGN_4 .L100: testq $2, M @@ -2364,7 +2364,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 1), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -2380,7 +2380,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2478,7 +2478,7 @@ movhpd %xmm9, 3 * SIZE(CO1) addq $4 * SIZE, CO1 - ALIGN_4 + ALIGN_4 .L110: testq $1, M @@ -2496,7 +2496,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (B, %rax, 1), BO -#endif +#endif movsd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -2517,7 +2517,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2591,8 +2591,8 @@ movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) - ALIGN_4 - + ALIGN_4 + .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp diff --git a/kernel/x86_64/zgemm3m_kernel_4x8_nehalem.S b/kernel/x86_64/zgemm3m_kernel_4x8_nehalem.S index 92be8fc25..8da31d22d 100644 --- a/kernel/x86_64/zgemm3m_kernel_4x8_nehalem.S +++ b/kernel/x86_64/zgemm3m_kernel_4x8_nehalem.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx @@ -51,7 +51,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -103,7 +103,7 @@ #endif PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -151,7 +151,7 @@ movlps %xmm0, ALPHA_R movlps %xmm0, ALPHA_I - + subq $-32 * SIZE, A subq $-32 * SIZE, B @@ -200,7 +200,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO -#endif +#endif prefetcht0 -32 * SIZE(BB) subq $-16 * SIZE, BB @@ -236,7 +236,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -251,7 +251,7 @@ jle .L15 ALIGN_3 -.L12: +.L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm12 @@ -608,7 +608,7 @@ decq I BRANCH jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $2, M @@ -627,7 +627,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -646,7 +646,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -661,7 +661,7 @@ jle .L25 ALIGN_3 -.L22: +.L22: addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 @@ -862,7 +862,7 @@ addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -881,7 +881,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO -#endif +#endif xorps %xmm2, %xmm2 movsd -32 * SIZE(AO), %xmm0 @@ -894,7 +894,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -909,7 +909,7 @@ jle .L35 ALIGN_3 -.L32: +.L32: pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movaps -32 * SIZE(BO), %xmm2 @@ -1023,8 +1023,8 @@ movhps %xmm2, (CO2, LDC) movlps %xmm3, (CO2, LDC, 2) movhps %xmm3, (CO2, %rax) - ALIGN_4 - + ALIGN_4 + .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addq $8, KK @@ -1071,7 +1071,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO -#endif +#endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 @@ -1093,7 +1093,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1108,7 +1108,7 @@ jle .L45 ALIGN_3 -.L42: +.L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 @@ -1309,7 +1309,7 @@ decq I BRANCH jg .L41 - ALIGN_4 + ALIGN_4 .L50: testq $2, M @@ -1328,7 +1328,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -1342,7 +1342,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1357,7 +1357,7 @@ jle .L55 ALIGN_3 -.L52: +.L52: addps %xmm1, %xmm8 pshufd $0x50, %xmm5, %xmm1 mulps %xmm0, %xmm1 @@ -1474,7 +1474,7 @@ addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 - ALIGN_4 + ALIGN_4 .L60: testq $1, M @@ -1493,7 +1493,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif xorps %xmm2, %xmm2 movsd -32 * SIZE(AO), %xmm0 @@ -1505,7 +1505,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1520,7 +1520,7 @@ jle .L65 ALIGN_3 -.L62: +.L62: pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movaps -32 * SIZE(BO), %xmm2 @@ -1601,8 +1601,8 @@ movhps %xmm0, (CO1, LDC) movlps %xmm1, (CO2) movhps %xmm1, (CO2, LDC) - ALIGN_4 - + ALIGN_4 + .L69: #if defined(TRMMKERNEL) && !defined(LEFT) addq $4, KK @@ -1645,7 +1645,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO -#endif +#endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 @@ -1662,7 +1662,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1677,7 +1677,7 @@ jle .L75 ALIGN_3 -.L72: +.L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 @@ -1798,7 +1798,7 @@ decq I BRANCH jg .L71 - ALIGN_4 + ALIGN_4 .L80: testq $2, M @@ -1817,7 +1817,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -1831,7 +1831,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1846,7 +1846,7 @@ jle .L85 ALIGN_3 -.L82: +.L82: addps %xmm1, %xmm8 movsd -32 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 @@ -1931,7 +1931,7 @@ addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 - ALIGN_4 + ALIGN_4 .L90: testq $1, M @@ -1950,7 +1950,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO -#endif +#endif xorps %xmm2, %xmm2 movsd -32 * SIZE(AO), %xmm0 @@ -1962,7 +1962,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1977,7 +1977,7 @@ jle .L95 ALIGN_3 -.L92: +.L92: pshufd $0x00, %xmm0, %xmm1 addps %xmm2, %xmm8 movsd -32 * SIZE(BO), %xmm2 @@ -2050,8 +2050,8 @@ movlps %xmm0, (CO1) movhps %xmm0, (CO2) - ALIGN_4 - + ALIGN_4 + .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK @@ -2093,7 +2093,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO -#endif +#endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 @@ -2106,7 +2106,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2121,7 +2121,7 @@ jle .L105 ALIGN_3 -.L102: +.L102: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm1, %xmm8 @@ -2208,7 +2208,7 @@ decq I BRANCH jg .L101 - ALIGN_4 + ALIGN_4 .L110: testq $2, M @@ -2227,7 +2227,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -2238,7 +2238,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2253,7 +2253,7 @@ jle .L115 ALIGN_3 -.L112: +.L112: addps %xmm1, %xmm8 movss -32 * SIZE(BO), %xmm1 unpcklps %xmm1, %xmm1 @@ -2329,7 +2329,7 @@ movhps %xmm0, 2 * SIZE(CO1) addq $4 * SIZE, CO1 - ALIGN_4 + ALIGN_4 .L120: testq $1, M @@ -2348,7 +2348,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO -#endif +#endif xorps %xmm2, %xmm2 movss -32 * SIZE(AO), %xmm0 @@ -2359,7 +2359,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2374,7 +2374,7 @@ jle .L125 ALIGN_3 -.L122: +.L122: addss %xmm2, %xmm8 movss -32 * SIZE(BO), %xmm2 mulss %xmm0, %xmm2 @@ -2441,8 +2441,8 @@ addps %xmm4, %xmm0 movlps %xmm0, (CO1) - ALIGN_4 - + ALIGN_4 + .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp diff --git a/kernel/x86_64/zgemm3m_kernel_8x4_barcelona.S b/kernel/x86_64/zgemm3m_kernel_8x4_barcelona.S index 80c85244a..3dbc0dd8c 100644 --- a/kernel/x86_64/zgemm3m_kernel_8x4_barcelona.S +++ b/kernel/x86_64/zgemm3m_kernel_8x4_barcelona.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -49,14 +49,14 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi #define CO1 %r15 #define CO2 %r12 #define BB %rbp - + #ifndef WINDOWS_ABI #define STACKSIZE 64 @@ -299,7 +299,7 @@ movaps 100 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm0, %xmm2 ;\ addq $16 * SIZE, %rax - + #define KERNEL_SUB1(xx) \ mulps %xmm1, %xmm0 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ @@ -407,7 +407,7 @@ #endif PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -464,13 +464,13 @@ movss %xmm1, 4 + ALPHA movss %xmm0, 8 + ALPHA movss %xmm1, 12 + ALPHA - + #ifdef TRMMKERNEL movsd %xmm4, OFFSET movsd %xmm4, KK #ifndef LEFT negq KK -#endif +#endif #endif subq $-32 * SIZE, A @@ -485,16 +485,16 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO - + movq K, %rax sarq $2, %rax jle .L03 ALIGN_4 - + .L02: prefetch (RPREFETCHSIZE + 0) * SIZE(B) @@ -581,7 +581,7 @@ decq %rax jne .L04 ALIGN_4 - + .L10: movq C, CO1 leaq (C, LDC, 1), CO2 @@ -606,7 +606,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO -#endif +#endif prefetch 0 * SIZE(BB) prefetch 16 * SIZE(BB) @@ -638,7 +638,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -866,7 +866,7 @@ movhps %xmm5, 10 * SIZE(CO2) movlps %xmm13, 12 * SIZE(CO2) movhps %xmm13, 14 * SIZE(CO2) - + movups 0 * SIZE(CO1, LDC, 2), %xmm0 movups 4 * SIZE(CO1, LDC, 2), %xmm1 movups 8 * SIZE(CO1, LDC, 2), %xmm2 @@ -929,7 +929,7 @@ addq $16 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $4, M @@ -946,7 +946,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 @@ -966,7 +966,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1196,7 +1196,7 @@ addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L30: testq $2, M @@ -1213,7 +1213,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -24 * SIZE(AO), %xmm10 @@ -1233,7 +1233,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1439,7 +1439,7 @@ addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L40: testq $1, M @@ -1457,7 +1457,7 @@ leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO leaq (BO, %rax, 8), BO -#endif +#endif movss -32 * SIZE(AO), %xmm8 movss -28 * SIZE(AO), %xmm10 @@ -1477,7 +1477,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1676,8 +1676,8 @@ addps %xmm8, %xmm4 movlps %xmm4, 0 * SIZE(CO2, LDC, 2) - ALIGN_4 - + ALIGN_4 + .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK @@ -1694,16 +1694,16 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO - + movq K, %rax sarq $2, %rax jle .L53 ALIGN_4 - + .L52: prefetch (RPREFETCHSIZE + 0) * SIZE(B) @@ -1766,7 +1766,7 @@ decq %rax jne .L54 ALIGN_4 - + .L60: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc @@ -1789,7 +1789,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 @@ -1814,7 +1814,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2039,7 +2039,7 @@ addq $16 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L61 - ALIGN_4 + ALIGN_4 .L70: testq $4, M @@ -2057,7 +2057,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 @@ -2077,7 +2077,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2219,7 +2219,7 @@ addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L80: testq $2, M @@ -2236,7 +2236,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -24 * SIZE(AO), %xmm10 @@ -2256,7 +2256,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2395,7 +2395,7 @@ addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L90: testq $1, M @@ -2412,7 +2412,7 @@ leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO -#endif +#endif movss -32 * SIZE(AO), %xmm8 movss -28 * SIZE(AO), %xmm10 @@ -2432,7 +2432,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2566,8 +2566,8 @@ addps %xmm8, %xmm2 movlps %xmm2, 0 * SIZE(CO2) - ALIGN_4 - + ALIGN_4 + .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK @@ -2584,16 +2584,16 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO - + movq K, %rax sarq $3, %rax jle .L103 ALIGN_4 - + .L102: prefetch (RPREFETCHSIZE + 0) * SIZE(B) @@ -2650,7 +2650,7 @@ decq %rax jne .L104 ALIGN_4 - + .L110: movq C, CO1 # coffset1 = c movq A, AO # aoffset = a @@ -2672,7 +2672,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 @@ -2696,7 +2696,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2836,7 +2836,7 @@ addq $16 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L111 - ALIGN_4 + ALIGN_4 .L120: testq $4, M @@ -2853,7 +2853,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 @@ -2871,7 +2871,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2968,7 +2968,7 @@ movhps %xmm0, 6 * SIZE(CO1) addq $8 * SIZE, CO1 # coffset += 4 - ALIGN_4 + ALIGN_4 .L130: testq $2, M @@ -2985,7 +2985,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -24 * SIZE(AO), %xmm10 @@ -3003,7 +3003,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -3101,7 +3101,7 @@ movhps %xmm2, 2 * SIZE(CO1) addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 + ALIGN_4 .L140: testq $1, M @@ -3118,7 +3118,7 @@ leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif movss -32 * SIZE(AO), %xmm8 movss -28 * SIZE(AO), %xmm10 @@ -3136,7 +3136,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -3221,8 +3221,8 @@ addps %xmm8, %xmm2 movlps %xmm2, 0 * SIZE(CO1) - ALIGN_4 - + ALIGN_4 + .L999: movq %rbx, %rsp movq 0(%rsp), %rbx diff --git a/kernel/x86_64/zgemm3m_kernel_8x4_core2.S b/kernel/x86_64/zgemm3m_kernel_8x4_core2.S index 2ddbb5cfb..0b97d85a9 100644 --- a/kernel/x86_64/zgemm3m_kernel_8x4_core2.S +++ b/kernel/x86_64/zgemm3m_kernel_8x4_core2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -49,7 +49,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -95,7 +95,7 @@ #endif PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) @@ -156,7 +156,7 @@ movsd %xmm12, KK #ifndef LEFT negq KK -#endif +#endif #endif movq OLD_M, M @@ -173,16 +173,16 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq 32 * SIZE + BUFFER, BO - + movq K, %rax sarq $2, %rax jle .L05 ALIGN_4 - + .L02: prefetcht0 (PREFETCH_R + 0) * SIZE(B) movaps -32 * SIZE(B), %xmm3 @@ -261,7 +261,7 @@ subq $1, %rax jne .L06 ALIGN_4 - + .L10: movq B, BB @@ -286,7 +286,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO -#endif +#endif pxor %xmm8, %xmm8 movaps -32 * SIZE(AO), %xmm0 @@ -320,7 +320,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -334,7 +334,7 @@ jle .L15 ALIGN_4 -.L12: +.L12: addps %xmm2, %xmm10 movaps -32 * SIZE(BO), %xmm2 addps %xmm3, %xmm14 @@ -588,7 +588,7 @@ movhps %xmm5, 10 * SIZE(CO2) movlps %xmm13, 12 * SIZE(CO2) movhps %xmm13, 14 * SIZE(CO2) - + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 movhps 2 * SIZE(CO1, LDC, 2), %xmm0 movsd 4 * SIZE(CO1, LDC, 2), %xmm1 @@ -654,12 +654,12 @@ movhps %xmm5, 10 * SIZE(CO2, LDC, 2) movlps %xmm15, 12 * SIZE(CO2, LDC, 2) movhps %xmm15, 14 * SIZE(CO2, LDC, 2) - + addq $16 * SIZE, CO1 addq $16 * SIZE, CO2 subq $1, I jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $4, M @@ -676,7 +676,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -688,7 +688,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -702,7 +702,7 @@ jle .L25 ALIGN_4 -.L21: +.L21: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps -32 * SIZE(AO), %xmm0 @@ -846,7 +846,7 @@ movhps %xmm4, 2 * SIZE(CO2) movlps %xmm9, 4 * SIZE(CO2) movhps %xmm9, 6 * SIZE(CO2) - + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 movhps 2 * SIZE(CO1, LDC, 2), %xmm0 movsd 4 * SIZE(CO1, LDC, 2), %xmm1 @@ -886,7 +886,7 @@ addq $8 * SIZE, CO1 addq $8 * SIZE, CO2 ALIGN_4 - + .L30: testq $2, M jle .L40 @@ -902,7 +902,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -919,7 +919,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -933,7 +933,7 @@ jle .L35 ALIGN_4 -.L31: +.L31: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd -32 * SIZE(AO), %xmm0 @@ -1061,7 +1061,7 @@ movlps %xmm4, 0 * SIZE(CO2) movhps %xmm4, 2 * SIZE(CO2) - + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 movhps 2 * SIZE(CO1, LDC, 2), %xmm0 @@ -1102,7 +1102,7 @@ leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO leaq (BO, %rax, 8), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -1119,7 +1119,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1133,7 +1133,7 @@ jle .L45 ALIGN_4 -.L41: +.L41: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movss -32 * SIZE(AO), %xmm0 @@ -1257,7 +1257,7 @@ addps %xmm0, %xmm4 movlps %xmm4, 0 * SIZE(CO2) - + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 pshufd $0x50, %xmm10, %xmm4 @@ -1293,11 +1293,11 @@ .L51: /* Copying to Sub Buffer */ leaq BUFFER, BO - + #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq K, %rax sarq $3, %rax @@ -1305,7 +1305,7 @@ addq %rax, %rax ALIGN_4 - + .L52: movaps -32 * SIZE(B), %xmm3 movaps -28 * SIZE(B), %xmm7 @@ -1361,7 +1361,7 @@ subq $1, %rax jne .L54 ALIGN_4 - + .L55: movq C, CO1 leaq (C, LDC, 1), CO2 @@ -1384,7 +1384,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -1399,7 +1399,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1413,7 +1413,7 @@ jle .L65 ALIGN_4 -.L61: +.L61: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps -32 * SIZE(AO), %xmm0 @@ -1597,12 +1597,12 @@ movhps %xmm5, 10 * SIZE(CO2) movlps %xmm13, 12 * SIZE(CO2) movhps %xmm13, 14 * SIZE(CO2) - + addq $16 * SIZE, CO1 addq $16 * SIZE, CO2 subq $1, I jg .L60 - ALIGN_4 + ALIGN_4 .L70: testq $4, M @@ -1619,7 +1619,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -1631,7 +1631,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1645,7 +1645,7 @@ jle .L75 ALIGN_4 -.L71: +.L71: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps -32 * SIZE(AO), %xmm0 @@ -1760,7 +1760,7 @@ addq $8 * SIZE, CO1 addq $8 * SIZE, CO2 ALIGN_4 - + .L80: testq $2, M jle .L90 @@ -1776,7 +1776,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -1788,7 +1788,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1802,7 +1802,7 @@ jle .L85 ALIGN_4 -.L81: +.L81: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd -32 * SIZE(AO), %xmm0 @@ -1919,7 +1919,7 @@ leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -1931,7 +1931,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1945,7 +1945,7 @@ jle .L95 ALIGN_4 -.L91: +.L91: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) @@ -2059,11 +2059,11 @@ .L101: /* Copying to Sub Buffer */ leaq BUFFER, BO - + #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq K, %rax sarq $4, %rax @@ -2071,7 +2071,7 @@ addq %rax, %rax ALIGN_4 - + .L102: movss -32 * SIZE(B), %xmm0 movss -31 * SIZE(B), %xmm1 @@ -2125,7 +2125,7 @@ subq $1, %rax jne .L104 ALIGN_4 - + .L105: movq C, CO1 movq A, AO @@ -2147,7 +2147,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -2161,7 +2161,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2175,7 +2175,7 @@ jle .L115 ALIGN_4 -.L111: +.L111: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps -32 * SIZE(AO), %xmm0 @@ -2294,7 +2294,7 @@ addq $16 * SIZE, CO1 subq $1, I jg .L110 - ALIGN_4 + ALIGN_4 .L120: testq $4, M @@ -2311,7 +2311,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -2323,7 +2323,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2337,7 +2337,7 @@ jle .L125 ALIGN_4 -.L121: +.L121: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movaps -32 * SIZE(AO), %xmm0 @@ -2419,7 +2419,7 @@ addq $8 * SIZE, CO1 ALIGN_4 - + .L130: testq $2, M jle .L140 @@ -2435,7 +2435,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -2447,7 +2447,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2461,7 +2461,7 @@ jle .L135 ALIGN_4 -.L131: +.L131: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movsd -32 * SIZE(AO), %xmm0 @@ -2549,7 +2549,7 @@ leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -2561,7 +2561,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2575,7 +2575,7 @@ jle .L145 ALIGN_4 -.L141: +.L141: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movss -32 * SIZE(AO), %xmm0 diff --git a/kernel/x86_64/zgemm3m_kernel_8x4_penryn.S b/kernel/x86_64/zgemm3m_kernel_8x4_penryn.S index bf2d96e52..1255c2cd5 100644 --- a/kernel/x86_64/zgemm3m_kernel_8x4_penryn.S +++ b/kernel/x86_64/zgemm3m_kernel_8x4_penryn.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx @@ -51,7 +51,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -103,7 +103,7 @@ #endif PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -150,7 +150,7 @@ movlps %xmm0, ALPHA_R movlps %xmm0, ALPHA_I - + subq $-32 * SIZE, A subq $-32 * SIZE, B @@ -199,7 +199,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 8), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 xorpd %xmm3, %xmm3 @@ -234,7 +234,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -249,7 +249,7 @@ jle .L15 ALIGN_3 -.L12: +.L12: PREFETCH -32 * SIZE(PREA) addps %xmm6, %xmm10 addps %xmm3, %xmm14 @@ -654,7 +654,7 @@ movhps %xmm5, 10 * SIZE(CO2) movlps %xmm13, 12 * SIZE(CO2) movhps %xmm13, 14 * SIZE(CO2) - + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 movhps 2 * SIZE(CO1, LDC, 2), %xmm0 movsd 4 * SIZE(CO1, LDC, 2), %xmm1 @@ -720,13 +720,13 @@ movhps %xmm5, 10 * SIZE(CO2, LDC, 2) movlps %xmm15, 12 * SIZE(CO2, LDC, 2) movhps %xmm15, 14 * SIZE(CO2, LDC, 2) - + addq $16 * SIZE, CO1 addq $16 * SIZE, CO2 decq I BRANCH jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $4, M @@ -745,7 +745,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 xorps %xmm3, %xmm3 @@ -765,7 +765,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -780,7 +780,7 @@ jle .L25 ALIGN_3 -.L22: +.L22: addps %xmm6, %xmm10 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x39, %xmm2, %xmm7 @@ -939,7 +939,7 @@ movhps %xmm4, 2 * SIZE(CO2) movlps %xmm9, 4 * SIZE(CO2) movhps %xmm9, 6 * SIZE(CO2) - + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 movhps 2 * SIZE(CO1, LDC, 2), %xmm0 movsd 4 * SIZE(CO1, LDC, 2), %xmm1 @@ -978,7 +978,7 @@ addq $8 * SIZE, CO1 addq $8 * SIZE, CO2 - ALIGN_4 + ALIGN_4 .L30: testq $2, M @@ -997,7 +997,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 xorps %xmm3, %xmm3 @@ -1016,7 +1016,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1031,7 +1031,7 @@ jle .L35 ALIGN_3 -.L32: +.L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x44, %xmm0, %xmm1 @@ -1157,7 +1157,7 @@ addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 - ALIGN_4 + ALIGN_4 .L40: testq $1, M @@ -1176,7 +1176,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif movsd -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -1188,7 +1188,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1203,7 +1203,7 @@ jle .L45 ALIGN_3 -.L42: +.L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x00, %xmm0, %xmm1 @@ -1338,7 +1338,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 8), AO leaq (BO, %rax, 2), BO -#endif +#endif prefetcht2 -32 * SIZE(BB) subq $-8 * SIZE, BB @@ -1365,7 +1365,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1380,7 +1380,7 @@ jle .L55 ALIGN_3 -.L52: +.L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm3, %xmm8 @@ -1569,13 +1569,13 @@ movhps %xmm5, 10 * SIZE(CO2) movlps %xmm11, 12 * SIZE(CO2) movhps %xmm11, 14 * SIZE(CO2) - + addq $16 * SIZE, CO1 addq $16 * SIZE, CO2 decq I BRANCH jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $4, M @@ -1594,7 +1594,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 xorps %xmm3, %xmm3 @@ -1611,7 +1611,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1626,7 +1626,7 @@ jle .L65 ALIGN_3 -.L62: +.L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm3, %xmm8 @@ -1748,7 +1748,7 @@ addq $8 * SIZE, CO1 addq $8 * SIZE, CO2 - ALIGN_4 + ALIGN_4 .L70: testq $2, M @@ -1767,7 +1767,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 xorps %xmm3, %xmm3 @@ -1780,7 +1780,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1795,7 +1795,7 @@ jle .L75 ALIGN_3 -.L72: +.L72: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm3, %xmm8 @@ -1883,7 +1883,7 @@ addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 - ALIGN_4 + ALIGN_4 .L80: testq $1, M @@ -1902,7 +1902,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO -#endif +#endif movsd -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -1914,7 +1914,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1929,7 +1929,7 @@ jle .L85 ALIGN_3 -.L82: +.L82: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x00, %xmm0, %xmm1 @@ -2048,7 +2048,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 8), AO leaq (BO, %rax, 1), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -2064,7 +2064,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2079,7 +2079,7 @@ jle .L95 ALIGN_3 -.L92: +.L92: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x00, %xmm2, %xmm3 @@ -2198,7 +2198,7 @@ decq I BRANCH jg .L91 - ALIGN_4 + ALIGN_4 .L100: testq $4, M @@ -2217,7 +2217,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -2229,7 +2229,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2244,7 +2244,7 @@ jle .L105 ALIGN_3 -.L102: +.L102: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x00, %xmm2, %xmm3 @@ -2328,7 +2328,7 @@ movhps %xmm8, 6 * SIZE(CO1) addq $8 * SIZE, CO1 - ALIGN_4 + ALIGN_4 .L110: testq $2, M @@ -2347,7 +2347,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 xorps %xmm3, %xmm3 @@ -2360,7 +2360,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2375,7 +2375,7 @@ jle .L115 ALIGN_3 -.L112: +.L112: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x00, %xmm2, %xmm3 @@ -2449,7 +2449,7 @@ movhps %xmm4, 2 * SIZE(CO1) addq $4 * SIZE, CO1 - ALIGN_4 + ALIGN_4 .L120: testq $1, M @@ -2468,7 +2468,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO -#endif +#endif movss -32 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -2480,7 +2480,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2495,7 +2495,7 @@ jle .L125 ALIGN_3 -.L122: +.L122: mulss %xmm0, %xmm2 movss -31 * SIZE(AO), %xmm0 addss %xmm2, %xmm8 diff --git a/kernel/x86_64/zgemm3m_kernel_8x4_sse.S b/kernel/x86_64/zgemm3m_kernel_8x4_sse.S index 6bd914802..a3c69163e 100644 --- a/kernel/x86_64/zgemm3m_kernel_8x4_sse.S +++ b/kernel/x86_64/zgemm3m_kernel_8x4_sse.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi @@ -56,7 +56,7 @@ #define CO1 %r15 #define CO2 %rbp #define BB %r12 - + #ifndef WINDOWS_ABI #define STACKSIZE 64 @@ -276,7 +276,7 @@ addps %xmm5, %xmm14 ;\ movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addps %xmm6, %xmm15 ;\ - movaps -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 + movaps -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 #define KERNEL5(xx) \ mulps %xmm0, %xmm1 ;\ @@ -342,7 +342,7 @@ #endif PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -388,7 +388,7 @@ #endif EMMS - + movq %rsp, %rbx # save old stack subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-1024, %rsp # align stack @@ -402,13 +402,13 @@ movss %xmm1, 4 + ALPHA movss %xmm0, 8 + ALPHA movss %xmm1, 12 + ALPHA - + #ifdef TRMMKERNEL movsd %xmm4, OFFSET movsd %xmm4, KK #ifndef LEFT negq KK -#endif +#endif #endif subq $-32 * SIZE, A @@ -423,18 +423,18 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO - + movq K, %rax sarq $2, %rax jle .L03 addq %rax, %rax ALIGN_4 - + .L02: PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) @@ -457,7 +457,7 @@ punpckldq %mm5, %mm5 punpckldq %mm6, %mm6 punpckldq %mm7, %mm7 - + movq %mm0, 0 * SIZE(BO) movq %mm0, 2 * SIZE(BO) movq %mm1, 4 * SIZE(BO) @@ -517,7 +517,7 @@ decq %rax jne .L04 ALIGN_4 - + .L10: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc @@ -545,7 +545,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 movaps -32 * SIZE(BO), %xmm1 @@ -574,7 +574,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -925,7 +925,7 @@ movhps %xmm5, 10 * SIZE(CO2) movlps %xmm13, 12 * SIZE(CO2) movhps %xmm13, 14 * SIZE(CO2) - + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 movhps 2 * SIZE(CO1, LDC, 2), %xmm0 movsd 4 * SIZE(CO1, LDC, 2), %xmm1 @@ -996,7 +996,7 @@ addq $16 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $4, M @@ -1013,7 +1013,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 @@ -1033,7 +1033,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1275,7 +1275,7 @@ addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L30: testq $2, M @@ -1292,7 +1292,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -24 * SIZE(AO), %xmm10 @@ -1312,7 +1312,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1524,7 +1524,7 @@ addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L40: testq $1, M @@ -1542,7 +1542,7 @@ leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO leaq (BO, %rax, 8), BO -#endif +#endif movss -32 * SIZE(AO), %xmm8 movss -28 * SIZE(AO), %xmm10 @@ -1562,7 +1562,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1763,8 +1763,8 @@ addps %xmm8, %xmm4 movlps %xmm4, 0 * SIZE(CO2, LDC, 2) - ALIGN_4 - + ALIGN_4 + .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK @@ -1781,16 +1781,16 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO - + movq K, %rax sarq $2, %rax jle .L53 ALIGN_4 - + .L52: #if defined(PENTIUM4) || defined(GENERIC) movss 0 * SIZE(B), %xmm0 @@ -1846,7 +1846,7 @@ punpckldq %mm5, %mm5 punpckldq %mm6, %mm6 punpckldq %mm7, %mm7 - + movq %mm0, 0 * SIZE(BO) movq %mm0, 2 * SIZE(BO) movq %mm1, 4 * SIZE(BO) @@ -1909,7 +1909,7 @@ decq %rax jne .L54 ALIGN_4 - + .L60: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc @@ -1932,7 +1932,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 @@ -1957,7 +1957,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2197,7 +2197,7 @@ addq $16 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L61 - ALIGN_4 + ALIGN_4 .L70: testq $4, M @@ -2215,7 +2215,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 @@ -2235,7 +2235,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2384,7 +2384,7 @@ addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L80: testq $2, M @@ -2401,7 +2401,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -24 * SIZE(AO), %xmm10 @@ -2421,7 +2421,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2562,7 +2562,7 @@ addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L90: testq $1, M @@ -2579,7 +2579,7 @@ leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO -#endif +#endif movss -32 * SIZE(AO), %xmm8 movss -28 * SIZE(AO), %xmm10 @@ -2599,7 +2599,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2733,8 +2733,8 @@ addps %xmm8, %xmm2 movlps %xmm2, 0 * SIZE(CO2) - ALIGN_4 - + ALIGN_4 + .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK @@ -2751,16 +2751,16 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO - + movq K, %rax sarq $3, %rax jle .L103 ALIGN_4 - + .L102: #if defined(PENTIUM4) || defined(GENERIC) @@ -2817,7 +2817,7 @@ punpckldq %mm5, %mm5 punpckldq %mm6, %mm6 punpckldq %mm7, %mm7 - + movq %mm0, 0 * SIZE(BO) movq %mm0, 2 * SIZE(BO) movq %mm1, 4 * SIZE(BO) @@ -2869,7 +2869,7 @@ decq %rax jne .L104 ALIGN_4 - + .L110: movq C, CO1 # coffset1 = c movq A, AO # aoffset = a @@ -2891,7 +2891,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 @@ -2915,7 +2915,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -3068,7 +3068,7 @@ addq $16 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L111 - ALIGN_4 + ALIGN_4 .L120: testq $4, M @@ -3085,7 +3085,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -16 * SIZE(AO), %xmm10 @@ -3103,7 +3103,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -3207,7 +3207,7 @@ movhps %xmm0, 6 * SIZE(CO1) addq $8 * SIZE, CO1 # coffset += 4 - ALIGN_4 + ALIGN_4 .L130: testq $2, M @@ -3224,7 +3224,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm8 movaps -24 * SIZE(AO), %xmm10 @@ -3242,7 +3242,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -3343,7 +3343,7 @@ movhps %xmm2, 2 * SIZE(CO1) addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 + ALIGN_4 .L140: testq $1, M @@ -3360,7 +3360,7 @@ leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif movss -32 * SIZE(AO), %xmm8 movss -28 * SIZE(AO), %xmm10 @@ -3378,7 +3378,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -3465,8 +3465,8 @@ addps %xmm8, %xmm2 movlps %xmm2, 0 * SIZE(CO1) - ALIGN_4 - + ALIGN_4 + .L999: movq %rbx, %rsp EMMS diff --git a/kernel/x86_64/zgemm3m_kernel_8x4_sse3.S b/kernel/x86_64/zgemm3m_kernel_8x4_sse3.S index 67537a702..adf0a5331 100644 --- a/kernel/x86_64/zgemm3m_kernel_8x4_sse3.S +++ b/kernel/x86_64/zgemm3m_kernel_8x4_sse3.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M %rdi #define N %rsi #define K %rdx @@ -54,7 +54,7 @@ #define CO1 %r14 #define CO2 %r15 #define BB %rbp - + #ifndef WINDOWS_ABI #define STACKSIZE 64 @@ -332,7 +332,7 @@ #endif PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -382,18 +382,18 @@ andq $-1024, %rsp # align stack STACK_TOUCHING - + movss %xmm0, 0 + ALPHA movss %xmm1, 4 + ALPHA movss %xmm0, 8 + ALPHA movss %xmm1, 12 + ALPHA - + #ifdef TRMMKERNEL movsd %xmm4, OFFSET movsd %xmm4, KK #ifndef LEFT negq KK -#endif +#endif #endif salq $ZBASE_SHIFT, LDC @@ -406,16 +406,16 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO - + movq K, %rax sarq $2, %rax jle .L03 ALIGN_4 - + .L02: movddup 0 * SIZE(B), %xmm0 movddup 2 * SIZE(B), %xmm1 @@ -464,7 +464,7 @@ decq %rax jne .L04 ALIGN_4 - + .L10: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc @@ -493,7 +493,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -524,7 +524,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -540,7 +540,7 @@ salq $4, %rax je .L15 -.L1X: +.L1X: KERNEL1 (64 * 0) KERNEL2 (64 * 0) KERNEL3 (64 * 0) @@ -927,7 +927,7 @@ addq $16 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $4, M @@ -944,7 +944,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -964,7 +964,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1203,7 +1203,7 @@ addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L30: testq $2, M @@ -1220,7 +1220,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif movddup 0 * SIZE(AO), %xmm8 movddup 8 * SIZE(AO), %xmm10 @@ -1237,7 +1237,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1405,7 +1405,7 @@ addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L40: testq $1, M @@ -1422,7 +1422,7 @@ leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO -#endif +#endif movss 0 * SIZE(AO), %xmm8 movss 4 * SIZE(AO), %xmm10 @@ -1437,7 +1437,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1555,8 +1555,8 @@ movhps %xmm12, 0 * SIZE(CO2) movlps %xmm0, 0 * SIZE(CO1, LDC, 2) movhps %xmm0, 0 * SIZE(CO2, LDC, 2) - ALIGN_4 - + ALIGN_4 + .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addl $4, KK @@ -1573,16 +1573,16 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO - + movq K, %rax sarq $3, %rax jle .L53 ALIGN_4 - + .L52: movddup 0 * SIZE(B), %xmm0 movddup 2 * SIZE(B), %xmm1 @@ -1628,7 +1628,7 @@ decq %rax jne .L54 ALIGN_4 - + .L60: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc @@ -1651,7 +1651,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -1673,7 +1673,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1914,7 +1914,7 @@ addq $16 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L61 - ALIGN_4 + ALIGN_4 .L70: testq $4, M @@ -1931,7 +1931,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movsldup 0 * SIZE(BO), %xmm9 @@ -1948,7 +1948,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2098,7 +2098,7 @@ addq $8 * SIZE, CO1 # coffset += 4 addq $8 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L80: testq $2, M @@ -2115,7 +2115,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO -#endif +#endif movddup 0 * SIZE(AO), %xmm8 movddup 8 * SIZE(AO), %xmm10 @@ -2130,7 +2130,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2242,7 +2242,7 @@ addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L90: testq $1, M @@ -2259,7 +2259,7 @@ leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif movss 0 * SIZE(AO), %xmm8 movss 4 * SIZE(AO), %xmm10 @@ -2274,7 +2274,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2376,8 +2376,8 @@ movlps %xmm12, 0 * SIZE(CO1) movhps %xmm12, 0 * SIZE(CO2) - ALIGN_4 - + ALIGN_4 + .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK @@ -2393,16 +2393,16 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO - + movq K, %rax sarq $3, %rax jle .L103 ALIGN_4 - + .L102: movss 0 * SIZE(B), %xmm0 @@ -2455,7 +2455,7 @@ decq %rax jne .L104 ALIGN_4 - + .L110: movq C, CO1 # coffset1 = c movq A, AO # aoffset = a @@ -2477,7 +2477,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -2498,7 +2498,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2654,7 +2654,7 @@ addq $16 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L111 - ALIGN_4 + ALIGN_4 .L120: testq $4, M @@ -2672,7 +2672,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -2693,7 +2693,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2795,7 +2795,7 @@ movhps %xmm0, 6 * SIZE(CO1) addq $8 * SIZE, CO1 # coffset += 4 - ALIGN_4 + ALIGN_4 .L130: testq $2, M @@ -2812,7 +2812,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 0 * SIZE(BO), %xmm9 @@ -2829,7 +2829,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2923,7 +2923,7 @@ movhps %xmm12, 2 * SIZE(CO1) addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 + ALIGN_4 .L140: testq $1, M @@ -2940,7 +2940,7 @@ leaq (, %rax, 4), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO -#endif +#endif movss 0 * SIZE(AO), %xmm8 movss 4 * SIZE(AO), %xmm10 @@ -2957,7 +2957,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -3043,8 +3043,8 @@ addps %xmm8, %xmm12 movlps %xmm12, 0 * SIZE(CO1) - ALIGN_4 - + ALIGN_4 + .L999: movq %rbx, %rsp movq 0(%rsp), %rbx diff --git a/kernel/x86_64/zgemm_beta.S b/kernel/x86_64/zgemm_beta.S index ffc775b03..1612d92f1 100644 --- a/kernel/x86_64/zgemm_beta.S +++ b/kernel/x86_64/zgemm_beta.S @@ -71,7 +71,7 @@ PROLOGUE PROFCODE - + #ifdef WINDOWS_ABI subq $STACKSIZE, %rsp diff --git a/kernel/x86_64/zgemm_kernel_1x4_nehalem.S b/kernel/x86_64/zgemm_kernel_1x4_nehalem.S index 4ddfc488b..0d6acf330 100644 --- a/kernel/x86_64/zgemm_kernel_1x4_nehalem.S +++ b/kernel/x86_64/zgemm_kernel_1x4_nehalem.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx @@ -51,7 +51,7 @@ #define B %r8 #define C %r9 #define LDC %rbp - + #define I %r11 #define AO %rdi #define BO %rsi @@ -114,7 +114,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -160,7 +160,7 @@ movlps %xmm0, ALPHA_R movlps %xmm1, ALPHA_I - + subq $-16 * SIZE, A subq $-16 * SIZE, B @@ -174,7 +174,7 @@ movq %r11, OFFSET #ifndef LEFT negq %r11 -#endif +#endif movq %r11, KK #endif testq M, M @@ -219,7 +219,7 @@ salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif PADDING xorps %xmm1, %xmm1 @@ -248,7 +248,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -263,7 +263,7 @@ jle .L15 ALIGN_3 -.L12: +.L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm12 @@ -635,7 +635,7 @@ salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO -#endif +#endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 @@ -655,7 +655,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -670,7 +670,7 @@ jle .L25 ALIGN_3 -.L22: +.L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm8 @@ -896,7 +896,7 @@ salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO -#endif +#endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 @@ -913,7 +913,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -928,7 +928,7 @@ jle .L35 ALIGN_3 -.L32: +.L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm8 @@ -1062,7 +1062,7 @@ decq I BRANCH jg .L31 - ALIGN_4 + ALIGN_4 .L999: movq 0(%rsp), %rbx diff --git a/kernel/x86_64/zgemm_kernel_2x1_atom.S b/kernel/x86_64/zgemm_kernel_2x1_atom.S index be42e036d..d9f320a8a 100644 --- a/kernel/x86_64/zgemm_kernel_2x1_atom.S +++ b/kernel/x86_64/zgemm_kernel_2x1_atom.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M %rdi #define N %rsi #define K %rdx @@ -54,7 +54,7 @@ #define BO %r14 #define CO1 %r15 #define BB %rbp - + #ifndef WINDOWS_ABI #define STACKSIZE 128 @@ -113,9 +113,9 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp - + movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) @@ -165,7 +165,7 @@ movsd %xmm4, KK #ifndef LEFT negq KK -#endif +#endif #endif salq $ZBASE_SHIFT, LDC @@ -179,7 +179,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq C, CO1 addq LDC, C @@ -206,7 +206,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO -#endif +#endif prefetcht0 0 * SIZE(BB) subq $-8 * SIZE, BB @@ -237,7 +237,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -261,7 +261,7 @@ ADDSD3 %xmm7, %xmm14 movsd 3 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 - + ADDSD4 %xmm6, %xmm15 PREFETCH ((PREFETCHSIZE) >> 1 + 0) * SIZE(BO) movaps %xmm4, %xmm6 @@ -514,7 +514,7 @@ addq $4 * SIZE, CO1 decq I jg .L10 - ALIGN_4 + ALIGN_4 .L20: testq $1, M @@ -530,7 +530,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO -#endif +#endif movsd 0 * SIZE(AO), %xmm0 @@ -553,7 +553,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -729,7 +729,7 @@ addq $1, KK #endif ALIGN_4 - + .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addq $1, KK @@ -739,7 +739,7 @@ decq J # j -- jg .L01 ALIGN_4 - + .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp diff --git a/kernel/x86_64/zgemm_kernel_2x2_barcelona.S b/kernel/x86_64/zgemm_kernel_2x2_barcelona.S index 31fad2b8c..70e8f6039 100644 --- a/kernel/x86_64/zgemm_kernel_2x2_barcelona.S +++ b/kernel/x86_64/zgemm_kernel_2x2_barcelona.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -56,7 +56,7 @@ #define CO2 %rbx #define BB %r12 #define J %r15 - + #ifndef WINDOWS_ABI #define STACKSIZE 96 @@ -89,7 +89,7 @@ #define movlpd movsd #define movapd movups #define movupd movups - + #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define ADD1 addpd #define ADD2 addpd @@ -420,9 +420,9 @@ #endif PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp - + movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) @@ -475,14 +475,14 @@ movlpd %xmm12, KK #ifndef LEFT negq KK -#endif +#endif #endif subq $-16 * SIZE, A subq $-16 * SIZE, B salq $ZBASE_SHIFT, LDC - + movq N, J sarq $1, J jle .L100 @@ -492,7 +492,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc @@ -519,7 +519,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 movddup -16 * SIZE(BO), %xmm1 @@ -546,7 +546,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -808,7 +808,7 @@ addq $4 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L10 - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -824,7 +824,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -841,7 +841,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1013,7 +1013,7 @@ addq $1, KK #endif ALIGN_4 - + .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK @@ -1033,7 +1033,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq C, CO1 # coffset1 = c movq A, AO # aoffset = a @@ -1054,7 +1054,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO -#endif +#endif movddup -16 * SIZE(BO), %xmm1 movddup -15 * SIZE(BO), %xmm5 @@ -1073,7 +1073,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1244,7 +1244,7 @@ addq $4 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L110 - ALIGN_4 + ALIGN_4 .L130: testq $1, M @@ -1260,7 +1260,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1276,7 +1276,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1393,7 +1393,7 @@ movlpd %xmm8, (CO1) movhpd %xmm8, 1 * SIZE(CO1) ALIGN_4 - + .L999: movq (%rsp), %rbx movq 8(%rsp), %rbp diff --git a/kernel/x86_64/zgemm_kernel_2x2_core2.S b/kernel/x86_64/zgemm_kernel_2x2_core2.S index 799c15103..b74e2fe91 100644 --- a/kernel/x86_64/zgemm_kernel_2x2_core2.S +++ b/kernel/x86_64/zgemm_kernel_2x2_core2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -49,7 +49,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -109,7 +109,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) @@ -175,7 +175,7 @@ movsd %xmm12, KK #ifndef LEFT negq KK -#endif +#endif #endif salq $ZBASE_SHIFT, LDC @@ -190,11 +190,11 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq 16 * SIZE + BUFFER, BO - + movapd -16 * SIZE(B), %xmm0 movapd -8 * SIZE(B), %xmm4 @@ -202,7 +202,7 @@ sarq $2, %rax jle .L03 ALIGN_3 - + .L02: prefetcht0 (PREFETCH_R + 0) * SIZE(B) prefetcht0 (PREFETCH_R + 8) * SIZE(B) @@ -292,7 +292,7 @@ decq %rax jne .L04 ALIGN_3 - + .L05: leaq (PREFETCH_R + 0) * SIZE(B), BB @@ -317,7 +317,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 @@ -351,7 +351,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -366,7 +366,7 @@ jle .L15 ALIGN_4 -.L12: +.L12: PADDING; ADD1 %xmm2, %xmm10 movaps -15 * SIZE(BO), %xmm2 @@ -619,7 +619,7 @@ addsubpd %xmm11, %xmm10 addsubpd %xmm13, %xmm12 addsubpd %xmm15, %xmm14 - + #if! defined(TRMMKERNEL) && !defined(BETAZERO) addpd %xmm0, %xmm8 addpd %xmm1, %xmm10 @@ -654,7 +654,7 @@ addq $4 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L10 - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -672,7 +672,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -684,7 +684,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -844,7 +844,7 @@ addsubpd %xmm9, %xmm8 addsubpd %xmm11, %xmm10 - + #if! defined(TRMMKERNEL) && !defined(BETAZERO) addpd %xmm0, %xmm8 addpd %xmm1, %xmm10 @@ -868,7 +868,7 @@ addq $1, KK #endif ALIGN_4 - + .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK @@ -886,7 +886,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO @@ -895,7 +895,7 @@ sarq $2, %rax jle .L103 ALIGN_4 - + .L102: movddup -16 * SIZE(B), %xmm8 movddup -15 * SIZE(B), %xmm9 @@ -940,7 +940,7 @@ decq %rax jne .L104 ALIGN_4 - + .L105: movq C, CO1 # coffset1 = c movq A, AO # aoffset = a @@ -962,7 +962,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -975,7 +975,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1145,7 +1145,7 @@ addsubpd %xmm9, %xmm8 addsubpd %xmm13, %xmm12 - + #if! defined(TRMMKERNEL) && !defined(BETAZERO) addpd %xmm0, %xmm8 addpd %xmm2, %xmm12 @@ -1172,7 +1172,7 @@ addq $4 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L110 - ALIGN_4 + ALIGN_4 .L130: testq $1, M @@ -1191,7 +1191,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -1203,7 +1203,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1321,10 +1321,10 @@ movsd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) ALIGN_4 - + .L999: movq %r15, %rsp - + movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 diff --git a/kernel/x86_64/zgemm_kernel_2x2_penryn.S b/kernel/x86_64/zgemm_kernel_2x2_penryn.S index 751110fd1..24e1afdea 100644 --- a/kernel/x86_64/zgemm_kernel_2x2_penryn.S +++ b/kernel/x86_64/zgemm_kernel_2x2_penryn.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx @@ -51,7 +51,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -137,7 +137,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -183,7 +183,7 @@ movlps %xmm0, ALPHA_R movlps %xmm1, ALPHA_I - + subq $-16 * SIZE, A subq $-17 * SIZE, B @@ -197,7 +197,7 @@ movq %r11, OFFSET #ifndef LEFT negq %r11 -#endif +#endif movq %r11, KK #endif @@ -240,7 +240,7 @@ salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 xorpd %xmm3, %xmm3 @@ -270,7 +270,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -285,7 +285,7 @@ jle .L15 ALIGN_3 -.L12: +.L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm3, %xmm12 movaps -15 * SIZE(BO), %xmm3 @@ -580,7 +580,7 @@ decq I # i -- BRANCH jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $1, M @@ -600,7 +600,7 @@ salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 movaps -17 * SIZE(BO), %xmm2 @@ -619,7 +619,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -634,7 +634,7 @@ jle .L25 ALIGN_4 -.L22: +.L22: ADD1 %xmm3, %xmm12 movaps -15 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 @@ -809,7 +809,7 @@ addq $2 * SIZE, CO1 # coffset += 4 addq $2 * SIZE, CO2 # coffset += 4 - ALIGN_4 + ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) @@ -837,7 +837,7 @@ movq OFFSET, %rax movq %rax, KK #endif - + movq M, I sarq $1, I # i = (m >> 2) NOBRANCH @@ -857,7 +857,7 @@ salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 @@ -874,7 +874,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -889,7 +889,7 @@ jle .L55 ALIGN_4 -.L52: +.L52: movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 @@ -1075,7 +1075,7 @@ decq I BRANCH jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $1, M @@ -1095,7 +1095,7 @@ salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 xorps %xmm8, %xmm8 @@ -1109,7 +1109,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1124,7 +1124,7 @@ jle .L65 ALIGN_4 -.L62: +.L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x4e, %xmm2, %xmm7 @@ -1257,7 +1257,7 @@ addq $2 * SIZE, CO1 addq $2 * SIZE, CO2 - ALIGN_4 + ALIGN_4 .L79: #if defined(TRMMKERNEL) && !defined(LEFT) diff --git a/kernel/x86_64/zgemm_kernel_2x2_sse2.S b/kernel/x86_64/zgemm_kernel_2x2_sse2.S index 4b83eeebd..ae7bb8faf 100644 --- a/kernel/x86_64/zgemm_kernel_2x2_sse2.S +++ b/kernel/x86_64/zgemm_kernel_2x2_sse2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -334,9 +334,9 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp - + movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) @@ -385,7 +385,7 @@ andq $-4096, %rsp # align stack STACK_TOUCHING - + movq OLD_M, M movq OLD_N, N @@ -395,7 +395,7 @@ movlpd %xmm0, 0 + ALPHA_R movlpd %xmm0, 8 + ALPHA_R - + movlpd %xmm1, 8 + ALPHA_I xorpd %xmm7, %xmm1 movlpd %xmm1, 0 + ALPHA_I @@ -408,13 +408,13 @@ movlpd %xmm12, KK #ifndef LEFT negq KK -#endif +#endif #endif subq $-16 * SIZE, A salq $ZBASE_SHIFT, LDC - + movq N, J sarq $1, J # j = (n >> 2) jle .L100 @@ -427,7 +427,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif leaq 16 * SIZE + BUFFER, BO @@ -435,7 +435,7 @@ sarq $2, %rax jle .L03 ALIGN_4 - + .L02: PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) @@ -539,7 +539,7 @@ decq %rax jne .L04 ALIGN_4 - + .L05: movq A, AO # aoffset = a @@ -562,7 +562,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm1 @@ -590,7 +590,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -928,7 +928,7 @@ pshufd $0x4e, %xmm10, %xmm11 pshufd $0x4e, %xmm12, %xmm13 pshufd $0x4e, %xmm14, %xmm15 - + mulpd %xmm6, %xmm8 mulpd %xmm7, %xmm9 mulpd %xmm6, %xmm10 @@ -978,7 +978,7 @@ addq $4 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L10 - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -995,7 +995,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1011,7 +1011,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1023,7 +1023,7 @@ #endif sarq $3, %rax je .L44 - ALIGN_4 + ALIGN_4 .L41: mulpd %xmm0, %xmm1 @@ -1129,7 +1129,7 @@ addq $64 * SIZE, BO decq %rax jne .L41 - ALIGN_4 + ALIGN_4 .L44: #ifndef TRMMKERNEL @@ -1192,7 +1192,7 @@ addq $ 8 * SIZE, AO addq $32 * SIZE, BO - ALIGN_4 + ALIGN_4 .L45: #ifndef TRMMKERNEL @@ -1206,7 +1206,7 @@ andq $3, %rax # if (k & 1) BRANCH jle .L47 - ALIGN_4 + ALIGN_4 .L46: mulpd %xmm0, %xmm1 @@ -1260,7 +1260,7 @@ pshufd $0x4e, %xmm8, %xmm9 pshufd $0x4e, %xmm10, %xmm11 - + mulpd %xmm6, %xmm8 mulpd %xmm7, %xmm9 mulpd %xmm6, %xmm10 @@ -1292,7 +1292,7 @@ addq $1, KK #endif ALIGN_4 - + .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addl $2, KK @@ -1310,7 +1310,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO @@ -1319,7 +1319,7 @@ sarq $2, %rax jle .L103 ALIGN_4 - + .L102: movlpd 0 * SIZE(B), %xmm8 movlpd 1 * SIZE(B), %xmm9 @@ -1374,7 +1374,7 @@ decq %rax jne .L104 ALIGN_4 - + .L105: movq C, CO1 # coffset1 = c movq A, AO # aoffset = a @@ -1396,7 +1396,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1413,7 +1413,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1548,7 +1548,7 @@ pshufd $0x4e, %xmm8, %xmm9 pshufd $0x4e, %xmm12, %xmm13 - + mulpd %xmm6, %xmm8 mulpd %xmm7, %xmm9 mulpd %xmm6, %xmm12 @@ -1583,7 +1583,7 @@ addq $4 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L110 - ALIGN_4 + ALIGN_4 .L130: testq $1, M @@ -1600,7 +1600,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif movapd -16 * SIZE(AO), %xmm0 movapd -16 * SIZE(BO), %xmm1 @@ -1617,7 +1617,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1783,7 +1783,7 @@ #endif pshufd $0x4e, %xmm8, %xmm9 - + mulpd %xmm6, %xmm8 mulpd %xmm7, %xmm9 @@ -1796,11 +1796,11 @@ movlpd %xmm8, 0 * SIZE(CO1) movhpd %xmm8, 1 * SIZE(CO1) ALIGN_4 - + .L999: movq %rbx, %rsp EMMS - + movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 diff --git a/kernel/x86_64/zgemm_kernel_2x2_sse3.S b/kernel/x86_64/zgemm_kernel_2x2_sse3.S index afb092439..b78f1a902 100644 --- a/kernel/x86_64/zgemm_kernel_2x2_sse3.S +++ b/kernel/x86_64/zgemm_kernel_2x2_sse3.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M %rdi #define N %rsi #define K %rdx @@ -55,7 +55,7 @@ #define CO1 %r15 #define CO2 %rbx #define BB %rbp - + #ifndef WINDOWS_ABI #define STACKSIZE 128 @@ -351,9 +351,9 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp - + movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) @@ -403,7 +403,7 @@ movsd %xmm4, KK #ifndef LEFT negq KK -#endif +#endif #endif salq $ZBASE_SHIFT, LDC @@ -416,7 +416,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc @@ -442,7 +442,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 4), BO -#endif +#endif prefetcht0 0 * SIZE(BB) subq $-8 * SIZE, BB @@ -473,7 +473,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -487,8 +487,8 @@ andq $-8, %rax salq $4, %rax je .L12 - -.L1X: + +.L1X: KERNEL1 (16 * 0) KERNEL2 (16 * 0) KERNEL3 (16 * 0) @@ -757,7 +757,7 @@ addsubpd %xmm3, %xmm2 addsubpd %xmm5, %xmm4 addsubpd %xmm7, %xmm6 - + #if! defined(TRMMKERNEL) && !defined(BETAZERO) addpd %xmm8, %xmm0 addpd %xmm9, %xmm2 @@ -792,7 +792,7 @@ addq $4 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L10 - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -809,7 +809,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 4), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -825,7 +825,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1022,7 +1022,7 @@ addsubpd %xmm1, %xmm0 addsubpd %xmm3, %xmm2 - + #if! defined(TRMMKERNEL) && !defined(BETAZERO) addpd %xmm8, %xmm0 addpd %xmm9, %xmm2 @@ -1046,7 +1046,7 @@ addq $1, KK #endif ALIGN_4 - + .L99: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK @@ -1065,7 +1065,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif movq C, CO1 # coffset1 = c movq A, AO # aoffset = a @@ -1086,7 +1086,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 4), AO leaq (B, %rax, 2), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -1104,7 +1104,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1311,7 +1311,7 @@ addsubpd %xmm1, %xmm0 addsubpd %xmm5, %xmm4 - + #if! defined(TRMMKERNEL) && !defined(BETAZERO) addpd %xmm8, %xmm0 addpd %xmm10, %xmm4 @@ -1338,7 +1338,7 @@ addq $4 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L110 - ALIGN_4 + ALIGN_4 .L130: testq $1, M @@ -1356,7 +1356,7 @@ leaq (, %rax, SIZE), %rax leaq (AO, %rax, 2), AO leaq (B, %rax, 2), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -1372,7 +1372,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1509,7 +1509,7 @@ movsd %xmm0, 0 * SIZE(CO1) movhpd %xmm0, 1 * SIZE(CO1) ALIGN_4 - + .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp diff --git a/kernel/x86_64/zgemm_kernel_2x4_nehalem.S b/kernel/x86_64/zgemm_kernel_2x4_nehalem.S index 6a16b7e13..ce6012388 100644 --- a/kernel/x86_64/zgemm_kernel_2x4_nehalem.S +++ b/kernel/x86_64/zgemm_kernel_2x4_nehalem.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx @@ -51,7 +51,7 @@ #define B %r8 #define C %r9 #define LDC %rbp - + #define I %r11 #define AO %rdi #define BO %rsi @@ -114,7 +114,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -163,7 +163,7 @@ movlps %xmm0, ALPHA_R movlps %xmm1, ALPHA_I - + subq $-32 * SIZE, A subq $-32 * SIZE, B @@ -177,7 +177,7 @@ movq %r11, OFFSET #ifndef LEFT negq %r11 -#endif +#endif movq %r11, KK #endif @@ -210,7 +210,7 @@ .L11: prefetcht2 -32 * SIZE(BB) subq $-16 * SIZE, BB - + #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) @@ -223,7 +223,7 @@ salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif xorps %xmm1, %xmm1 xorps %xmm2, %xmm2 @@ -251,7 +251,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -266,7 +266,7 @@ jle .L15 ALIGN_3 -.L12: +.L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm12 @@ -547,7 +547,7 @@ decq I # i -- BRANCH jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $1, M @@ -566,7 +566,7 @@ salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -585,7 +585,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -600,7 +600,7 @@ jle .L25 ALIGN_3 -.L22: +.L22: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 @@ -783,8 +783,8 @@ #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif - ALIGN_4 - + ALIGN_4 + .L29: #if defined(TRMMKERNEL) && !defined(LEFT) addq $4, KK @@ -831,7 +831,7 @@ salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO -#endif +#endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 @@ -852,7 +852,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -867,7 +867,7 @@ jle .L35 ALIGN_3 -.L32: +.L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm8 @@ -1048,7 +1048,7 @@ decq I # i -- BRANCH jg .L31 - ALIGN_4 + ALIGN_4 .L40: testq $1, M @@ -1067,7 +1067,7 @@ salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -1086,7 +1086,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1101,7 +1101,7 @@ jle .L45 ALIGN_3 -.L42: +.L42: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 @@ -1230,8 +1230,8 @@ #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif - ALIGN_4 - + ALIGN_4 + .L49: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK @@ -1273,7 +1273,7 @@ salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO -#endif +#endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 @@ -1293,7 +1293,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1308,7 +1308,7 @@ jle .L55 ALIGN_3 -.L52: +.L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm8 @@ -1436,7 +1436,7 @@ decq I # i -- BRANCH jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $1, M @@ -1455,7 +1455,7 @@ salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -1469,7 +1469,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1484,7 +1484,7 @@ jle .L65 ALIGN_3 -.L62: +.L62: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 @@ -1597,8 +1597,8 @@ #endif movsd %xmm8, (CO1) - ALIGN_4 - + ALIGN_4 + .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp diff --git a/kernel/x86_64/zgemm_kernel_4x2_barcelona.S b/kernel/x86_64/zgemm_kernel_4x2_barcelona.S index c59a50d05..06d0bbe14 100644 --- a/kernel/x86_64/zgemm_kernel_4x2_barcelona.S +++ b/kernel/x86_64/zgemm_kernel_4x2_barcelona.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -55,7 +55,7 @@ #define CO1 %r15 #define CO2 %rbp #define BB %r12 - + #ifndef WINDOWS_ABI #define STACKSIZE 64 @@ -301,7 +301,7 @@ movaps 100 * SIZE(BO, %rax, 8), %xmm3 ;\ movaps %xmm0, %xmm2 ;\ addq $16 * SIZE, %rax - + #define KERNEL_SUB1(xx) \ mulps %xmm1, %xmm0 ;\ mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ @@ -409,9 +409,9 @@ #endif PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp - + movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) @@ -469,7 +469,7 @@ shufps $0, %xmm0, %xmm0 movaps %xmm0, 0 + ALPHA_R - + movss %xmm1, 4 + ALPHA_I movss %xmm1, 12 + ALPHA_I xorps %xmm7, %xmm1 @@ -496,7 +496,7 @@ movsd %xmm12, KK #ifndef LEFT negq KK -#endif +#endif #endif salq $ZBASE_SHIFT, LDC @@ -509,19 +509,19 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO movaps POSINV, %xmm15 - + movq K, %rax sarq $2, %rax jle .L03 addq %rax, %rax ALIGN_4 - + .L02: prefetch (RPREFETCHSIZE + 0) * SIZE(B) @@ -543,7 +543,7 @@ pshufd $0xff, %xmm7, %xmm7 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ - defined(TN) || defined(TT) || defined(TR) || defined(TC) + defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 xorps %xmm15, %xmm5 @@ -587,7 +587,7 @@ pshufd $0xff, %xmm3, %xmm3 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ - defined(TN) || defined(TT) || defined(TR) || defined(TC) + defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 #else @@ -605,7 +605,7 @@ decq %rax jne .L04 ALIGN_4 - + .L10: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc @@ -630,7 +630,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO -#endif +#endif prefetch -20 * SIZE(BB) prefetch 28 * SIZE(BB) @@ -660,7 +660,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -921,7 +921,7 @@ addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $2, M @@ -938,7 +938,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 movaps -16 * SIZE(AO), %xmm2 @@ -960,7 +960,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1182,8 +1182,8 @@ addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 - + ALIGN_4 + .L30: testq $1, M je .L39 @@ -1199,7 +1199,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 movaps -24 * SIZE(AO), %xmm2 @@ -1219,7 +1219,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1440,7 +1440,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif - ALIGN_4 + ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) @@ -1461,17 +1461,17 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO movaps POSINV, %xmm15 - + movq K, %rax sarq $2, %rax jle .L43 ALIGN_4 - + .L42: prefetch (RPREFETCHSIZE + 0) * SIZE(B) @@ -1493,7 +1493,7 @@ pshufd $0xff, %xmm7, %xmm7 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ - defined(TN) || defined(TT) || defined(TR) || defined(TC) + defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm15, %xmm1 xorps %xmm15, %xmm3 xorps %xmm15, %xmm5 @@ -1535,7 +1535,7 @@ pshufd $0x55, %xmm3, %xmm1 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ - defined(TN) || defined(TT) || defined(TR) || defined(TC) + defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm15, %xmm1 #else xorps %xmm15, %xmm0 @@ -1549,7 +1549,7 @@ decq %rax jne .L44 ALIGN_4 - + .L50: movq C, CO1 # coffset1 = c movq A, AO # aoffset = a @@ -1572,7 +1572,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1599,7 +1599,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1825,7 +1825,7 @@ addq $8 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $2, M @@ -1842,7 +1842,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1861,7 +1861,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2014,8 +2014,8 @@ #endif addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - + ALIGN_4 + .L70: testq $1, M je .L999 @@ -2031,7 +2031,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -2050,7 +2050,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2194,7 +2194,7 @@ addps %xmm0, %xmm8 #endif movsd %xmm8, 0 * SIZE(CO1) - ALIGN_4 + ALIGN_4 .L999: movq %rbx, %rsp diff --git a/kernel/x86_64/zgemm_kernel_4x2_core2.S b/kernel/x86_64/zgemm_kernel_4x2_core2.S index 1b5d9a03f..bb4584a6b 100644 --- a/kernel/x86_64/zgemm_kernel_4x2_core2.S +++ b/kernel/x86_64/zgemm_kernel_4x2_core2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -49,7 +49,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -100,9 +100,9 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp - + movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) @@ -156,7 +156,7 @@ shufps $0, %xmm0, %xmm0 movaps %xmm0, 0 + ALPHA_R - + movss %xmm1, 4 + ALPHA_I movss %xmm1, 12 + ALPHA_I xorps %xmm7, %xmm1 @@ -171,7 +171,7 @@ movsd %xmm12, KK #ifndef LEFT negq KK -#endif +#endif #endif movq OLD_M, M @@ -187,18 +187,18 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq 32 * SIZE + BUFFER, BO - + movaps -32 * SIZE(B), %xmm3 movq K, %rax sarq $2, %rax jle .L03 ALIGN_4 - + .L02: prefetcht0 (PREFETCH_R + 0) * SIZE(B) movaps -28 * SIZE(B), %xmm7 @@ -279,7 +279,7 @@ subq $1, %rax jne .L04 ALIGN_4 - + .L10: leaq (PREFETCH_R + 0) * SIZE(B), BB @@ -304,7 +304,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 movaps -28 * SIZE(AO), %xmm1 @@ -337,7 +337,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -351,7 +351,7 @@ jle .L15 ALIGN_4 -.L12: +.L12: addps %xmm2, %xmm10 movaps -32 * SIZE(BO), %xmm2 addps %xmm3, %xmm14 @@ -417,7 +417,7 @@ addps %xmm2, %xmm10 movaps 0 * SIZE(BO), %xmm2 addps %xmm3, %xmm14 - PADDING; + PADDING; movaps %xmm6, %xmm3 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) mulps %xmm0, %xmm6 @@ -647,7 +647,7 @@ addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $2, M @@ -664,7 +664,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -676,7 +676,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -867,8 +867,8 @@ addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 - + ALIGN_4 + .L30: testq $1, M je .L39 @@ -884,7 +884,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -896,7 +896,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1080,7 +1080,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif - ALIGN_4 + ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) @@ -1101,16 +1101,16 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO - + movq K, %rax sarq $2, %rax jle .L43 ALIGN_4 - + .L42: movss -32 * SIZE(B), %xmm8 movss -31 * SIZE(B), %xmm9 @@ -1168,7 +1168,7 @@ subq $1, %rax jne .L44 ALIGN_4 - + .L50: movq C, CO1 # coffset1 = c movq A, AO # aoffset = a @@ -1190,7 +1190,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO -#endif +#endif pxor %xmm8, %xmm8 prefetcht0 3 * SIZE(CO1) @@ -1203,7 +1203,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1405,7 +1405,7 @@ addq $8 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $2, M @@ -1422,7 +1422,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -1434,7 +1434,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1566,8 +1566,8 @@ addq $2, KK #endif addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - + ALIGN_4 + .L70: testq $1, M je .L999 @@ -1583,7 +1583,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -1595,7 +1595,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1711,7 +1711,7 @@ #endif movsd %xmm8, 0 * SIZE(CO1) - ALIGN_4 + ALIGN_4 .L999: movq %r15, %rsp diff --git a/kernel/x86_64/zgemm_kernel_4x2_penryn.S b/kernel/x86_64/zgemm_kernel_4x2_penryn.S index 241148db8..9aa852ae6 100644 --- a/kernel/x86_64/zgemm_kernel_4x2_penryn.S +++ b/kernel/x86_64/zgemm_kernel_4x2_penryn.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx @@ -51,7 +51,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -114,7 +114,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -164,7 +164,7 @@ movlps %xmm0, ALPHA_R movlps %xmm1, ALPHA_I - + subq $-32 * SIZE, A subq $-32 * SIZE, B @@ -178,7 +178,7 @@ movq %r11, OFFSET #ifndef LEFT negq %r11 -#endif +#endif movq %r11, KK #endif @@ -221,7 +221,7 @@ salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 pxor %xmm3, %xmm3 @@ -248,13 +248,13 @@ subq $-24 * SIZE, BB leaq (PREFETCHSIZE + 0) * SIZE(AO), PREA - + #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -269,7 +269,7 @@ jle .L15 ALIGN_3 -.L12: +.L12: PREFETCH -32 * SIZE(PREA) ADD1 %xmm6, %xmm10 ADD1 %xmm3, %xmm14 @@ -687,7 +687,7 @@ decq I # i -- BRANCH jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $2, M @@ -706,7 +706,7 @@ salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 pxor %xmm4, %xmm4 @@ -722,7 +722,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -737,7 +737,7 @@ jle .L25 ALIGN_3 -.L22: +.L22: ADD1 %xmm6, %xmm10 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0xb1, %xmm2, %xmm7 @@ -917,7 +917,7 @@ addq $4 * SIZE, CO1 addq $4 * SIZE, CO2 - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -936,7 +936,7 @@ salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO -#endif +#endif movsd -32 * SIZE(AO), %xmm0 pxor %xmm4, %xmm4 @@ -952,7 +952,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -967,7 +967,7 @@ jle .L35 ALIGN_3 -.L32: +.L32: ADD1 %xmm6, %xmm10 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0xb1, %xmm2, %xmm7 @@ -1140,8 +1140,8 @@ #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif - ALIGN_4 - + ALIGN_4 + .L39: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK @@ -1187,7 +1187,7 @@ salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 1), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 pxor %xmm3, %xmm3 @@ -1208,7 +1208,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1223,7 +1223,7 @@ jle .L45 ALIGN_3 -.L42: +.L42: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm3, %xmm8 @@ -1422,7 +1422,7 @@ decq I # i -- BRANCH jg .L41 - ALIGN_4 + ALIGN_4 .L50: testq $2, M @@ -1441,7 +1441,7 @@ salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 1), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 pxor %xmm3, %xmm3 @@ -1455,7 +1455,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1470,7 +1470,7 @@ jle .L55 ALIGN_3 -.L52: +.L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm3, %xmm8 @@ -1603,7 +1603,7 @@ #endif addq $4 * SIZE, CO1 - ALIGN_4 + ALIGN_4 .L60: testq $1, M @@ -1622,7 +1622,7 @@ salq $ZBASE_SHIFT, %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 1), BO -#endif +#endif movsd -32 * SIZE(AO), %xmm0 pxor %xmm3, %xmm3 @@ -1636,7 +1636,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1651,7 +1651,7 @@ jle .L65 ALIGN_3 -.L62: +.L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) addps %xmm3, %xmm8 @@ -1763,8 +1763,8 @@ addps %xmm0, %xmm8 #endif movsd %xmm8, 0 * SIZE(CO1) - ALIGN_4 - + ALIGN_4 + .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp diff --git a/kernel/x86_64/zgemm_kernel_4x2_sse.S b/kernel/x86_64/zgemm_kernel_4x2_sse.S index 04dbf1ad1..7d606aa6f 100644 --- a/kernel/x86_64/zgemm_kernel_4x2_sse.S +++ b/kernel/x86_64/zgemm_kernel_4x2_sse.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi @@ -56,7 +56,7 @@ #define CO1 %r15 #define CO2 %rbp #define BB %r12 - + #ifndef WINDOWS_ABI #define STACKSIZE 64 @@ -92,7 +92,7 @@ #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 5 + 8) #endif - + #if defined(PENTIUM4) || defined(GENERIC) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 @@ -274,7 +274,7 @@ addps %xmm5, %xmm14 ;\ movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ addps %xmm6, %xmm15 ;\ - movaps -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 + movaps -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 #define KERNEL5(xx) \ mulps %xmm0, %xmm1 ;\ @@ -336,9 +336,9 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp - + movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) @@ -396,7 +396,7 @@ shufps $0, %xmm0, %xmm0 movaps %xmm0, 0 + ALPHA_R - + movss %xmm1, 4 + ALPHA_I movss %xmm1, 12 + ALPHA_I xorps %xmm7, %xmm1 @@ -423,7 +423,7 @@ movsd %xmm12, KK #ifndef LEFT negq KK -#endif +#endif #endif salq $ZBASE_SHIFT, LDC @@ -436,19 +436,19 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO movaps POSINV, %xmm7 - + movq K, %rax sarq $2, %rax jle .L03 addq %rax, %rax ALIGN_4 - + .L02: PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) @@ -475,7 +475,7 @@ PREFETCHW (WPREFETCHSIZE + 16) * SIZE(BO) #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ - defined(TN) || defined(TT) || defined(TR) || defined(TC) + defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm9 xorps %xmm7, %xmm11 xorps %xmm7, %xmm13 @@ -521,7 +521,7 @@ shufps $0, %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ - defined(TN) || defined(TT) || defined(TR) || defined(TC) + defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm9 xorps %xmm7, %xmm11 #else @@ -539,7 +539,7 @@ decq %rax jne .L04 ALIGN_4 - + .L10: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc @@ -564,7 +564,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 8), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 movaps -32 * SIZE(BO), %xmm1 @@ -594,7 +594,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -971,7 +971,7 @@ addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $2, M @@ -988,7 +988,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 8), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 movaps -16 * SIZE(AO), %xmm2 @@ -1010,7 +1010,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1232,8 +1232,8 @@ addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 - + ALIGN_4 + .L30: testq $1, M je .L39 @@ -1249,7 +1249,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 8), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 movaps -24 * SIZE(AO), %xmm2 @@ -1269,7 +1269,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1496,7 +1496,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif - ALIGN_4 + ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) @@ -1517,17 +1517,17 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO movaps POSINV, %xmm7 - + movq K, %rax sarq $2, %rax jle .L43 ALIGN_4 - + .L42: movss 0 * SIZE(B), %xmm8 movss 1 * SIZE(B), %xmm9 @@ -1548,7 +1548,7 @@ shufps $0, %xmm15, %xmm15 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ - defined(TN) || defined(TT) || defined(TR) || defined(TC) + defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm9 xorps %xmm7, %xmm11 xorps %xmm7, %xmm13 @@ -1599,7 +1599,7 @@ shufps $0, %xmm9, %xmm9 #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ - defined(TN) || defined(TT) || defined(TR) || defined(TC) + defined(TN) || defined(TT) || defined(TR) || defined(TC) xorps %xmm7, %xmm9 #else xorps %xmm7, %xmm8 @@ -1613,7 +1613,7 @@ decq %rax jne .L44 ALIGN_4 - + .L50: movq C, CO1 # coffset1 = c movq A, AO # aoffset = a @@ -1636,7 +1636,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1663,7 +1663,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1889,7 +1889,7 @@ addq $8 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $2, M @@ -1906,7 +1906,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1925,7 +1925,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2078,8 +2078,8 @@ #endif addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - + ALIGN_4 + .L70: testq $1, M je .L999 @@ -2095,7 +2095,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps -32 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -2114,7 +2114,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2261,7 +2261,7 @@ addps %xmm0, %xmm8 #endif movlps %xmm8, 0 * SIZE(CO1) - ALIGN_4 + ALIGN_4 .L999: movq %rbx, %rsp diff --git a/kernel/x86_64/zgemm_kernel_4x2_sse3.S b/kernel/x86_64/zgemm_kernel_4x2_sse3.S index ecc3a6f05..4e5504c42 100644 --- a/kernel/x86_64/zgemm_kernel_4x2_sse3.S +++ b/kernel/x86_64/zgemm_kernel_4x2_sse3.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M %rdi #define N %rsi #define K %rdx @@ -54,7 +54,7 @@ #define CO1 %r14 #define CO2 %r15 #define BB %rbp - + #ifndef WINDOWS_ABI #define STACKSIZE 64 @@ -79,7 +79,7 @@ #define KK 48(%rsp) #define KKK 56(%rsp) #define BUFFER 128(%rsp) - + #define PREFETCH prefetcht0 #define PREFETCHSIZE 320 @@ -334,9 +334,9 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp - + movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) @@ -391,7 +391,7 @@ shufps $0, %xmm0, %xmm0 movaps %xmm0, 0 + ALPHA_R - + movss %xmm1, 4 + ALPHA_I movss %xmm1, 12 + ALPHA_I xorps %xmm15, %xmm1 @@ -403,7 +403,7 @@ movsd %xmm4, KK #ifndef LEFT negq KK -#endif +#endif #endif salq $ZBASE_SHIFT, LDC @@ -416,16 +416,16 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO - + movq K, %rax sarq $2, %rax jle .L03 ALIGN_4 - + .L02: movddup 0 * SIZE(B), %xmm0 movddup 2 * SIZE(B), %xmm1 @@ -473,7 +473,7 @@ decq %rax jne .L04 ALIGN_4 - + .L10: movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc @@ -501,7 +501,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -529,7 +529,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -543,7 +543,7 @@ andq $-8, %rax salq $4, %rax je .L15 - + .L1X: KERNEL1 (32 * 0) KERNEL2 (32 * 0) @@ -882,7 +882,7 @@ addq $8 * SIZE, CO2 # coffset += 4 decq I # i -- jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $2, M @@ -899,7 +899,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -918,7 +918,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1150,8 +1150,8 @@ addq $4 * SIZE, CO1 # coffset += 4 addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 - + ALIGN_4 + .L30: testq $1, M je .L39 @@ -1167,7 +1167,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 4), BO -#endif +#endif movddup 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -1185,7 +1185,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1317,12 +1317,12 @@ movhlps %xmm6, %xmm1 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ - defined(RR) || defined(RC) || defined(CR) || defined(CC) + defined(RR) || defined(RC) || defined(CR) || defined(CC) cmpeqps %xmm7, %xmm7 pslld $31, %xmm7 xorps %xmm7, %xmm1 -#endif - +#endif + #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0xb1, %xmm1, %xmm1 @@ -1369,7 +1369,7 @@ #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif - ALIGN_4 + ALIGN_4 .L39: #if defined(TRMMKERNEL) && !defined(LEFT) @@ -1390,16 +1390,16 @@ #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO - + movq K, %rax sarq $3, %rax jle .L43 ALIGN_4 - + .L42: movddup 0 * SIZE(B), %xmm0 movddup 2 * SIZE(B), %xmm1 @@ -1445,7 +1445,7 @@ decq %rax jne .L44 ALIGN_4 - + .L50: movq C, CO1 # coffset1 = c movq A, AO # aoffset = a @@ -1467,7 +1467,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 4), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -1488,7 +1488,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1727,7 +1727,7 @@ addq $8 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $2, M @@ -1744,7 +1744,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 2), AO leaq (BO, %rax, 2), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -1758,7 +1758,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -1907,8 +1907,8 @@ addq $2, KK #endif addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - + ALIGN_4 + .L70: testq $1, M je .L999 @@ -1924,7 +1924,7 @@ leaq (, %rax, 8), %rax leaq (AO, %rax, 1), AO leaq (BO, %rax, 2), BO -#endif +#endif movddup 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -1938,7 +1938,7 @@ #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax - movq %rax, KKK + movq %rax, KKK #else movq KK, %rax #ifdef LEFT @@ -2033,12 +2033,12 @@ movhlps %xmm0, %xmm1 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ - defined(RR) || defined(RC) || defined(CR) || defined(CC) + defined(RR) || defined(RC) || defined(CR) || defined(CC) cmpeqps %xmm7, %xmm7 pslld $31, %xmm7 xorps %xmm7, %xmm1 -#endif - +#endif + #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) shufps $0xb1, %xmm1, %xmm1 @@ -2068,7 +2068,7 @@ addps %xmm8, %xmm0 #endif movsd %xmm0, 0 * SIZE(CO1) - ALIGN_4 + ALIGN_4 .L999: movq %rbx, %rsp diff --git a/kernel/x86_64/zgemm_kernel_4x4_sandy.S b/kernel/x86_64/zgemm_kernel_4x4_sandy.S index 2cafb1f60..dbde1f0b5 100644 --- a/kernel/x86_64/zgemm_kernel_4x4_sandy.S +++ b/kernel/x86_64/zgemm_kernel_4x4_sandy.S @@ -13,19 +13,19 @@ notice, this list of conditions and the following disclaimer. notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -3. Neither the name of the ISCAS nor the names of its contributors may -be used to endorse or promote products derived from this software +3. Neither the name of the ISCAS nor the names of its contributors may +be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ @@ -59,7 +59,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef WINDOWS_ABI -#define STACKSIZE 128 +#define STACKSIZE 128 #define old_ldc 8+STACKSIZE(%rsp) #define old_offset 16+STACKSIZE(%rsp) @@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OR orq #define JNE jne #define JMP jmp -#define NOP +#define NOP #define XOR xorpd #undef MOVQ #define MOVQ movq @@ -207,7 +207,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ADD1_DX SUB_DX #define ADD1_DY SUB_DY #define ADD2_DY ADDSUB_DY -#else +#else #define ADD1_DX ADD_DX #define ADD1_DY ADD_DY #define ADD2_DY ADDSUB_DY @@ -289,7 +289,7 @@ SALQ $6, k; LEAQ (bb, k, 1), prebb; # Rn=4 SIZE=8 COMPLEX=2 MOVQ ba,ptrba; MOVQ bm,i; -SARQ $2,i; # Rm = 4 +SARQ $2,i; # Rm = 4 JLE .L1_loopE; ALIGN_5; .L1_bodyB:; @@ -333,7 +333,7 @@ MOVQ %rax, kkk; MOVQ kk, %rax; #ifdef LEFT ADDQ $4, %rax; -#else +#else ADDQ $4, %rax; #endif MOVQ %rax, kkk; @@ -764,7 +764,7 @@ SUB_DY yvec11, yvec7, yvec11; SUB_DY yvec10, yvec7, yvec10; SUB_DY yvec9, yvec7, yvec9; SUB_DY yvec8, yvec7, yvec8; -#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) VPERMILP_DY $0x05, yvec15, yvec15; VPERMILP_DY $0x05, yvec14, yvec14; VPERMILP_DY $0x05, yvec13, yvec13; @@ -882,7 +882,7 @@ SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif -#if defined(TRMMKERNEL) && defined(LEFT) +#if defined(TRMMKERNEL) && defined(LEFT) ADDQ $4, kk; #endif ADDQ $8*SIZE,C0; @@ -996,7 +996,7 @@ SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif -#if defined(TRMMKERNEL) && defined(LEFT) +#if defined(TRMMKERNEL) && defined(LEFT) ADDQ $4, kk; #endif ADDQ $8*SIZE, C0; @@ -1032,7 +1032,7 @@ MOVQ %rax, kkk; MOVQ kk, %rax; #ifdef LEFT ADDQ $2, %rax; -#else +#else ADDQ $4, %rax; #endif MOVQ %rax, kkk; @@ -1304,7 +1304,7 @@ SUB_DY yvec15, yvec7, yvec15; SUB_DY yvec14, yvec7, yvec14; SUB_DY yvec13, yvec7, yvec13; SUB_DY yvec12, yvec7, yvec12; -#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) VPERMILP_DY $0x05, yvec15, yvec15; VPERMILP_DY $0x05, yvec14, yvec14; VPERMILP_DY $0x05, yvec13, yvec13; @@ -1374,7 +1374,7 @@ SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 2), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif -#if defined(TRMMKERNEL) && defined(LEFT) +#if defined(TRMMKERNEL) && defined(LEFT) ADDQ $2, kk; #endif ADDQ $4*SIZE, C0; @@ -1437,7 +1437,7 @@ SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 2), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif -#if defined(TRMMKERNEL) && defined(LEFT) +#if defined(TRMMKERNEL) && defined(LEFT) ADDQ $2, kk; #endif ADDQ $4*SIZE, C0; @@ -1468,7 +1468,7 @@ MOVQ %rax, kkk; MOVQ kk, %rax; #ifdef LEFT ADDQ $1, %rax; -#else +#else ADDQ $4, %rax; #endif MOVQ %rax, kkk; @@ -1634,7 +1634,7 @@ ADDSUB_DY yvec14, yvec7, yvec14; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) SUB_DY yvec15, yvec7, yvec15; SUB_DY yvec14, yvec7, yvec14; -#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) VPERMILP_DY $0x05, yvec15, yvec15; VPERMILP_DY $0x05, yvec14, yvec14; ADDSUB_DY yvec15, yvec7, yvec15; @@ -1669,7 +1669,7 @@ ADD_DX xvec0, xvec15, xvec15; ADD_DX xvec1, xvec7, xvec7; ADD_DX xvec2, xvec14, xvec14; ADD_DX xvec3, xvec6, xvec6; -#endif +#endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); STL_DX xvec7, 0*SIZE(C0, ldc, 1); @@ -1685,7 +1685,7 @@ SALQ $ZBASE_SHIFT, %rax; ADDQ %rax, ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif -#if defined(TRMMKERNEL) && defined(LEFT) +#if defined(TRMMKERNEL) && defined(LEFT) ADDQ $1, kk; #endif ADDQ $2*SIZE, C0; @@ -1742,7 +1742,7 @@ MOVQ %rax, kkk; MOVQ kk, %rax; #ifdef LEFT ADDQ $4, %rax; -#else +#else ADDQ $2, %rax; #endif MOVQ %rax, kkk; @@ -1994,7 +1994,7 @@ SUB_DY yvec15, yvec7, yvec15; SUB_DY yvec14, yvec7, yvec14; SUB_DY yvec13, yvec7, yvec13; SUB_DY yvec12, yvec7, yvec12; -#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) VPERMILP_DY $0x05, yvec15, yvec15; VPERMILP_DY $0x05, yvec14, yvec14; VPERMILP_DY $0x05, yvec13, yvec13; @@ -2032,7 +2032,7 @@ EXTRA_DY $1, yvec15, xvec7; EXTRA_DY $1, yvec14, xvec6; EXTRA_DY $1, yvec13, xvec5; EXTRA_DY $1, yvec12, xvec4; -#### Testing Alignment #### +#### Testing Alignment #### MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; @@ -2064,7 +2064,7 @@ SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif -#if defined(TRMMKERNEL) && defined(LEFT) +#if defined(TRMMKERNEL) && defined(LEFT) ADDQ $4, kk; #endif ADDQ $8*SIZE, C0; @@ -2125,7 +2125,7 @@ SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif -#if defined(TRMMKERNEL) && defined(LEFT) +#if defined(TRMMKERNEL) && defined(LEFT) ADDQ $4, kk; #endif ADDQ $8*SIZE, C0; @@ -2159,7 +2159,7 @@ MOVQ %rax, kkk; MOVQ kk, %rax; #ifdef LEFT ADDQ $2, %rax; -#else +#else ADDQ $2, %rax; #endif MOVQ %rax, kkk; @@ -2336,7 +2336,7 @@ ADDSUB_DY yvec13, yvec7, yvec13; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) SUB_DY yvec15, yvec7, yvec15; SUB_DY yvec13, yvec7, yvec13; -#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) VPERMILP_DY $0x05, yvec15, yvec15; VPERMILP_DY $0x05, yvec13, yvec13; ADDSUB_DY yvec15, yvec7, yvec15; @@ -2389,7 +2389,7 @@ SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 2), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif -#if defined(TRMMKERNEL) && defined(LEFT) +#if defined(TRMMKERNEL) && defined(LEFT) ADDQ $2, kk; #endif ADDQ $4*SIZE, C0; @@ -2420,7 +2420,7 @@ MOVQ %rax, kkk; MOVQ kk, %rax; #ifdef LEFT ADDQ $1, %rax; -#else +#else ADDQ $2, %rax; #endif MOVQ %rax, kkk; @@ -2542,7 +2542,7 @@ XOR_DY yvec7, yvec7, yvec7; ADDSUB_DY yvec15, yvec7, yvec15; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) SUB_DY yvec15, yvec7, yvec15; -#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) VPERMILP_DY $0x05, yvec15, yvec15; ADDSUB_DY yvec15, yvec7, yvec15; VPERMILP_DY $0x05, yvec15, yvec15; @@ -2577,7 +2577,7 @@ SALQ $ZBASE_SHIFT, %rax; ADDQ %rax, ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif -#if defined(TRMMKERNEL) && defined(LEFT) +#if defined(TRMMKERNEL) && defined(LEFT) ADDQ $1, kk; #endif ADDQ $2*SIZE, C0; @@ -2627,7 +2627,7 @@ MOVQ %rax, kkk; MOVQ kk, %rax; #ifdef LEFT ADDQ $4, %rax; -#else +#else ADDQ $1, %rax; #endif MOVQ %rax, kkk; @@ -2797,7 +2797,7 @@ ADDSUB_DY yvec14, yvec7, yvec14; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) SUB_DY yvec15, yvec7, yvec15; SUB_DY yvec14, yvec7, yvec14; -#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) VPERMILP_DY $0x05, yvec15, yvec15; VPERMILP_DY $0x05, yvec14, yvec14; ADDSUB_DY yvec15, yvec7, yvec15; @@ -2850,7 +2850,7 @@ SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 4), ptrba; ADDQ %rax, ptrbb; #endif -#if defined(TRMMKERNEL) && defined(LEFT) +#if defined(TRMMKERNEL) && defined(LEFT) ADDQ $4, kk; #endif ADDQ $8*SIZE, C0; @@ -2882,7 +2882,7 @@ MOVQ %rax, kkk; MOVQ kk, %rax; #ifdef LEFT ADDQ $2, %rax; -#else +#else ADDQ $1, %rax; #endif MOVQ %rax, kkk; @@ -2986,7 +2986,7 @@ XOR_DY yvec7, yvec7, yvec7; ADDSUB_DY yvec15, yvec7, yvec15; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) SUB_DY yvec15, yvec7, yvec15; -#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) VPERMILP_DY $0x05, yvec15, yvec15; ADDSUB_DY yvec15, yvec7, yvec15; VPERMILP_DY $0x05, yvec15, yvec15; @@ -3021,7 +3021,7 @@ SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 2), ptrba; ADDQ %rax, ptrbb; #endif -#if defined(TRMMKERNEL) && defined(LEFT) +#if defined(TRMMKERNEL) && defined(LEFT) ADDQ $2, kk; #endif ADDQ $4*SIZE, C0; @@ -3050,7 +3050,7 @@ MOVQ %rax, kkk; MOVQ kk, %rax; #ifdef LEFT ADDQ $1, %rax; -#else +#else ADDQ $1, %rax; #endif MOVQ %rax, kkk; @@ -3194,7 +3194,7 @@ SALQ $ZBASE_SHIFT, %rax; ADDQ %rax, ptrba; ADDQ %rax, ptrbb; #endif -#if defined(TRMMKERNEL) && defined(LEFT) +#if defined(TRMMKERNEL) && defined(LEFT) ADDQ $1, kk; #endif ADDQ $2*SIZE, C0; diff --git a/kernel/x86_64/zgemm_ncopy_1.S b/kernel/x86_64/zgemm_ncopy_1.S index 9f9ae7369..60b51f53e 100644 --- a/kernel/x86_64/zgemm_ncopy_1.S +++ b/kernel/x86_64/zgemm_ncopy_1.S @@ -74,7 +74,7 @@ PROLOGUE PROFCODE - + #ifdef WINDOWS_ABI pushq %r14 pushq %r13 diff --git a/kernel/x86_64/zgemm_ncopy_2.S b/kernel/x86_64/zgemm_ncopy_2.S index 8876b61ff..845198226 100644 --- a/kernel/x86_64/zgemm_ncopy_2.S +++ b/kernel/x86_64/zgemm_ncopy_2.S @@ -92,7 +92,7 @@ PROLOGUE PROFCODE - + #ifdef WINDOWS_ABI pushq %r14 pushq %r13 diff --git a/kernel/x86_64/zgemm_tcopy_1.S b/kernel/x86_64/zgemm_tcopy_1.S index b4348e60e..02c061493 100644 --- a/kernel/x86_64/zgemm_tcopy_1.S +++ b/kernel/x86_64/zgemm_tcopy_1.S @@ -74,7 +74,7 @@ PROLOGUE PROFCODE - + #ifdef WINDOWS_ABI pushq %r14 pushq %r13 @@ -118,7 +118,7 @@ #ifndef DOUBLE movsd 0 * SIZE(AO1), %xmm0 movhps 0 * SIZE(AO1, LDA, 1), %xmm0 - + movaps %xmm0, 0 * SIZE(B) #else prefetcht0 RPREFETCHSIZE * SIZE(AO1) diff --git a/kernel/x86_64/zgemm_tcopy_2.S b/kernel/x86_64/zgemm_tcopy_2.S index f83022d26..121bbc428 100644 --- a/kernel/x86_64/zgemm_tcopy_2.S +++ b/kernel/x86_64/zgemm_tcopy_2.S @@ -85,7 +85,7 @@ PROLOGUE PROFCODE - + #ifdef WINDOWS_ABI pushq %rdi pushq %rsi diff --git a/kernel/x86_64/zgemv_n.S b/kernel/x86_64/zgemv_n.S index 71b76ac29..b903cfb26 100644 --- a/kernel/x86_64/zgemv_n.S +++ b/kernel/x86_64/zgemv_n.S @@ -43,7 +43,7 @@ #ifndef WINDOWS_ABI #define STACKSIZE 128 - + #define OLD_INCX 8 + STACKSIZE(%rsp) #define OLD_Y 16 + STACKSIZE(%rsp) #define OLD_INCY 24 + STACKSIZE(%rsp) @@ -57,7 +57,7 @@ #define XX 88(%rsp) #define LDAX 96(%rsp) #define ALPHAR 104(%rsp) -#define ALPHAI 112(%rsp) +#define ALPHAI 112(%rsp) #define M %rdi #define N %rsi @@ -71,7 +71,7 @@ #else #define STACKSIZE 304 - + #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_LDA 56 + STACKSIZE(%rsp) @@ -194,7 +194,7 @@ movlpd %xmm1, ALPHA_I subq $-16 * SIZE, A - + testq M, M jle .L999 testq N, N @@ -230,7 +230,7 @@ testq $SIZE, A jne .L100 #endif - + #if GEMV_UNROLL >= 4 cmpq $4, N @@ -260,7 +260,7 @@ pcmpeqb %xmm5, %xmm5 psllq $63, %xmm5 - shufps $0xc0, %xmm5, %xmm5 + shufps $0xc0, %xmm5, %xmm5 pshufd $0x4e, %xmm8, %xmm9 pshufd $0x4e, %xmm10, %xmm11 @@ -764,7 +764,7 @@ pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 - shufps $0xc0, %xmm11, %xmm11 + shufps $0xc0, %xmm11, %xmm11 pshufd $0x4e, %xmm12, %xmm13 pshufd $0x4e, %xmm14, %xmm15 @@ -1083,7 +1083,7 @@ pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 - shufps $0xc0, %xmm11, %xmm11 + shufps $0xc0, %xmm11, %xmm11 pshufd $0x4e, %xmm12, %xmm13 @@ -1316,7 +1316,7 @@ pcmpeqb %xmm5, %xmm5 psllq $63, %xmm5 - shufps $0xc0, %xmm5, %xmm5 + shufps $0xc0, %xmm5, %xmm5 pshufd $0x4e, %xmm8, %xmm9 pshufd $0x4e, %xmm10, %xmm11 @@ -1864,7 +1864,7 @@ pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 - shufps $0xc0, %xmm11, %xmm11 + shufps $0xc0, %xmm11, %xmm11 pshufd $0x4e, %xmm12, %xmm13 pshufd $0x4e, %xmm14, %xmm15 @@ -2206,7 +2206,7 @@ pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 - shufps $0xc0, %xmm11, %xmm11 + shufps $0xc0, %xmm11, %xmm11 pshufd $0x4e, %xmm12, %xmm13 diff --git a/kernel/x86_64/zgemv_n_atom.S b/kernel/x86_64/zgemv_n_atom.S index 289c07670..4fa70925e 100644 --- a/kernel/x86_64/zgemv_n_atom.S +++ b/kernel/x86_64/zgemv_n_atom.S @@ -49,12 +49,12 @@ #ifndef WINDOWS_ABI #define STACKSIZE 64 - + #define OLD_INCX 8 + STACKSIZE(%rsp) #define OLD_Y 16 + STACKSIZE(%rsp) #define OLD_INCY 24 + STACKSIZE(%rsp) #define OLD_BUFFER 32 + STACKSIZE(%rsp) - + #define M %rdi #define N %rsi #define A %rcx @@ -67,7 +67,7 @@ #else #define STACKSIZE 256 - + #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_LDA 56 + STACKSIZE(%rsp) @@ -173,7 +173,7 @@ movaps %xmm1, ALPHA_I subq $-16 * SIZE, A - + testq M, M jle .L999 testq N, N diff --git a/kernel/x86_64/zgemv_n_dup.S b/kernel/x86_64/zgemv_n_dup.S index 8a49fc970..42c1963c2 100644 --- a/kernel/x86_64/zgemv_n_dup.S +++ b/kernel/x86_64/zgemv_n_dup.S @@ -43,14 +43,14 @@ #ifndef WINDOWS_ABI #define STACKSIZE 64 - + #define OLD_INCX 8 + STACKSIZE(%rsp) #define OLD_Y 16 + STACKSIZE(%rsp) #define OLD_INCY 24 + STACKSIZE(%rsp) #define OLD_BUFFER 32 + STACKSIZE(%rsp) #define ALPHA_R 48 (%rsp) #define ALPHA_I 56 (%rsp) - + #define M %rdi #define N %rsi #define A %rcx @@ -63,7 +63,7 @@ #else #define STACKSIZE 256 - + #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_LDA 56 + STACKSIZE(%rsp) @@ -149,7 +149,7 @@ movlps %xmm1, ALPHA_I subq $-16 * SIZE, A - + testq M, M jle .L999 testq N, N diff --git a/kernel/x86_64/zgemv_t.S b/kernel/x86_64/zgemv_t.S index 30f76dcbe..c78927953 100644 --- a/kernel/x86_64/zgemv_t.S +++ b/kernel/x86_64/zgemv_t.S @@ -43,12 +43,12 @@ #ifndef WINDOWS_ABI #define STACKSIZE 128 - + #define OLD_INCX 8 + STACKSIZE(%rsp) #define OLD_Y 16 + STACKSIZE(%rsp) #define OLD_INCY 24 + STACKSIZE(%rsp) #define OLD_BUFFER 32 + STACKSIZE(%rsp) - + #define MMM 64(%rsp) #define NN 72(%rsp) #define AA 80(%rsp) @@ -68,7 +68,7 @@ #else #define STACKSIZE 288 - + #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_LDA 56 + STACKSIZE(%rsp) @@ -193,7 +193,7 @@ #endif subq $-16 * SIZE, A - + testq M, M jle .L999 testq N, N @@ -201,7 +201,7 @@ ALIGN_3 movq BUFFER, X1 - + movq Y, Y1 movq M, I @@ -669,7 +669,7 @@ pcmpeqb %xmm13, %xmm13 psllq $63, %xmm13 shufps $0xc0, %xmm13, %xmm13 - + #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) xorpd %xmm13, %xmm0 xorpd %xmm13, %xmm2 @@ -1034,7 +1034,7 @@ pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 shufps $0xc0, %xmm11, %xmm11 - + #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) xorpd %xmm11, %xmm0 xorpd %xmm11, %xmm2 @@ -1264,7 +1264,7 @@ pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 shufps $0xc0, %xmm11, %xmm11 - + #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) xorpd %xmm11, %xmm0 #else @@ -1769,7 +1769,7 @@ pcmpeqb %xmm13, %xmm13 psllq $63, %xmm13 shufps $0xc0, %xmm13, %xmm13 - + #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) xorpd %xmm13, %xmm0 xorpd %xmm13, %xmm2 @@ -2157,7 +2157,7 @@ pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 shufps $0xc0, %xmm11, %xmm11 - + #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) xorpd %xmm11, %xmm0 xorpd %xmm11, %xmm2 @@ -2398,7 +2398,7 @@ pcmpeqb %xmm11, %xmm11 psllq $63, %xmm11 shufps $0xc0, %xmm11, %xmm11 - + #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) xorpd %xmm11, %xmm0 #else diff --git a/kernel/x86_64/zgemv_t_atom.S b/kernel/x86_64/zgemv_t_atom.S index 5d3ecdd69..73a013a24 100644 --- a/kernel/x86_64/zgemv_t_atom.S +++ b/kernel/x86_64/zgemv_t_atom.S @@ -49,12 +49,12 @@ #ifndef WINDOWS_ABI #define STACKSIZE 64 - + #define OLD_INCX 8 + STACKSIZE(%rsp) #define OLD_Y 16 + STACKSIZE(%rsp) #define OLD_INCY 24 + STACKSIZE(%rsp) #define OLD_BUFFER 32 + STACKSIZE(%rsp) - + #define M %rdi #define N %rsi #define A %rcx @@ -67,7 +67,7 @@ #else #define STACKSIZE 256 - + #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_LDA 56 + STACKSIZE(%rsp) @@ -174,7 +174,7 @@ movaps %xmm1, ALPHA_I subq $-16 * SIZE, A - + testq M, M jle .L999 testq N, N @@ -182,7 +182,7 @@ ALIGN_3 movq BUFFER, X1 - + movq Y, Y1 movq M, I diff --git a/kernel/x86_64/zgemv_t_dup.S b/kernel/x86_64/zgemv_t_dup.S index 2db99b6dd..d509f0eb8 100644 --- a/kernel/x86_64/zgemv_t_dup.S +++ b/kernel/x86_64/zgemv_t_dup.S @@ -43,12 +43,12 @@ #ifndef WINDOWS_ABI #define STACKSIZE 64 - + #define OLD_INCX 8 + STACKSIZE(%rsp) #define OLD_Y 16 + STACKSIZE(%rsp) #define OLD_INCY 24 + STACKSIZE(%rsp) #define OLD_BUFFER 32 + STACKSIZE(%rsp) - + #define M %rdi #define N %rsi #define A %rcx @@ -61,7 +61,7 @@ #else #define STACKSIZE 256 - + #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_LDA 56 + STACKSIZE(%rsp) @@ -156,7 +156,7 @@ xorps %xmm5, ALPHA_I subq $-16 * SIZE, A - + testq M, M jle .L999 testq N, N @@ -164,7 +164,7 @@ ALIGN_3 movq BUFFER, X1 - + movq Y, Y1 movq M, I @@ -606,7 +606,7 @@ pcmpeqb %xmm13, %xmm13 psllq $63, %xmm13 shufps $0x40, %xmm13, %xmm13 - + #ifndef XCONJ xorps %xmm13, %xmm1 xorps %xmm13, %xmm3 @@ -935,7 +935,7 @@ pcmpeqb %xmm13, %xmm13 psllq $63, %xmm13 shufps $0x40, %xmm13, %xmm13 - + #ifndef XCONJ xorps %xmm13, %xmm1 xorps %xmm13, %xmm3 @@ -1154,7 +1154,7 @@ pcmpeqb %xmm13, %xmm13 psllq $63, %xmm13 shufps $0x40, %xmm13, %xmm13 - + #ifndef XCONJ xorps %xmm13, %xmm1 #else diff --git a/kernel/x86_64/znrm2.S b/kernel/x86_64/znrm2.S index 950262611..4115eab1d 100644 --- a/kernel/x86_64/znrm2.S +++ b/kernel/x86_64/znrm2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ @@ -49,7 +49,7 @@ PROLOGUE PROFCODE - + fldz testq M, M jle .L999 @@ -68,7 +68,7 @@ sarq $2, I jle .L20 ALIGN_4 - + .L10: #if defined(PREFETCH) PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) @@ -132,7 +132,7 @@ sarq $2, I jle .L60 ALIGN_4 - + .L50: FLD 0 * SIZE(X) fmul %st(0), %st diff --git a/kernel/x86_64/znrm2_sse.S b/kernel/x86_64/znrm2_sse.S index 005536a04..f78b83f7e 100644 --- a/kernel/x86_64/znrm2_sse.S +++ b/kernel/x86_64/znrm2_sse.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M ARG1 /* rdi */ #define X ARG2 /* rsi */ #define INCX ARG3 /* rdx */ @@ -50,7 +50,7 @@ PROLOGUE PROFCODE - + SAVEREGISTERS pxor %xmm0, %xmm0 @@ -70,7 +70,7 @@ testq $SIZE, X je .L05 - + movss (X), %xmm4 cvtss2sd %xmm4, %xmm6 mulsd %xmm6, %xmm6 @@ -85,7 +85,7 @@ movq M, I sarq $3, I jle .L14 - + movsd 0 * SIZE(X), %xmm4 movsd 2 * SIZE(X), %xmm5 movsd 4 * SIZE(X), %xmm6 @@ -252,7 +252,7 @@ sarq $3, I jle .L44 ALIGN_4 - + .L41: movsd (X), %xmm4 addq INCX, X diff --git a/kernel/x86_64/zrot.S b/kernel/x86_64/zrot.S index d645d6f2e..22d031c07 100644 --- a/kernel/x86_64/zrot.S +++ b/kernel/x86_64/zrot.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define N ARG1 #define X ARG2 #define INCX ARG3 @@ -80,7 +80,7 @@ sarq $1, I jle .L15 ALIGN_4 - + .L10: #if defined(PREFETCHW) PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) @@ -225,7 +225,7 @@ sarq $1, I jle .L55 ALIGN_4 - + .L51: FLD 0 * SIZE(X) FLD 0 * SIZE(Y) diff --git a/kernel/x86_64/zrot_sse.S b/kernel/x86_64/zrot_sse.S index da79b4abe..639131152 100644 --- a/kernel/x86_64/zrot_sse.S +++ b/kernel/x86_64/zrot_sse.S @@ -124,7 +124,7 @@ movaps 4 * SIZE(X), %xmm2 movaps 8 * SIZE(X), %xmm8 movaps 12 * SIZE(X), %xmm10 - + decq %rax jle .L12 ALIGN_3 @@ -552,7 +552,7 @@ movaps 4 * SIZE(X), %xmm2 movaps 8 * SIZE(X), %xmm8 movaps 12 * SIZE(X), %xmm10 - + decq %rax jle .L22 ALIGN_3 @@ -1026,7 +1026,7 @@ movhps 10 * SIZE(X), %xmm8 movsd 12 * SIZE(X), %xmm10 movhps 14 * SIZE(X), %xmm10 - + decq %rax jle .L32 ALIGN_3 diff --git a/kernel/x86_64/zrot_sse2.S b/kernel/x86_64/zrot_sse2.S index 368101816..e6288c3d5 100644 --- a/kernel/x86_64/zrot_sse2.S +++ b/kernel/x86_64/zrot_sse2.S @@ -99,7 +99,7 @@ movapd 2 * SIZE(X), %xmm2 movapd 4 * SIZE(X), %xmm8 movapd 6 * SIZE(X), %xmm10 - + decq %rax jle .L12 ALIGN_3 @@ -1169,7 +1169,7 @@ movapd 2 * SIZE(X), %xmm2 movapd 4 * SIZE(X), %xmm8 movapd 6 * SIZE(X), %xmm10 - + decq %rax jle .L42 ALIGN_3 diff --git a/kernel/x86_64/zscal_atom.S b/kernel/x86_64/zscal_atom.S index c01d5c14f..1649b855b 100644 --- a/kernel/x86_64/zscal_atom.S +++ b/kernel/x86_64/zscal_atom.S @@ -65,12 +65,12 @@ #endif SAVEREGISTERS - + salq $ZBASE_SHIFT, INCX testq M, M jle .L999 - + pxor %xmm15, %xmm15 comisd %xmm0, %xmm15 jne .L30 # Alpha_r != ZERO @@ -387,7 +387,7 @@ xorq %rax, %rax RESTOREREGISTERS - + ret EPILOGUE diff --git a/kernel/x86_64/zscal_sse.S b/kernel/x86_64/zscal_sse.S index 393988e73..8505c67bf 100644 --- a/kernel/x86_64/zscal_sse.S +++ b/kernel/x86_64/zscal_sse.S @@ -66,7 +66,7 @@ #endif SAVEREGISTERS - + salq $ZBASE_SHIFT, INCX xor FLAG, FLAG @@ -1040,7 +1040,7 @@ #else - + pshufd $0, %xmm0, %xmm14 pshufd $0, %xmm1, %xmm1 subps %xmm1, %xmm15 @@ -1353,7 +1353,7 @@ xorq %rax, %rax RESTOREREGISTERS - + ret EPILOGUE diff --git a/kernel/x86_64/zscal_sse2.S b/kernel/x86_64/zscal_sse2.S index a553bbd39..223b1e439 100644 --- a/kernel/x86_64/zscal_sse2.S +++ b/kernel/x86_64/zscal_sse2.S @@ -72,13 +72,13 @@ #endif SAVEREGISTERS - + salq $ZBASE_SHIFT, INCX xor FLAG, FLAG testq M, M jle .L999 - + pxor %xmm15, %xmm15 comisd %xmm0, %xmm15 jne .L100 @@ -177,7 +177,7 @@ jle .L22 ALIGN_4 -.L21: +.L21: #ifdef PREFETCHW PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) #endif @@ -280,7 +280,7 @@ #endif pxor %xmm15, %xmm15 subsd %xmm1, %xmm15 - movlhps %xmm1, %xmm15 + movlhps %xmm1, %xmm15 cmpq $2 * SIZE, INCX jne .L120 @@ -804,11 +804,11 @@ jne .L220 #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) - + movddup %xmm0, %xmm14 pxor %xmm15, %xmm15 subsd %xmm1, %xmm15 - movlhps %xmm1, %xmm15 + movlhps %xmm1, %xmm15 shufpd $1, %xmm15, %xmm15 movhps 0 * SIZE(X), %xmm0 @@ -1085,7 +1085,7 @@ movddup %xmm0, %xmm14 pxor %xmm15, %xmm15 subsd %xmm1, %xmm15 - movlhps %xmm1, %xmm15 + movlhps %xmm1, %xmm15 subq $-16 * SIZE, X @@ -1399,7 +1399,7 @@ movddup %xmm0, %xmm14 pxor %xmm15, %xmm15 subsd %xmm1, %xmm15 - movlhps %xmm1, %xmm15 + movlhps %xmm1, %xmm15 movq X, XX @@ -1717,7 +1717,7 @@ xorq %rax, %rax RESTOREREGISTERS - + ret EPILOGUE diff --git a/kernel/x86_64/zswap.S b/kernel/x86_64/zswap.S index 8f96875e3..68568f7dd 100644 --- a/kernel/x86_64/zswap.S +++ b/kernel/x86_64/zswap.S @@ -60,7 +60,7 @@ PROLOGUE PROFCODE - + #ifndef WINDOWS_ABI #ifndef XDOUBLE movq 8(%rsp), INCY diff --git a/kernel/x86_64/zswap_sse.S b/kernel/x86_64/zswap_sse.S index 2f217592f..12f9875ba 100644 --- a/kernel/x86_64/zswap_sse.S +++ b/kernel/x86_64/zswap_sse.S @@ -86,7 +86,7 @@ subq $-32 * SIZE, X subq $-32 * SIZE, Y - + cmpq $3, M jle .L16 @@ -312,7 +312,7 @@ .L20: movaps -33 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 - + movss %xmm1, -32 * SIZE(X) pshufd $0x39, %xmm1, %xmm3 movlps %xmm3, -31 * SIZE(X) @@ -796,7 +796,7 @@ .L40: movaps -35 * SIZE(X), %xmm0 movaps -32 * SIZE(Y), %xmm1 - + movss %xmm1, -32 * SIZE(X) subq $3, M diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S index 204e5e6ab..3a5243bab 100644 --- a/kernel/x86_64/zsymv_L_sse.S +++ b/kernel/x86_64/zsymv_L_sse.S @@ -91,7 +91,7 @@ #ifndef WINDOWS_ABI #define STACKSIZE 80 - + #define OLD_Y 8 + STACKSIZE(%rsp) #define OLD_INCY 16 + STACKSIZE(%rsp) #define OLD_BUFFER 24 + STACKSIZE(%rsp) @@ -99,14 +99,14 @@ #define M ARG1 #define N ARG2 #define A ARG3 -#define LDA ARG4 +#define LDA ARG4 #define X ARG5 -#define INCX ARG6 +#define INCX ARG6 #else #define STACKSIZE 256 - + #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_LDA 48 + STACKSIZE(%rsp) #define OLD_X 56 + STACKSIZE(%rsp) diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index 5769d242a..295ab1a83 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -97,7 +97,7 @@ #ifndef WINDOWS_ABI #define STACKSIZE 80 - + #define OLD_Y 8 + STACKSIZE(%rsp) #define OLD_INCY 16 + STACKSIZE(%rsp) #define OLD_BUFFER 24 + STACKSIZE(%rsp) @@ -105,14 +105,14 @@ #define M ARG1 #define N ARG2 #define A ARG3 -#define LDA ARG4 +#define LDA ARG4 #define X ARG5 -#define INCX ARG6 +#define INCX ARG6 #else #define STACKSIZE 256 - + #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_LDA 48 + STACKSIZE(%rsp) #define OLD_X 56 + STACKSIZE(%rsp) diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S index 6f782b1e2..cf302e4ed 100644 --- a/kernel/x86_64/zsymv_U_sse.S +++ b/kernel/x86_64/zsymv_U_sse.S @@ -93,11 +93,11 @@ #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 14) #endif - + #ifndef WINDOWS_ABI #define STACKSIZE 80 - + #define OLD_Y 8 + STACKSIZE(%rsp) #define OLD_INCY 16 + STACKSIZE(%rsp) #define OLD_BUFFER 24 + STACKSIZE(%rsp) @@ -105,14 +105,14 @@ #define M ARG1 #define N ARG2 #define A ARG3 -#define LDA ARG4 +#define LDA ARG4 #define X ARG5 -#define INCX ARG6 +#define INCX ARG6 #else #define STACKSIZE 256 - + #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_LDA 48 + STACKSIZE(%rsp) #define OLD_X 56 + STACKSIZE(%rsp) diff --git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S index f92779e24..7c290137d 100644 --- a/kernel/x86_64/zsymv_U_sse2.S +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -97,7 +97,7 @@ #ifndef WINDOWS_ABI #define STACKSIZE 80 - + #define OLD_Y 8 + STACKSIZE(%rsp) #define OLD_INCY 16 + STACKSIZE(%rsp) #define OLD_BUFFER 24 + STACKSIZE(%rsp) @@ -105,14 +105,14 @@ #define M ARG1 #define IS ARG2 #define A ARG3 -#define LDA ARG4 +#define LDA ARG4 #define X ARG5 -#define INCX ARG6 +#define INCX ARG6 #else #define STACKSIZE 256 - + #define OLD_A 40 + STACKSIZE(%rsp) #define OLD_LDA 48 + STACKSIZE(%rsp) #define OLD_X 56 + STACKSIZE(%rsp) diff --git a/kernel/x86_64/ztrsm_kernel_LN_2x1_atom.S b/kernel/x86_64/ztrsm_kernel_LN_2x1_atom.S index 31bd57b43..d3bedff12 100644 --- a/kernel/x86_64/ztrsm_kernel_LN_2x1_atom.S +++ b/kernel/x86_64/ztrsm_kernel_LN_2x1_atom.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M %rdi #define N %rsi #define K %rdx @@ -55,7 +55,7 @@ #define CO1 %r15 #define BB %rbx #define KK %rbp - + #ifndef WINDOWS_ABI #define STACKSIZE 128 @@ -107,9 +107,9 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp - + movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) @@ -144,7 +144,7 @@ movq OLD_OFFSET, KK movq KK, OFFSET - + salq $ZBASE_SHIFT, LDC #ifdef LN @@ -168,7 +168,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, KK @@ -203,7 +203,7 @@ #ifdef LN movq OFFSET, KK addq M, KK -#endif +#endif #ifdef LT movq OFFSET, KK #endif @@ -229,7 +229,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 @@ -502,7 +502,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif prefetcht0 0 * SIZE(BB) subq $-8 * SIZE, BB @@ -547,7 +547,7 @@ ADDSD3 %xmm7, %xmm14 movsd 3 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 - + ADDSD4 %xmm6, %xmm15 PREFETCH ((PREFETCHSIZE) >> 1 + 0) * SIZE(BO) movaps %xmm4, %xmm6 @@ -943,7 +943,7 @@ #endif decq I # i -- jg .L10 - ALIGN_4 + ALIGN_4 .L99: #ifdef LN @@ -965,7 +965,7 @@ decq J # j -- jg .L01 ALIGN_4 - + .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp diff --git a/kernel/x86_64/ztrsm_kernel_LN_2x2_core2.S b/kernel/x86_64/ztrsm_kernel_LN_2x2_core2.S index 065abe0ce..80485c0de 100644 --- a/kernel/x86_64/ztrsm_kernel_LN_2x2_core2.S +++ b/kernel/x86_64/ztrsm_kernel_LN_2x2_core2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -49,7 +49,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -84,7 +84,7 @@ #define AORIG 48(%rsp) #define BORIG 56(%rsp) #define BUFFER 128(%rsp) - + #define PREFETCH_R (8 * 4 + 0) #define PREFETCH_W (PREFETCH_R) @@ -106,9 +106,9 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp - + movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) @@ -186,7 +186,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, %rax @@ -204,10 +204,10 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq 16 * SIZE + BUFFER, BO - + #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax @@ -220,7 +220,7 @@ salq $ZBASE_SHIFT, %rax leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO -#endif +#endif #if defined(LT) movq OFFSET, %rax @@ -238,7 +238,7 @@ addq %rax, %rax ALIGN_4 - + .L02: prefetcht0 (PREFETCH_R + 0) * SIZE(B) @@ -300,7 +300,7 @@ decq %rax jne .L04 ALIGN_4 - + .L05: #if defined(LT) || defined(RN) movq A, AO @@ -342,7 +342,7 @@ movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -709,7 +709,7 @@ movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif prefetcht2 0 * SIZE(BB) @@ -752,7 +752,7 @@ jle .L15 ALIGN_4 -.L12: +.L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 @@ -1310,7 +1310,7 @@ decq I # i -- jg .L10 - ALIGN_4 + ALIGN_4 .L99: #ifdef LN @@ -1345,7 +1345,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO @@ -1361,7 +1361,7 @@ salq $ZBASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO -#endif +#endif #if defined(LT) movq OFFSET, %rax @@ -1377,7 +1377,7 @@ sarq $2, %rax jle .L103 ALIGN_4 - + .L102: movddup -16 * SIZE(B), %xmm8 movddup -15 * SIZE(B), %xmm9 @@ -1427,7 +1427,7 @@ decq %rax jne .L104 ALIGN_4 - + .L105: #if defined(LT) || defined(RN) movq A, AO @@ -1468,7 +1468,7 @@ movq KK, %rax salq $0 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -1740,7 +1740,7 @@ movq KK, %rax salq $0 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -2106,7 +2106,7 @@ decq I # i -- jg .L110 - ALIGN_4 + ALIGN_4 .L199: #ifdef LN @@ -2130,7 +2130,7 @@ #endif ALIGN_4 - + .L999: movq %r15, %rsp diff --git a/kernel/x86_64/ztrsm_kernel_LN_2x2_penryn.S b/kernel/x86_64/ztrsm_kernel_LN_2x2_penryn.S index 093a580ba..3a691ca8c 100644 --- a/kernel/x86_64/ztrsm_kernel_LN_2x2_penryn.S +++ b/kernel/x86_64/ztrsm_kernel_LN_2x2_penryn.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx @@ -51,7 +51,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -98,7 +98,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -185,7 +185,7 @@ movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 2), %rax subq %rax, C #endif @@ -199,7 +199,7 @@ #ifdef LN movq OFFSET, KK addq M, KK -#endif +#endif movq K, %rax salq $ZBASE_SHIFT + 1, %rax @@ -228,7 +228,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 movaps -16 * SIZE(BO), %xmm2 @@ -253,7 +253,7 @@ jle .L25 ALIGN_4 -.L22: +.L22: ADD1 %xmm3, %xmm12 movaps -14 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 @@ -554,7 +554,7 @@ salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L20: movq M, I @@ -578,7 +578,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif prefetcht2 -16 * SIZE(BB) subq $-8 * SIZE, BB @@ -622,7 +622,7 @@ jle .L15 ALIGN_3 -.L12: +.L12: ADD1 %xmm3, %xmm12 movaps -14 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 @@ -1267,7 +1267,7 @@ decq I # i -- BRANCH jg .L11 - ALIGN_4 + ALIGN_4 .L39: #ifdef LN @@ -1307,7 +1307,7 @@ movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, B - + subq LDC, C #endif @@ -1320,7 +1320,7 @@ #ifdef LN movq OFFSET, KK addq M, KK -#endif +#endif movq K, %rax salq $ZBASE_SHIFT, %rax @@ -1329,7 +1329,7 @@ #ifdef LT movq OFFSET, KK #endif - + testq $1, M BRANCH jle .L60 @@ -1349,7 +1349,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1369,7 +1369,7 @@ jle .L65 ALIGN_4 -.L62: +.L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x4e, %xmm2, %xmm7 @@ -1586,7 +1586,7 @@ salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L60: movq M, I @@ -1610,7 +1610,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif prefetcht2 -16 * SIZE(BB) subq $-4 * SIZE, BB @@ -1636,7 +1636,7 @@ jle .L55 ALIGN_4 -.L52: +.L52: movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 @@ -1966,7 +1966,7 @@ decq I BRANCH jg .L51 - ALIGN_4 + ALIGN_4 .L79: #ifdef LN diff --git a/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S b/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S index 79f20b641..542bd5947 100644 --- a/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S +++ b/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -55,7 +55,7 @@ #define BO %rsi #define CO1 %r15 #define CO2 %rbp - + #ifndef WINDOWS_ABI #define STACKSIZE 64 @@ -226,9 +226,9 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp - + movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) @@ -309,7 +309,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, %rax @@ -327,11 +327,11 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO - + #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax @@ -344,7 +344,7 @@ salq $ZBASE_SHIFT, %rax leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO -#endif +#endif #if defined(LT) movq OFFSET, %rax @@ -362,7 +362,7 @@ addq %rax, %rax ALIGN_4 - + .L02: PREFETCHNTA 56 * SIZE(B) @@ -431,7 +431,7 @@ decq %rax jne .L04 ALIGN_4 - + .L05: #if defined(LT) || defined(RN) movq A, AO @@ -473,7 +473,7 @@ movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 @@ -859,7 +859,7 @@ movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -888,7 +888,7 @@ PREFETCHW 4 * SIZE(CO2) pxor %xmm7, %xmm7 #endif - + #if defined(LT) || defined(RN) movq KK, %rax #else @@ -898,7 +898,7 @@ andq $-8, %rax salq $4, %rax je .L15 -.L1X: +.L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ -1422,9 +1422,9 @@ decq I # i -- jg .L10 - ALIGN_4 + ALIGN_4 + - .L99: #ifdef LN leaq (, K, SIZE), %rax @@ -1458,7 +1458,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO @@ -1475,7 +1475,7 @@ salq $ZBASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO -#endif +#endif #if defined(LT) movq OFFSET, %rax @@ -1491,7 +1491,7 @@ sarq $2, %rax jle .L103 ALIGN_4 - + .L102: movlpd 0 * SIZE(B), %xmm0 movlpd 1 * SIZE(B), %xmm1 @@ -1551,7 +1551,7 @@ decq %rax jne .L104 ALIGN_4 - + .L105: #if defined(LT) || defined(RN) movq A, AO @@ -1592,7 +1592,7 @@ movq KK, %rax salq $0 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 @@ -1833,7 +1833,7 @@ addq %rax, AORIG #endif ALIGN_4 - + .L130: movq M, I sarq $1, I # i = (m >> 2) @@ -1860,7 +1860,7 @@ movq KK, %rax salq $0 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 @@ -1872,7 +1872,7 @@ PREFETCHW 4 * SIZE(CO1) #endif - + #if defined(LT) || defined(RN) movq KK, %rax #else @@ -2224,7 +2224,7 @@ decq I # i -- jg .L110 - ALIGN_4 + ALIGN_4 .L199: #ifdef LN diff --git a/kernel/x86_64/ztrsm_kernel_LN_2x2_sse3.S b/kernel/x86_64/ztrsm_kernel_LN_2x2_sse3.S index 74a799af4..7547421ef 100644 --- a/kernel/x86_64/ztrsm_kernel_LN_2x2_sse3.S +++ b/kernel/x86_64/ztrsm_kernel_LN_2x2_sse3.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M %rdi #define N %rsi #define K %rdx @@ -55,7 +55,7 @@ #define CO1 %r15 #define CO2 %rbx #define KK %rbp - + #ifndef WINDOWS_ABI #define STACKSIZE 128 @@ -338,9 +338,9 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp - + movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) @@ -375,7 +375,7 @@ movq OLD_OFFSET, KK movq KK, OFFSET - + salq $ZBASE_SHIFT, LDC #ifdef LN @@ -399,7 +399,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, KK @@ -422,7 +422,7 @@ movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 2), %rax subq %rax, C #endif @@ -437,7 +437,7 @@ #ifdef LN movq OFFSET, KK addq M, KK -#endif +#endif #ifdef LT movq OFFSET, KK #endif @@ -459,7 +459,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -884,7 +884,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -921,8 +921,8 @@ andq $-8, %rax salq $4, %rax je .L12 - -.L1X: + +.L1X: KERNEL1 (16 * 0) KERNEL2 (16 * 0) KERNEL3 (16 * 0) @@ -1403,7 +1403,7 @@ #endif decq I # i -- jg .L10 - ALIGN_4 + ALIGN_4 .L99: #ifdef LN @@ -1440,7 +1440,7 @@ movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, B - + subq LDC, C #endif movq C, CO1 # coffset1 = c @@ -1451,7 +1451,7 @@ #ifdef LN movq OFFSET, KK addq M, KK -#endif +#endif #ifdef LT movq OFFSET, KK #endif @@ -1474,7 +1474,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -1756,7 +1756,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -2153,7 +2153,7 @@ decq I # i -- jg .L110 - ALIGN_4 + ALIGN_4 .L149: #ifdef LN @@ -2173,7 +2173,7 @@ subq $1, KK #endif ALIGN_3 - + .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp diff --git a/kernel/x86_64/ztrsm_kernel_LN_2x4_nehalem.S b/kernel/x86_64/ztrsm_kernel_LN_2x4_nehalem.S index fc5a4a317..5d931cea8 100644 --- a/kernel/x86_64/ztrsm_kernel_LN_2x4_nehalem.S +++ b/kernel/x86_64/ztrsm_kernel_LN_2x4_nehalem.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx @@ -51,7 +51,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -98,7 +98,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -185,7 +185,7 @@ movq K, %rax salq $2 + ZBASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 4), %rax subq %rax, C #endif @@ -199,7 +199,7 @@ #ifdef LN movq OFFSET, KK addq M, KK -#endif +#endif movq K, %rax salq $ZBASE_SHIFT + 2, %rax @@ -229,7 +229,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -254,7 +254,7 @@ jle .L25 ALIGN_3 -.L22: +.L22: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 @@ -780,7 +780,7 @@ salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L20: movq M, I @@ -806,11 +806,11 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif prefetchnta -32 * SIZE(BB) subq $-16 * SIZE, BB - + xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 @@ -842,7 +842,7 @@ jle .L15 ALIGN_3 -.L12: +.L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm12 @@ -1585,7 +1585,7 @@ decq I # i -- BRANCH jg .L11 - ALIGN_4 + ALIGN_4 .L29: #ifdef LN @@ -1626,7 +1626,7 @@ movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 2), %rax subq %rax, C #endif @@ -1640,7 +1640,7 @@ #ifdef LN movq OFFSET, KK addq M, KK -#endif +#endif #ifdef LT movq OFFSET, KK @@ -1666,7 +1666,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -1691,7 +1691,7 @@ jle .L45 ALIGN_3 -.L42: +.L42: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 @@ -1978,7 +1978,7 @@ salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L40: movq M, I @@ -2004,7 +2004,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 @@ -2030,7 +2030,7 @@ jle .L35 ALIGN_3 -.L32: +.L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm8 @@ -2444,7 +2444,7 @@ decq I # i -- BRANCH jg .L31 - ALIGN_4 + ALIGN_4 .L49: #ifdef LN @@ -2481,7 +2481,7 @@ movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, B - + subq LDC, C #endif @@ -2493,7 +2493,7 @@ #ifdef LN movq OFFSET, KK addq M, KK -#endif +#endif #ifdef LT movq OFFSET, KK @@ -2519,7 +2519,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -2539,7 +2539,7 @@ jle .L65 ALIGN_3 -.L62: +.L62: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 @@ -2728,7 +2728,7 @@ salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L60: movq M, I @@ -2754,7 +2754,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 @@ -2779,7 +2779,7 @@ jle .L55 ALIGN_3 -.L52: +.L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm8 @@ -3065,7 +3065,7 @@ decq I # i -- BRANCH jg .L51 - ALIGN_4 + ALIGN_4 .L69: #ifdef LN diff --git a/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S index f5c100ec1..cd86db289 100644 --- a/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S +++ b/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -225,9 +225,9 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp - + movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) @@ -314,7 +314,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, %rax @@ -332,10 +332,10 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO - + #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax @@ -348,7 +348,7 @@ salq $ZBASE_SHIFT, %rax leaq (B, %rax, 2), B leaq (BO, %rax, 8), BO -#endif +#endif #if defined(LT) movq OFFSET, %rax @@ -364,7 +364,7 @@ sarq $2, %rax jle .L03 ALIGN_4 - + .L02: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 @@ -448,7 +448,7 @@ decq %rax jne .L04 ALIGN_4 - + .L10: #if defined(LT) || defined(RN) movq A, AO @@ -490,7 +490,7 @@ movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movlps 0 * SIZE(AO), %xmm8 movhps 2 * SIZE(AO), %xmm8 @@ -946,7 +946,7 @@ salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L20: testq $2, M @@ -971,7 +971,7 @@ movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -1506,7 +1506,7 @@ salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L30: movq M, I @@ -1534,7 +1534,7 @@ movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(BO), %xmm9 movaps 4 * SIZE(BO), %xmm11 @@ -1566,7 +1566,7 @@ andq $-8, %rax salq $4, %rax je .L15 -.L1X: +.L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) @@ -2358,7 +2358,7 @@ decq I # i -- jg .L11 - ALIGN_4 + ALIGN_4 .L39: #ifdef LN @@ -2394,7 +2394,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO @@ -2410,7 +2410,7 @@ salq $ZBASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO -#endif +#endif #if defined(LT) movq OFFSET, %rax @@ -2426,7 +2426,7 @@ sarq $2, %rax jle .L43 ALIGN_4 - + .L42: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 @@ -2483,7 +2483,7 @@ decq %rax jne .L44 ALIGN_4 - + .L50: #if defined(LT) || defined(RN) movq A, AO @@ -2523,7 +2523,7 @@ movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movsd 0 * SIZE(AO), %xmm8 movhps 2 * SIZE(AO), %xmm8 @@ -2796,7 +2796,7 @@ salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L60: testq $2, M @@ -2821,7 +2821,7 @@ movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -3186,7 +3186,7 @@ salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L70: movq M, I @@ -3214,7 +3214,7 @@ movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -3950,7 +3950,7 @@ decq I # i -- jg .L51 - ALIGN_4 + ALIGN_4 .L79: #ifdef LN diff --git a/kernel/x86_64/ztrsm_kernel_LT_1x4_nehalem.S b/kernel/x86_64/ztrsm_kernel_LT_1x4_nehalem.S index e53e29759..874d34db3 100644 --- a/kernel/x86_64/ztrsm_kernel_LT_1x4_nehalem.S +++ b/kernel/x86_64/ztrsm_kernel_LT_1x4_nehalem.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx @@ -51,7 +51,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -97,7 +97,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -187,7 +187,7 @@ movq K, %rax salq $2 + ZBASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 4), %rax subq %rax, C #endif @@ -201,7 +201,7 @@ #ifdef LN movq OFFSET, KK addq M, KK -#endif +#endif movq K, %rax salq $ZBASE_SHIFT + 2, %rax @@ -231,7 +231,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif prefetchnta -16 * SIZE(BB) subq $-8 * SIZE, BB @@ -267,7 +267,7 @@ jle .L15 ALIGN_3 -.L12: +.L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm12 @@ -859,7 +859,7 @@ movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 2), %rax subq %rax, C #endif @@ -873,7 +873,7 @@ #ifdef LN movq OFFSET, KK addq M, KK -#endif +#endif #ifdef LT movq OFFSET, KK @@ -899,7 +899,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 @@ -925,7 +925,7 @@ jle .L25 ALIGN_3 -.L22: +.L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm8 @@ -1275,7 +1275,7 @@ movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, B - + subq LDC, C #endif movq C, CO1 @@ -1286,7 +1286,7 @@ #ifdef LN movq OFFSET, KK addq M, KK -#endif +#endif #ifdef LT movq OFFSET, KK @@ -1312,7 +1312,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 @@ -1335,7 +1335,7 @@ jle .L35 ALIGN_3 -.L32: +.L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm8 @@ -1555,7 +1555,7 @@ #ifdef RT subq $1, KK #endif - ALIGN_4 + ALIGN_4 .L999: movq 0(%rsp), %rbx diff --git a/kernel/x86_64/ztrsm_kernel_LT_2x1_atom.S b/kernel/x86_64/ztrsm_kernel_LT_2x1_atom.S index a1760adf1..92dc6366b 100644 --- a/kernel/x86_64/ztrsm_kernel_LT_2x1_atom.S +++ b/kernel/x86_64/ztrsm_kernel_LT_2x1_atom.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M %rdi #define N %rsi #define K %rdx @@ -55,7 +55,7 @@ #define CO1 %r15 #define BB %rbx #define KK %rbp - + #ifndef WINDOWS_ABI #define STACKSIZE 128 @@ -107,9 +107,9 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp - + movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) @@ -144,7 +144,7 @@ movq OLD_OFFSET, KK movq KK, OFFSET - + salq $ZBASE_SHIFT, LDC #ifdef LN @@ -168,7 +168,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, KK @@ -203,7 +203,7 @@ #ifdef LN movq OFFSET, KK addq M, KK -#endif +#endif #ifdef LT movq OFFSET, KK #endif @@ -232,7 +232,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif prefetcht0 0 * SIZE(BB) subq $-8 * SIZE, BB @@ -277,7 +277,7 @@ ADDSD3 %xmm7, %xmm14 movsd 3 * SIZE(AO), %xmm7 mulsd %xmm3, %xmm2 - + ADDSD4 %xmm6, %xmm15 PREFETCH ((PREFETCHSIZE) >> 1 + 0) * SIZE(BO) movaps %xmm4, %xmm6 @@ -673,7 +673,7 @@ #endif decq I # i -- jg .L10 - ALIGN_4 + ALIGN_4 .L20: testq $1, M @@ -693,7 +693,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movsd 0 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 @@ -944,7 +944,7 @@ addq %rax, AORIG #endif ALIGN_4 - + .L99: #ifdef LN leaq (, K, SIZE), %rax @@ -965,7 +965,7 @@ decq J # j -- jg .L01 ALIGN_4 - + .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp diff --git a/kernel/x86_64/ztrsm_kernel_LT_2x2_core2.S b/kernel/x86_64/ztrsm_kernel_LT_2x2_core2.S index 93cbcad2d..6cf850609 100644 --- a/kernel/x86_64/ztrsm_kernel_LT_2x2_core2.S +++ b/kernel/x86_64/ztrsm_kernel_LT_2x2_core2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -49,7 +49,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -84,7 +84,7 @@ #define AORIG 48(%rsp) #define BORIG 56(%rsp) #define BUFFER 128(%rsp) - + #define PREFETCH_R (8 * 4 + 0) #define PREFETCH_W (PREFETCH_R) @@ -106,9 +106,9 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp - + movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) @@ -186,7 +186,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, %rax @@ -204,10 +204,10 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq 16 * SIZE + BUFFER, BO - + #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax @@ -220,7 +220,7 @@ salq $ZBASE_SHIFT, %rax leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO -#endif +#endif #if defined(LT) movq OFFSET, %rax @@ -238,7 +238,7 @@ addq %rax, %rax ALIGN_4 - + .L02: prefetcht0 (PREFETCH_R + 0) * SIZE(B) @@ -300,7 +300,7 @@ decq %rax jne .L04 ALIGN_4 - + .L05: #if defined(LT) || defined(RN) movq A, AO @@ -347,7 +347,7 @@ movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif prefetcht2 0 * SIZE(BB) @@ -390,7 +390,7 @@ jle .L15 ALIGN_4 -.L12: +.L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 @@ -948,7 +948,7 @@ decq I # i -- jg .L10 - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -973,7 +973,7 @@ movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -1311,7 +1311,7 @@ addq %rax, AORIG #endif ALIGN_4 - + .L99: #ifdef LN leaq (, K, SIZE), %rax @@ -1345,7 +1345,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO @@ -1361,7 +1361,7 @@ salq $ZBASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO -#endif +#endif #if defined(LT) movq OFFSET, %rax @@ -1377,7 +1377,7 @@ sarq $2, %rax jle .L103 ALIGN_4 - + .L102: movddup -16 * SIZE(B), %xmm8 movddup -15 * SIZE(B), %xmm9 @@ -1427,7 +1427,7 @@ decq %rax jne .L104 ALIGN_4 - + .L105: #if defined(LT) || defined(RN) movq A, AO @@ -1469,7 +1469,7 @@ movq KK, %rax salq $0 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -1835,7 +1835,7 @@ decq I # i -- jg .L110 - ALIGN_4 + ALIGN_4 .L130: testq $1, M @@ -1862,7 +1862,7 @@ movq KK, %rax salq $0 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -2107,7 +2107,7 @@ addq %rax, AORIG #endif ALIGN_4 - + .L199: #ifdef LN leaq (, K, SIZE), %rax @@ -2130,7 +2130,7 @@ #endif ALIGN_4 - + .L999: movq %r15, %rsp diff --git a/kernel/x86_64/ztrsm_kernel_LT_2x2_penryn.S b/kernel/x86_64/ztrsm_kernel_LT_2x2_penryn.S index e38e87ec9..007811758 100644 --- a/kernel/x86_64/ztrsm_kernel_LT_2x2_penryn.S +++ b/kernel/x86_64/ztrsm_kernel_LT_2x2_penryn.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx @@ -51,7 +51,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -98,7 +98,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -185,7 +185,7 @@ movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 2), %rax subq %rax, C #endif @@ -199,7 +199,7 @@ #ifdef LN movq OFFSET, KK addq M, KK -#endif +#endif movq K, %rax salq $ZBASE_SHIFT + 1, %rax @@ -230,7 +230,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif prefetcht2 -16 * SIZE(BB) subq $-8 * SIZE, BB @@ -274,7 +274,7 @@ jle .L15 ALIGN_3 -.L12: +.L12: ADD1 %xmm3, %xmm12 movaps -14 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 @@ -919,7 +919,7 @@ decq I # i -- BRANCH jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $1, M @@ -941,7 +941,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 movaps -16 * SIZE(BO), %xmm2 @@ -966,7 +966,7 @@ jle .L25 ALIGN_4 -.L22: +.L22: ADD1 %xmm3, %xmm12 movaps -14 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 @@ -1267,7 +1267,7 @@ salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L39: #ifdef LN @@ -1307,7 +1307,7 @@ movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, B - + subq LDC, C #endif @@ -1320,7 +1320,7 @@ #ifdef LN movq OFFSET, KK addq M, KK -#endif +#endif movq K, %rax salq $ZBASE_SHIFT + 1, %rax @@ -1329,7 +1329,7 @@ #ifdef LT movq OFFSET, KK #endif - + movq M, I sarq $1, I # i = (m >> 2) NOBRANCH @@ -1351,7 +1351,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif prefetcht2 -16 * SIZE(BB) subq $-4 * SIZE, BB @@ -1377,7 +1377,7 @@ jle .L55 ALIGN_4 -.L52: +.L52: movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 @@ -1707,7 +1707,7 @@ decq I BRANCH jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $1, M @@ -1729,7 +1729,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -1749,7 +1749,7 @@ jle .L65 ALIGN_4 -.L62: +.L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x4e, %xmm2, %xmm7 @@ -1966,7 +1966,7 @@ salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L79: #ifdef LN diff --git a/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S b/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S index 18edeed57..d90bfd8a5 100644 --- a/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S +++ b/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -55,7 +55,7 @@ #define BO %rsi #define CO1 %r15 #define CO2 %rbp - + #ifndef WINDOWS_ABI #define STACKSIZE 64 @@ -213,7 +213,7 @@ movapd 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ addpd %xmm14, %xmm7 ;\ movapd 22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 - + #ifndef CONJ #define NN @@ -227,9 +227,9 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp - + movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) @@ -310,7 +310,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, %rax @@ -328,10 +328,10 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO - + #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax @@ -344,7 +344,7 @@ salq $ZBASE_SHIFT, %rax leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO -#endif +#endif #if defined(LT) movq OFFSET, %rax @@ -362,7 +362,7 @@ addq %rax, %rax ALIGN_4 - + .L02: PREFETCHNTA 56 * SIZE(B) @@ -431,7 +431,7 @@ decq %rax jne .L04 ALIGN_4 - + .L05: #if defined(LT) || defined(RN) movq A, AO @@ -476,7 +476,7 @@ movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -508,7 +508,7 @@ andq $-8, %rax salq $4, %rax je .L15 -.L1X: +.L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ -1032,7 +1032,7 @@ decq I # i -- jg .L10 - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -1057,7 +1057,7 @@ movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 @@ -1416,7 +1416,7 @@ addq %rax, AORIG #endif ALIGN_4 - + .L99: #ifdef LN leaq (, K, SIZE), %rax @@ -1450,7 +1450,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO @@ -1467,7 +1467,7 @@ salq $ZBASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO -#endif +#endif #if defined(LT) movq OFFSET, %rax @@ -1483,7 +1483,7 @@ sarq $2, %rax jle .L103 ALIGN_4 - + .L102: movlpd 0 * SIZE(B), %xmm0 movlpd 1 * SIZE(B), %xmm1 @@ -1543,7 +1543,7 @@ decq %rax jne .L104 ALIGN_4 - + .L105: #if defined(LT) || defined(RN) movq A, AO @@ -1585,7 +1585,7 @@ movq KK, %rax salq $0 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 @@ -1944,7 +1944,7 @@ decq I # i -- jg .L110 - ALIGN_4 + ALIGN_4 .L130: testq $1, M @@ -1971,7 +1971,7 @@ movq KK, %rax salq $0 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 @@ -2212,7 +2212,7 @@ addq %rax, AORIG #endif ALIGN_4 - + .L199: #ifdef LN leaq (, K, SIZE), %rax diff --git a/kernel/x86_64/ztrsm_kernel_LT_2x2_sse3.S b/kernel/x86_64/ztrsm_kernel_LT_2x2_sse3.S index 708a984da..c52b058bf 100644 --- a/kernel/x86_64/ztrsm_kernel_LT_2x2_sse3.S +++ b/kernel/x86_64/ztrsm_kernel_LT_2x2_sse3.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M %rdi #define N %rsi #define K %rdx @@ -55,7 +55,7 @@ #define CO1 %r15 #define CO2 %rbx #define KK %rbp - + #ifndef WINDOWS_ABI #define STACKSIZE 128 @@ -338,9 +338,9 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp - + movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) @@ -375,7 +375,7 @@ movq OLD_OFFSET, KK movq KK, OFFSET - + salq $ZBASE_SHIFT, LDC #ifdef LN @@ -399,7 +399,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, KK @@ -422,7 +422,7 @@ movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 2), %rax subq %rax, C #endif @@ -437,7 +437,7 @@ #ifdef LN movq OFFSET, KK addq M, KK -#endif +#endif #ifdef LT movq OFFSET, KK #endif @@ -462,7 +462,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -494,8 +494,8 @@ andq $-8, %rax salq $4, %rax je .L12 - -.L1X: + +.L1X: KERNEL1 (16 * 0) KERNEL2 (16 * 0) KERNEL3 (16 * 0) @@ -976,7 +976,7 @@ #endif decq I # i -- jg .L10 - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -996,7 +996,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -1399,7 +1399,7 @@ addq %rax, AORIG #endif ALIGN_4 - + .L99: #ifdef LN leaq (, K, SIZE), %rax @@ -1435,7 +1435,7 @@ movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, B - + subq LDC, C #endif movq C, CO1 # coffset1 = c @@ -1446,7 +1446,7 @@ #ifdef LN movq OFFSET, KK addq M, KK -#endif +#endif #ifdef LT movq OFFSET, KK #endif @@ -1471,7 +1471,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -1864,7 +1864,7 @@ decq I # i -- jg .L110 - ALIGN_4 + ALIGN_4 .L130: testq $1, M @@ -1885,7 +1885,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -2164,7 +2164,7 @@ subq $1, KK #endif ALIGN_3 - + .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp diff --git a/kernel/x86_64/ztrsm_kernel_LT_2x4_nehalem.S b/kernel/x86_64/ztrsm_kernel_LT_2x4_nehalem.S index d07930dec..0d6531ac8 100644 --- a/kernel/x86_64/ztrsm_kernel_LT_2x4_nehalem.S +++ b/kernel/x86_64/ztrsm_kernel_LT_2x4_nehalem.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx @@ -51,7 +51,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -98,7 +98,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -185,7 +185,7 @@ movq K, %rax salq $2 + ZBASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 4), %rax subq %rax, C #endif @@ -199,7 +199,7 @@ #ifdef LN movq OFFSET, KK addq M, KK -#endif +#endif movq K, %rax salq $ZBASE_SHIFT + 2, %rax @@ -232,11 +232,11 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif prefetchnta -32 * SIZE(BB) subq $-16 * SIZE, BB - + xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 @@ -268,7 +268,7 @@ jle .L15 ALIGN_3 -.L12: +.L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm12 @@ -1011,7 +1011,7 @@ decq I # i -- BRANCH jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $1, M @@ -1034,7 +1034,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -1059,7 +1059,7 @@ jle .L25 ALIGN_3 -.L22: +.L22: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 @@ -1585,8 +1585,8 @@ salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L29: #ifdef LN movq K, %rax @@ -1626,7 +1626,7 @@ movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 2), %rax subq %rax, C #endif @@ -1640,7 +1640,7 @@ #ifdef LN movq OFFSET, KK addq M, KK -#endif +#endif #ifdef LT movq OFFSET, KK @@ -1669,7 +1669,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 @@ -1695,7 +1695,7 @@ jle .L35 ALIGN_3 -.L32: +.L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm8 @@ -2109,7 +2109,7 @@ decq I # i -- BRANCH jg .L31 - ALIGN_4 + ALIGN_4 .L40: testq $1, M @@ -2132,7 +2132,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -2157,7 +2157,7 @@ jle .L45 ALIGN_3 -.L42: +.L42: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 @@ -2444,8 +2444,8 @@ salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L49: #ifdef LN movq K, %rax @@ -2481,7 +2481,7 @@ movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, B - + subq LDC, C #endif @@ -2493,7 +2493,7 @@ #ifdef LN movq OFFSET, KK addq M, KK -#endif +#endif #ifdef LT movq OFFSET, KK @@ -2522,7 +2522,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 @@ -2547,7 +2547,7 @@ jle .L55 ALIGN_3 -.L52: +.L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm8 @@ -2833,7 +2833,7 @@ decq I # i -- BRANCH jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $1, M @@ -2856,7 +2856,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -2876,7 +2876,7 @@ jle .L65 ALIGN_3 -.L62: +.L62: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 @@ -3065,8 +3065,8 @@ salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L69: #ifdef LN movq K, %rax diff --git a/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S index f58cecdf5..53e5bb7f9 100644 --- a/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S +++ b/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -225,9 +225,9 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp - + movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) @@ -314,7 +314,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, %rax @@ -332,10 +332,10 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO - + #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax @@ -348,7 +348,7 @@ salq $ZBASE_SHIFT, %rax leaq (B, %rax, 2), B leaq (BO, %rax, 8), BO -#endif +#endif #if defined(LT) movq OFFSET, %rax @@ -364,7 +364,7 @@ sarq $2, %rax jle .L03 ALIGN_4 - + .L02: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 @@ -448,7 +448,7 @@ decq %rax jne .L04 ALIGN_4 - + .L10: #if defined(LT) || defined(RN) movq A, AO @@ -493,7 +493,7 @@ movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(BO), %xmm9 movaps 4 * SIZE(BO), %xmm11 @@ -525,7 +525,7 @@ andq $-8, %rax salq $4, %rax je .L15 -.L1X: +.L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) @@ -1317,7 +1317,7 @@ decq I # i -- jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $2, M @@ -1342,7 +1342,7 @@ movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -1877,8 +1877,8 @@ salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L30: testq $1, M je .L39 @@ -1902,7 +1902,7 @@ movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movlps 0 * SIZE(AO), %xmm8 movhps 2 * SIZE(AO), %xmm8 @@ -2358,7 +2358,7 @@ salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L39: #ifdef LN @@ -2394,7 +2394,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO @@ -2410,7 +2410,7 @@ salq $ZBASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO -#endif +#endif #if defined(LT) movq OFFSET, %rax @@ -2426,7 +2426,7 @@ sarq $2, %rax jle .L43 ALIGN_4 - + .L42: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 @@ -2483,7 +2483,7 @@ decq %rax jne .L44 ALIGN_4 - + .L50: #if defined(LT) || defined(RN) movq A, AO @@ -2526,7 +2526,7 @@ movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -3262,7 +3262,7 @@ decq I # i -- jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $2, M @@ -3287,7 +3287,7 @@ movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -3652,8 +3652,8 @@ salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L70: testq $1, M je .L79 @@ -3677,7 +3677,7 @@ movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movsd 0 * SIZE(AO), %xmm8 movhps 2 * SIZE(AO), %xmm8 @@ -3950,7 +3950,7 @@ salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L79: #ifdef LN diff --git a/kernel/x86_64/ztrsm_kernel_RT_1x4_nehalem.S b/kernel/x86_64/ztrsm_kernel_RT_1x4_nehalem.S index 451aafad7..a65c2718b 100644 --- a/kernel/x86_64/ztrsm_kernel_RT_1x4_nehalem.S +++ b/kernel/x86_64/ztrsm_kernel_RT_1x4_nehalem.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx @@ -51,7 +51,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -97,7 +97,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -184,7 +184,7 @@ movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, B - + subq LDC, C #endif movq C, CO1 @@ -195,7 +195,7 @@ #ifdef LN movq OFFSET, KK addq M, KK -#endif +#endif #ifdef LT movq OFFSET, KK @@ -221,7 +221,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 @@ -244,7 +244,7 @@ jle .L35 ALIGN_3 -.L32: +.L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm8 @@ -464,7 +464,7 @@ #ifdef RT subq $1, KK #endif - ALIGN_4 + ALIGN_4 .L20: testq $2, N @@ -481,7 +481,7 @@ movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 2), %rax subq %rax, C #endif @@ -495,7 +495,7 @@ #ifdef LN movq OFFSET, KK addq M, KK -#endif +#endif #ifdef LT movq OFFSET, KK @@ -521,7 +521,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movaps -16 * SIZE(AO), %xmm0 @@ -547,7 +547,7 @@ jle .L25 ALIGN_3 -.L22: +.L22: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm8 @@ -900,7 +900,7 @@ movq K, %rax salq $2 + ZBASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 4), %rax subq %rax, C #endif @@ -914,7 +914,7 @@ #ifdef LN movq OFFSET, KK addq M, KK -#endif +#endif movq K, %rax salq $ZBASE_SHIFT + 2, %rax @@ -944,7 +944,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif prefetchnta -16 * SIZE(BB) subq $-8 * SIZE, BB @@ -980,7 +980,7 @@ jle .L15 ALIGN_3 -.L12: +.L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm12 diff --git a/kernel/x86_64/ztrsm_kernel_RT_2x2_core2.S b/kernel/x86_64/ztrsm_kernel_RT_2x2_core2.S index 005b65eb7..0702b0088 100644 --- a/kernel/x86_64/ztrsm_kernel_RT_2x2_core2.S +++ b/kernel/x86_64/ztrsm_kernel_RT_2x2_core2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -49,7 +49,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -84,7 +84,7 @@ #define AORIG 48(%rsp) #define BORIG 56(%rsp) #define BUFFER 128(%rsp) - + #define PREFETCH_R (8 * 4 + 0) #define PREFETCH_W (PREFETCH_R) @@ -106,9 +106,9 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp - + movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) @@ -186,7 +186,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, %rax @@ -202,7 +202,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO @@ -218,7 +218,7 @@ salq $ZBASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO -#endif +#endif #if defined(LT) movq OFFSET, %rax @@ -234,7 +234,7 @@ sarq $2, %rax jle .L103 ALIGN_4 - + .L102: movddup -16 * SIZE(B), %xmm8 movddup -15 * SIZE(B), %xmm9 @@ -284,7 +284,7 @@ decq %rax jne .L104 ALIGN_4 - + .L105: #if defined(LT) || defined(RN) movq A, AO @@ -326,7 +326,7 @@ movq KK, %rax salq $0 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -692,7 +692,7 @@ decq I # i -- jg .L110 - ALIGN_4 + ALIGN_4 .L130: testq $1, M @@ -719,7 +719,7 @@ movq KK, %rax salq $0 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -964,7 +964,7 @@ addq %rax, AORIG #endif ALIGN_4 - + .L199: #ifdef LN leaq (, K, SIZE), %rax @@ -998,10 +998,10 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq 16 * SIZE + BUFFER, BO - + #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax @@ -1014,7 +1014,7 @@ salq $ZBASE_SHIFT, %rax leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO -#endif +#endif #if defined(LT) movq OFFSET, %rax @@ -1032,7 +1032,7 @@ addq %rax, %rax ALIGN_4 - + .L02: prefetcht0 (PREFETCH_R + 0) * SIZE(B) @@ -1094,7 +1094,7 @@ decq %rax jne .L04 ALIGN_4 - + .L05: #if defined(LT) || defined(RN) movq A, AO @@ -1141,7 +1141,7 @@ movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif prefetcht2 0 * SIZE(BB) @@ -1184,7 +1184,7 @@ jle .L15 ALIGN_4 -.L12: +.L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) movapd -16 * SIZE(AO), %xmm0 @@ -1742,7 +1742,7 @@ decq I # i -- jg .L10 - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -1767,7 +1767,7 @@ movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 @@ -2105,7 +2105,7 @@ addq %rax, AORIG #endif ALIGN_4 - + .L99: #ifdef LN leaq (, K, SIZE), %rax diff --git a/kernel/x86_64/ztrsm_kernel_RT_2x2_penryn.S b/kernel/x86_64/ztrsm_kernel_RT_2x2_penryn.S index 4ed789a94..7770f5d9c 100644 --- a/kernel/x86_64/ztrsm_kernel_RT_2x2_penryn.S +++ b/kernel/x86_64/ztrsm_kernel_RT_2x2_penryn.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx @@ -51,7 +51,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -98,7 +98,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -182,7 +182,7 @@ movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, B - + subq LDC, C #endif @@ -195,12 +195,12 @@ #ifdef LN movq OFFSET, KK addq M, KK -#endif +#endif #ifdef LT movq OFFSET, KK #endif - + movq M, I sarq $1, I # i = (m >> 2) NOBRANCH @@ -222,7 +222,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 movaps -14 * SIZE(AO), %xmm1 @@ -245,7 +245,7 @@ jle .L55 ALIGN_4 -.L52: +.L52: movaps %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm7 mulpd %xmm0, %xmm2 @@ -575,7 +575,7 @@ decq I BRANCH jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $1, M @@ -597,7 +597,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 pxor %xmm8, %xmm8 @@ -617,7 +617,7 @@ jle .L65 ALIGN_4 -.L62: +.L62: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) pshufd $0x4e, %xmm2, %xmm7 @@ -834,7 +834,7 @@ salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L79: #ifdef LN @@ -873,7 +873,7 @@ movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 2), %rax subq %rax, C #endif @@ -887,7 +887,7 @@ #ifdef LN movq OFFSET, KK addq M, KK -#endif +#endif movq K, %rax salq $ZBASE_SHIFT + 1, %rax @@ -919,7 +919,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif prefetcht2 -16 * SIZE(BB) subq $-8 * SIZE, BB @@ -963,7 +963,7 @@ jle .L15 ALIGN_3 -.L12: +.L12: ADD1 %xmm3, %xmm12 movaps -14 * SIZE(BO), %xmm3 ADD1 %xmm4, %xmm14 @@ -1608,7 +1608,7 @@ decq I # i -- BRANCH jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $1, M @@ -1630,7 +1630,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif movaps -16 * SIZE(AO), %xmm0 movaps -16 * SIZE(BO), %xmm2 @@ -1655,7 +1655,7 @@ jle .L25 ALIGN_4 -.L22: +.L22: ADD1 %xmm3, %xmm12 movaps -14 * SIZE(BO), %xmm3 pshufd $0x4e, %xmm2, %xmm7 @@ -1956,7 +1956,7 @@ salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L39: #ifdef LN diff --git a/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S b/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S index 1b589e0cf..2dffe2d32 100644 --- a/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S +++ b/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -55,7 +55,7 @@ #define BO %rsi #define CO1 %r15 #define CO2 %rbp - + #ifndef WINDOWS_ABI #define STACKSIZE 64 @@ -226,9 +226,9 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp - + movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) @@ -309,7 +309,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, %rax @@ -325,7 +325,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO @@ -342,7 +342,7 @@ salq $ZBASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 2), BO -#endif +#endif #if defined(LT) movq OFFSET, %rax @@ -358,7 +358,7 @@ sarq $2, %rax jle .L103 ALIGN_4 - + .L102: movlpd 0 * SIZE(B), %xmm0 movlpd 1 * SIZE(B), %xmm1 @@ -418,7 +418,7 @@ decq %rax jne .L104 ALIGN_4 - + .L105: #if defined(LT) || defined(RN) movq A, AO @@ -460,7 +460,7 @@ movq KK, %rax salq $0 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 @@ -819,7 +819,7 @@ decq I # i -- jg .L110 - ALIGN_4 + ALIGN_4 .L130: testq $1, M @@ -846,7 +846,7 @@ movq KK, %rax salq $0 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 @@ -1087,7 +1087,7 @@ addq %rax, AORIG #endif ALIGN_4 - + .L199: #ifdef LN leaq (, K, SIZE), %rax @@ -1110,7 +1110,7 @@ #endif ALIGN_4 -.L100: +.L100: movq N, J sarq $1, J # j = (n >> 2) jle .L999 @@ -1121,11 +1121,11 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif /* Copying to Sub Buffer */ leaq BUFFER, BO - + #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax @@ -1138,7 +1138,7 @@ salq $ZBASE_SHIFT, %rax leaq (B, %rax, 2), B leaq (BO, %rax, 4), BO -#endif +#endif #if defined(LT) movq OFFSET, %rax @@ -1156,7 +1156,7 @@ addq %rax, %rax ALIGN_4 - + .L02: PREFETCHNTA 56 * SIZE(B) @@ -1225,7 +1225,7 @@ decq %rax jne .L04 ALIGN_4 - + .L05: #if defined(LT) || defined(RN) movq A, AO @@ -1270,7 +1270,7 @@ movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -1302,7 +1302,7 @@ andq $-8, %rax salq $4, %rax je .L15 -.L1X: +.L1X: KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ -1826,7 +1826,7 @@ decq I # i -- jg .L10 - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -1851,7 +1851,7 @@ movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 2), BO -#endif +#endif pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 @@ -2210,7 +2210,7 @@ addq %rax, AORIG #endif ALIGN_4 - + .L99: #ifdef LN leaq (, K, SIZE), %rax @@ -2235,7 +2235,7 @@ decq J # j -- jg .L01 ALIGN_3 - + .L999: movq %rbx, %rsp movq 0(%rsp), %rbx diff --git a/kernel/x86_64/ztrsm_kernel_RT_2x2_sse3.S b/kernel/x86_64/ztrsm_kernel_RT_2x2_sse3.S index ca700eb94..a473df4dc 100644 --- a/kernel/x86_64/ztrsm_kernel_RT_2x2_sse3.S +++ b/kernel/x86_64/ztrsm_kernel_RT_2x2_sse3.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define M %rdi #define N %rsi #define K %rdx @@ -55,7 +55,7 @@ #define CO1 %r15 #define CO2 %rbx #define KK %rbp - + #ifndef WINDOWS_ABI #define STACKSIZE 128 @@ -338,9 +338,9 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp - + movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) @@ -375,7 +375,7 @@ movq OLD_OFFSET, KK movq KK, OFFSET - + salq $ZBASE_SHIFT, LDC #ifdef LN @@ -399,7 +399,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, KK @@ -420,7 +420,7 @@ movq K, %rax salq $0 + ZBASE_SHIFT, %rax subq %rax, B - + subq LDC, C #endif movq C, CO1 # coffset1 = c @@ -431,7 +431,7 @@ #ifdef LN movq OFFSET, KK addq M, KK -#endif +#endif #ifdef LT movq OFFSET, KK #endif @@ -456,7 +456,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -849,7 +849,7 @@ decq I # i -- jg .L110 - ALIGN_4 + ALIGN_4 .L130: testq $1, M @@ -870,7 +870,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -1168,7 +1168,7 @@ movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 2), %rax subq %rax, C #endif @@ -1183,7 +1183,7 @@ #ifdef LN movq OFFSET, KK addq M, KK -#endif +#endif #ifdef LT movq OFFSET, KK #endif @@ -1208,7 +1208,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -1240,8 +1240,8 @@ andq $-8, %rax salq $4, %rax je .L12 - -.L1X: + +.L1X: KERNEL1 (16 * 0) KERNEL2 (16 * 0) KERNEL3 (16 * 0) @@ -1722,7 +1722,7 @@ #endif decq I # i -- jg .L10 - ALIGN_4 + ALIGN_4 .L30: testq $1, M @@ -1742,7 +1742,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif movapd 0 * SIZE(AO), %xmm8 pxor %xmm0, %xmm0 @@ -2145,7 +2145,7 @@ addq %rax, AORIG #endif ALIGN_4 - + .L99: #ifdef LN leaq (, K, SIZE), %rax @@ -2166,7 +2166,7 @@ decq J # j -- jg .L01 - + .L999: movq 0(%rsp), %rbx movq 8(%rsp), %rbp diff --git a/kernel/x86_64/ztrsm_kernel_RT_2x4_nehalem.S b/kernel/x86_64/ztrsm_kernel_RT_2x4_nehalem.S index a5f01340b..ddb5fe098 100644 --- a/kernel/x86_64/ztrsm_kernel_RT_2x4_nehalem.S +++ b/kernel/x86_64/ztrsm_kernel_RT_2x4_nehalem.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define OLD_K %rdx @@ -51,7 +51,7 @@ #define B %r8 #define C %r9 #define LDC %r10 - + #define I %r11 #define AO %rdi #define BO %rsi @@ -98,7 +98,7 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) @@ -182,7 +182,7 @@ movq K, %rax salq $ZBASE_SHIFT, %rax subq %rax, B - + subq LDC, C #endif @@ -194,7 +194,7 @@ #ifdef LN movq OFFSET, KK addq M, KK -#endif +#endif #ifdef LT movq OFFSET, KK @@ -223,7 +223,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 @@ -248,7 +248,7 @@ jle .L55 ALIGN_3 -.L52: +.L52: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm8 @@ -534,7 +534,7 @@ decq I # i -- BRANCH jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $1, M @@ -557,7 +557,7 @@ leaq (B, %rax, 1), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -577,7 +577,7 @@ jle .L65 ALIGN_3 -.L62: +.L62: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 @@ -766,8 +766,8 @@ salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L69: #ifdef LN movq K, %rax @@ -803,7 +803,7 @@ movq K, %rax salq $1 + ZBASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 2), %rax subq %rax, C #endif @@ -817,7 +817,7 @@ #ifdef LN movq OFFSET, KK addq M, KK -#endif +#endif #ifdef LT movq OFFSET, KK @@ -846,7 +846,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 @@ -872,7 +872,7 @@ jle .L35 ALIGN_3 -.L32: +.L32: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm8 @@ -1286,7 +1286,7 @@ decq I # i -- BRANCH jg .L31 - ALIGN_4 + ALIGN_4 .L40: testq $1, M @@ -1309,7 +1309,7 @@ leaq (B, %rax, 2), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -1334,7 +1334,7 @@ jle .L45 ALIGN_3 -.L42: +.L42: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 @@ -1621,8 +1621,8 @@ salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L49: #ifdef LN movq K, %rax @@ -1661,7 +1661,7 @@ movq K, %rax salq $2 + ZBASE_SHIFT, %rax subq %rax, B - + leaq (, LDC, 4), %rax subq %rax, C #endif @@ -1675,7 +1675,7 @@ #ifdef LN movq OFFSET, KK addq M, KK -#endif +#endif movq K, %rax salq $ZBASE_SHIFT + 2, %rax @@ -1708,11 +1708,11 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif prefetchnta -32 * SIZE(BB) subq $-16 * SIZE, BB - + xorps %xmm1, %xmm1 movaps -32 * SIZE(AO), %xmm0 xorps %xmm2, %xmm2 @@ -1744,7 +1744,7 @@ jle .L15 ALIGN_3 -.L12: +.L12: PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) ADD1 %xmm1, %xmm12 @@ -2487,7 +2487,7 @@ decq I # i -- BRANCH jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $1, M @@ -2510,7 +2510,7 @@ leaq (B, %rax, 4), BO #else movq B, BO -#endif +#endif xorps %xmm1, %xmm1 movddup -32 * SIZE(AO), %xmm0 @@ -2535,7 +2535,7 @@ jle .L25 ALIGN_3 -.L22: +.L22: ADD1 %xmm1, %xmm8 pshufd $0xa0, %xmm5, %xmm1 mulps %xmm0, %xmm1 @@ -3061,8 +3061,8 @@ salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L29: #ifdef LN movq K, %rax diff --git a/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S index 2c47ce3fd..20b93e198 100644 --- a/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S +++ b/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S @@ -38,7 +38,7 @@ #define ASSEMBLER #include "common.h" - + #define OLD_M %rdi #define OLD_N %rsi #define M %r13 @@ -225,9 +225,9 @@ PROLOGUE PROFCODE - + subq $STACKSIZE, %rsp - + movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) @@ -315,7 +315,7 @@ #ifdef RN negq KK -#endif +#endif #ifdef RT movq N, %rax @@ -331,7 +331,7 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO @@ -347,7 +347,7 @@ salq $ZBASE_SHIFT, %rax leaq (B, %rax, 1), B leaq (BO, %rax, 4), BO -#endif +#endif #if defined(LT) movq OFFSET, %rax @@ -363,7 +363,7 @@ sarq $2, %rax jle .L43 ALIGN_4 - + .L42: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 @@ -420,7 +420,7 @@ decq %rax jne .L44 ALIGN_4 - + .L50: #if defined(LT) || defined(RN) movq A, AO @@ -463,7 +463,7 @@ movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -1199,7 +1199,7 @@ decq I # i -- jg .L51 - ALIGN_4 + ALIGN_4 .L60: testq $2, M @@ -1224,7 +1224,7 @@ movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -1589,8 +1589,8 @@ salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L70: testq $1, M je .L79 @@ -1614,7 +1614,7 @@ movq KK, %rax salq $ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movsd 0 * SIZE(AO), %xmm8 movhps 2 * SIZE(AO), %xmm8 @@ -1887,7 +1887,7 @@ salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L79: #ifdef LN @@ -1922,10 +1922,10 @@ movq OFFSET, %rax addq M, %rax movq %rax, KK -#endif +#endif leaq BUFFER, BO - + #ifdef RT movq K, %rax salq $1 + ZBASE_SHIFT, %rax @@ -1938,7 +1938,7 @@ salq $ZBASE_SHIFT, %rax leaq (B, %rax, 2), B leaq (BO, %rax, 8), BO -#endif +#endif #if defined(LT) movq OFFSET, %rax @@ -1954,7 +1954,7 @@ sarq $2, %rax jle .L03 ALIGN_4 - + .L02: movaps 0 * SIZE(B), %xmm3 movaps 4 * SIZE(B), %xmm7 @@ -2038,7 +2038,7 @@ decq %rax jne .L04 ALIGN_4 - + .L10: #if defined(LT) || defined(RN) movq A, AO @@ -2083,7 +2083,7 @@ movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(BO), %xmm9 movaps 4 * SIZE(BO), %xmm11 @@ -2115,7 +2115,7 @@ andq $-8, %rax salq $4, %rax je .L15 -.L1X: +.L1X: KERNEL1(32 * 0) KERNEL2(32 * 0) KERNEL3(32 * 0) @@ -2907,7 +2907,7 @@ decq I # i -- jg .L11 - ALIGN_4 + ALIGN_4 .L20: testq $2, M @@ -2932,7 +2932,7 @@ movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movaps 0 * SIZE(AO), %xmm8 movaps 16 * SIZE(AO), %xmm10 @@ -3467,8 +3467,8 @@ salq $1 + ZBASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 - + ALIGN_4 + .L30: testq $1, M je .L39 @@ -3492,7 +3492,7 @@ movq KK, %rax salq $1 + ZBASE_SHIFT, %rax leaq (BO, %rax, 4), BO -#endif +#endif movsd 0 * SIZE(AO), %xmm8 movhps 2 * SIZE(AO), %xmm8 @@ -3948,7 +3948,7 @@ salq $ZBASE_SHIFT, %rax addq %rax, AORIG #endif - ALIGN_4 + ALIGN_4 .L39: #ifdef LN diff --git a/lapack-devel.log b/lapack-devel.log index 8243bb890..739e7aa92 100644 --- a/lapack-devel.log +++ b/lapack-devel.log @@ -6,14 +6,14 @@ Platform: BULLDOZER single thread --> LAPACK TESTING SUMMARY <-- Processing LAPACK Testing output found in the TESTING direcory -SUMMARY nb test run numerical error other error -================ =========== ================= ================ -REAL 1079349 0 (0.000%) 0 (0.000%) -DOUBLE PRECISION 1080161 0 (0.000%) 0 (0.000%) -COMPLEX 556022 0 (0.000%) 0 (0.000%) -COMPLEX16 556834 0 (0.000%) 0 (0.000%) +SUMMARY nb test run numerical error other error +================ =========== ================= ================ +REAL 1079349 0 (0.000%) 0 (0.000%) +DOUBLE PRECISION 1080161 0 (0.000%) 0 (0.000%) +COMPLEX 556022 0 (0.000%) 0 (0.000%) +COMPLEX16 556834 0 (0.000%) 0 (0.000%) ---> ALL PRECISIONS 3272366 0 (0.000%) 0 (0.000%) +--> ALL PRECISIONS 3272366 0 (0.000%) 0 (0.000%) ======================================================================================== diff --git a/lapack/getf2/getf2_k.c b/lapack/getf2/getf2_k.c index fdc4eaef9..75c258bda 100644 --- a/lapack/getf2/getf2_k.c +++ b/lapack/getf2/getf2_k.c @@ -61,7 +61,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, lda = args -> lda; ipiv = (blasint *)args -> c; offset = 0; - + if (range_n) { m -= range_n[0]; n = range_n[1] - range_n[0]; @@ -71,13 +71,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, info = 0; b = a; - + for (j = 0; j < n; j++) { len = MIN(j, m); for (i = 0; i < len; i++) { - ip = ipiv[i + offset] - 1 - offset; + ip = ipiv[i + offset] - 1 - offset; if (ip != i) { temp1 = *(b + i); temp2 = *(b + ip); @@ -85,7 +85,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, *(b + ip) = temp1; } } - + for (i = 1; i < len; i++) { b[i] -= DOTU_K(i, a + i, lda, b, 1); } diff --git a/lapack/getf2/zgetf2_k.c b/lapack/getf2/zgetf2_k.c index ae8c6fd60..9bf47bcce 100644 --- a/lapack/getf2/zgetf2_k.c +++ b/lapack/getf2/zgetf2_k.c @@ -63,7 +63,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, lda = args -> lda; ipiv = (blasint *)args -> c; offset = 0; - + if (range_n) { m -= range_n[0]; n = range_n[1] - range_n[0]; @@ -73,13 +73,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, info = 0; b = a; - + for (j = 0; j < n; j++) { len = MIN(j, m); for (i = 0; i < len; i++) { - ip = ipiv[i + offset] - 1 - offset; + ip = ipiv[i + offset] - 1 - offset; if (ip != i) { temp1 = *(b + i * 2 + 0); temp2 = *(b + i * 2 + 1); @@ -91,7 +91,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, *(b + ip * 2 + 1) = temp2; } } - + ZTRSV_NLU(len, a, lda, b, 1, sb); if (j < m) { @@ -124,7 +124,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, } if (j + 1 < m) { - SCAL_K(m - j - 1, 0, 0, temp3, temp4, + SCAL_K(m - j - 1, 0, 0, temp3, temp4, b + (j + 1) * 2, 1, NULL, 0, NULL, 0); } } else { diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index 3dbc70e9d..a76be3ba7 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -44,7 +44,7 @@ static FLOAT dm1 = -1.; double sqrt(double); //In this case, the recursive getrf_parallel may overflow the stack. -//Instead, use malloc to alloc job_t. +//Instead, use malloc to alloc job_t. #if MAX_CPU_NUMBER > GETRF_MEM_ALLOC_THRESHOLD #define USE_ALLOC_HEAP #endif @@ -123,21 +123,21 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra for (jjs = js; jjs < js + min_j; jjs += GEMM_UNROLL_N){ min_jj = js + min_j - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - + if (0 && GEMM_UNROLL_N <= 8) { - LASWP_NCOPY(min_jj, off + 1, off + k, + LASWP_NCOPY(min_jj, off + 1, off + k, c + (- off + jjs * lda) * COMPSIZE, lda, ipiv, sbb + k * (jjs - js) * COMPSIZE); } else { - LASWP_PLUS(min_jj, off + 1, off + k, ZERO, + LASWP_PLUS(min_jj, off + 1, off + k, ZERO, #ifdef COMPLEX ZERO, #endif c + (- off + jjs * lda) * COMPSIZE, lda, NULL, 0, ipiv, 1); - + GEMM_ONCOPY (k, min_jj, c + jjs * lda * COMPSIZE, lda, sbb + (jjs - js) * k * COMPSIZE); } @@ -145,13 +145,13 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra for (is = 0; is < k; is += GEMM_P) { min_i = k - is; if (min_i > GEMM_P) min_i = GEMM_P; - + TRSM_KERNEL_LT(min_i, min_jj, k, dm1, #ifdef COMPLEX ZERO, #endif sb + k * is * COMPSIZE, - sbb + (jjs - js) * k * COMPSIZE, + sbb + (jjs - js) * k * COMPSIZE, c + (is + jjs * lda) * COMPSIZE, lda, is); } } @@ -161,9 +161,9 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra for (is = 0; is < m; is += GEMM_P){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; - + GEMM_ITCOPY (k, min_i, b + is * COMPSIZE, lda, sa); - + GEMM_KERNEL_N(min_i, min_j, k, dm1, #ifdef COMPLEX ZERO, @@ -234,7 +234,7 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * c += range_m[0] * COMPSIZE; div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; - + buffer[0] = sbb; @@ -243,10 +243,10 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * } for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) { - + for (i = 0; i < args -> nthreads; i++) while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {}; - + for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ min_jj = MIN(n_to, xxx + div_n) - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; @@ -254,43 +254,43 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * if (0 && GEMM_UNROLL_N <= 8) { printf("helllo\n"); - LASWP_NCOPY(min_jj, off + 1, off + k, + LASWP_NCOPY(min_jj, off + 1, off + k, b + (- off + jjs * lda) * COMPSIZE, lda, ipiv, buffer[bufferside] + (jjs - xxx) * k * COMPSIZE); } else { - LASWP_PLUS(min_jj, off + 1, off + k, ZERO, + LASWP_PLUS(min_jj, off + 1, off + k, ZERO, #ifdef COMPLEX ZERO, #endif b + (- off + jjs * lda) * COMPSIZE, lda, NULL, 0, ipiv, 1); - - GEMM_ONCOPY (k, min_jj, b + jjs * lda * COMPSIZE, lda, + + GEMM_ONCOPY (k, min_jj, b + jjs * lda * COMPSIZE, lda, buffer[bufferside] + (jjs - xxx) * k * COMPSIZE); } for (is = 0; is < k; is += GEMM_P) { min_i = k - is; if (min_i > GEMM_P) min_i = GEMM_P; - + TRSM_KERNEL_LT(min_i, min_jj, k, dm1, #ifdef COMPLEX ZERO, #endif sb + k * is * COMPSIZE, - buffer[bufferside] + (jjs - xxx) * k * COMPSIZE, + buffer[bufferside] + (jjs - xxx) * k * COMPSIZE, b + (is + jjs * lda) * COMPSIZE, lda, is); } } - + for (i = 0; i < args -> nthreads; i++) job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; } - + flag[mypos * CACHE_LINE_SIZE] = 0; - + if (m == 0) { for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { job[mypos].working[mypos][CACHE_LINE_SIZE * xxx] = 0; @@ -301,21 +301,21 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * min_i = m - is; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; - } else + } else if (min_i > GEMM_P) { min_i = ((min_i + 1) / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1); } - + ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa); - + current = mypos; do { - + div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; - + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { - + if ((current != mypos) && (!is)) { while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {}; } @@ -323,18 +323,18 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, lda, is, xxx); - + if (is + min_i >= m) { job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; } } - + current ++; if (current >= args -> nthreads) current = 0; - + } while (current != mypos); } - + for (i = 0; i < args -> nthreads; i++) { for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {}; @@ -382,7 +382,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -390,7 +390,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif m = args -> m; @@ -408,7 +408,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, } if (m <= 0 || n <= 0) return 0; - + newarg.c = ipiv; newarg.lda = lda; @@ -428,14 +428,14 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, bk = mn; if (bk > next_bk) bk = next_bk; - + range_n_new[0] = offset; range_n_new[1] = offset + bk; - + iinfo = CNAME(args, NULL, range_n_new, sa, sb, 0); - + if (iinfo && !info) info = iinfo; - + #ifdef USE_ALLOC_HEAP job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); if(job==NULL){ @@ -449,24 +449,24 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, TRSM_ILTCOPY(bk, bk, a, lda, 0, sb); sbb = (FLOAT *)((((BLASULONG)(sb + bk * bk * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); - + is = 0; num_cpu = 0; while (is < mn) { - + width = (FORMULA1(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); if (width > mn - is - bk) width = mn - is - bk; if (width < bk) { next_bk = (FORMULA2(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N) & ~(GEMM_UNROLL_N - 1); - + if (next_bk > bk) next_bk = bk; width = next_bk; if (width > mn - is - bk) width = mn - is - bk; } - + if (num_cpu > 0) exec_blas_async_wait(num_cpu, &queue[0]); mm = m - bk - is; @@ -479,7 +479,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.n = nn; newarg.k = bk; newarg.ldb = is + offset; - + nn -= width; range_n_mine[0] = 0; @@ -489,16 +489,16 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, range_M[0] = 0; num_cpu = 0; - + while (nn > 0){ - + if (mm >= nn) { width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); if (nn < width) width = nn; nn -= width; range_N[num_cpu + 1] = range_N[num_cpu] + width; - + width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); if (mm < width) width = mm; if (nn <= 0) width = mm; @@ -517,7 +517,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, if (mm <= 0) width = nn; nn -= width; range_N[num_cpu + 1] = range_N[num_cpu] + width; - + } queue[num_cpu].mode = mode; @@ -529,13 +529,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; flag[num_cpu * CACHE_LINE_SIZE] = 1; - + num_cpu ++; } - + newarg.nthreads = num_cpu; - + if (num_cpu > 0) { for (j = 0; j < num_cpu; j++) { for (i = 0; i < num_cpu; i++) { @@ -550,20 +550,20 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, bk = mn - is; if (bk > next_bk) bk = next_bk; - + range_n_new[0] = offset + is; range_n_new[1] = offset + is + bk; if (num_cpu > 0) { queue[num_cpu - 1].next = NULL; - + exec_blas_async(0, &queue[0]); - + inner_basic_thread(&newarg, NULL, range_n_mine, sa, sbb, -1); - + iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0); - + if (iinfo && !info) info = iinfo + is; for (i = 0; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {}; @@ -577,19 +577,19 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0); if (iinfo && !info) info = iinfo + is; - + } - + } - + next_bk = init_bk; is = 0; - + while (is < mn) { - + bk = mn - is; if (bk > next_bk) bk = next_bk; - + width = (FORMULA1(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); if (width > mn - is - bk) width = mn - is - bk; @@ -598,13 +598,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, if (next_bk > bk) next_bk = bk; } - blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha, + blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha, a + (- offset + is * lda) * COMPSIZE, lda, NULL, 0, ipiv, 1, (void *)LASWP_PLUS, args -> nthreads); - + is += bk; } - + #ifdef USE_ALLOC_HEAP free(job); #endif @@ -638,7 +638,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -646,7 +646,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif m = args -> m; @@ -664,7 +664,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, } if (m <= 0 || n <= 0) return 0; - + newarg.c = ipiv; newarg.lda = lda; newarg.common = NULL; @@ -700,9 +700,9 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, range_n_new[0] = offset; range_n_new[1] = offset + bk; - + info = CNAME(args, NULL, range_n_new, sa, sb, 0); - + TRSM_ILTCOPY(bk, bk, a, lda, 0, sb); is = 0; @@ -714,7 +714,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, width = FORMULA1(m, n, is, bk, args -> nthreads); width = (width + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); - + if (width < bk) { next_bk = FORMULA2(m, n, is, bk, args -> nthreads); @@ -729,7 +729,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, width = next_bk; } - + if (width > mn - is - bk) { next_bk = mn - is - bk; width = next_bk; @@ -742,10 +742,10 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, range[0] = 0; range[1] = width; - + num_cpu = 1; nn -= width; - + newarg.a = sb; newarg.b = a + (is + is * lda) * COMPSIZE; newarg.d = (void *)flag; @@ -753,16 +753,16 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.n = n - bk - is; newarg.k = bk; newarg.ldb = is + offset; - + while (nn > 0){ - + width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu); - + nn -= width; if (nn < 0) width = width + nn; - + range[num_cpu + 1] = range[num_cpu] + width; - + queue[num_cpu].mode = mode; //queue[num_cpu].routine = inner_advanced_thread; queue[num_cpu].routine = (void *)inner_basic_thread; @@ -776,21 +776,21 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, num_cpu ++; } - + queue[num_cpu - 1].next = NULL; is += bk; - + bk = n - is; if (bk > next_bk) bk = next_bk; - + range_n_new[0] = offset + is; range_n_new[1] = offset + is + bk; - + if (num_cpu > 1) { exec_blas_async(1, &queue[1]); - + #if 0 inner_basic_thread(&newarg, NULL, &range[0], sa, sbb, 0); @@ -823,30 +823,30 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, #endif for (i = 1; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {}; - + TRSM_ILTCOPY(bk, bk, a + (is + is * lda) * COMPSIZE, lda, 0, sb); - + } else { inner_basic_thread(&newarg, NULL, &range[0], sa, sbb, -1); - + iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0); } if (iinfo && !info) info = iinfo + is; - + } - + next_bk = init_bk; bk = init_bk; - + is = 0; - + while (is < mn) { - + bk = mn - is; if (bk > next_bk) bk = next_bk; - + width = FORMULA1(m, n, is, bk, args -> nthreads); width = (width + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); @@ -867,13 +867,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, width = next_bk; } - blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha, + blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha, a + (- offset + is * lda) * COMPSIZE, lda, NULL, 0, ipiv, 1, (void *)LASWP_PLUS, args -> nthreads); - + is += bk; } - + return info; } diff --git a/lapack/getrf/getrf_parallel_omp.c b/lapack/getrf/getrf_parallel_omp.c index 6eda30a52..7e2319718 100644 --- a/lapack/getrf/getrf_parallel_omp.c +++ b/lapack/getrf/getrf_parallel_omp.c @@ -68,7 +68,7 @@ static void inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, c += range_n[0] * lda * COMPSIZE; d += range_n[0] * lda * COMPSIZE; } - + for (js = 0; js < n; js += REAL_GEMM_R) { min_j = n - js; if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R; @@ -76,32 +76,32 @@ static void inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for (jjs = js; jjs < js + min_j; jjs += GEMM_UNROLL_N){ min_jj = js + min_j - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - + #if 0 - LASWP_NCOPY(min_jj, off + 1, off + k, + LASWP_NCOPY(min_jj, off + 1, off + k, c + (- off + jjs * lda) * COMPSIZE, lda, ipiv, sb + k * (jjs - js) * COMPSIZE); #else - LASWP_PLUS(min_jj, off + 1, off + k, ZERO, + LASWP_PLUS(min_jj, off + 1, off + k, ZERO, #ifdef COMPLEX ZERO, #endif c + (- off + jjs * lda) * COMPSIZE, lda, NULL, 0, ipiv, 1); - + GEMM_ONCOPY (k, min_jj, c + jjs * lda * COMPSIZE, lda, sb + (jjs - js) * k * COMPSIZE); #endif for (is = 0; is < k; is += GEMM_P) { min_i = k - is; if (min_i > GEMM_P) min_i = GEMM_P; - + TRSM_KERNEL_LT(min_i, min_jj, k, dm1, #ifdef COMPLEX ZERO, #endif (FLOAT *)args -> a + k * is * COMPSIZE, - sb + (jjs - js) * k * COMPSIZE, + sb + (jjs - js) * k * COMPSIZE, c + (is + jjs * lda) * COMPSIZE, lda, is); } } @@ -109,9 +109,9 @@ static void inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for (is = 0; is < m; is += GEMM_P){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; - + GEMM_ITCOPY (k, min_i, b + is * COMPSIZE, lda, sa); - + GEMM_KERNEL_N(min_i, min_j, k, dm1, #ifdef COMPLEX ZERO, @@ -141,7 +141,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -149,7 +149,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif m = args -> m; @@ -167,7 +167,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, } if (m <= 0 || n <= 0) return 0; - + mn = MIN(m, n); blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); @@ -177,13 +177,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, info = GETF2(args, NULL, range_n, sa, sb, 0); return info; } - + sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); info = 0; for (j = 0; j < mn; j += blocking) { - + jb = mn - j; if (jb > blocking) jb = blocking; @@ -198,9 +198,9 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, if (iinfo && !info) info = iinfo + j; if (j + jb < n) { - + TRSM_ILTCOPY(jb, jb, offsetA + j * COMPSIZE, lda, 0, sb); - + newarg.m = m - jb - j; newarg.n = n - jb - j; newarg.k = jb; @@ -215,7 +215,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.nthreads = args -> nthreads; gemm_thread_n(mode, &newarg, NULL, NULL, (void *)inner_thread, sa, sbb, args -> nthreads); - + } } @@ -226,7 +226,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, ZERO, #endif a - (offset - j * lda) * COMPSIZE, lda, NULL, 0 , ipiv, 1); - + } return info; diff --git a/lapack/getrf/getrf_single.c b/lapack/getrf/getrf_single.c index f1818ea97..e60a16c11 100644 --- a/lapack/getrf/getrf_single.c +++ b/lapack/getrf/getrf_single.c @@ -71,7 +71,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, } if (m <= 0 || n <= 0) return 0; - + mn = MIN(m, n); blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); @@ -81,13 +81,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, info = GETF2(args, NULL, range_n, sa, sb, 0); return info; } - + sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); info = 0; for (j = 0; j < mn; j += blocking) { - + jb = mn - j; if (jb > blocking) jb = blocking; @@ -102,53 +102,53 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, if (iinfo && !info) info = iinfo + j; if (j + jb < n) { - + TRSM_ILTCOPY(jb, jb, offsetA + j * COMPSIZE, lda, 0, sb); - + for (js = j + jb; js < n; js += REAL_GEMM_R){ jmin = n - js; if (jmin > REAL_GEMM_R) jmin = REAL_GEMM_R; - + for (jjs = js; jjs < js + jmin; jjs += GEMM_UNROLL_N){ min_jj = js + jmin - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - -#if 1 - LASWP_PLUS(min_jj, j + offset + 1, j + jb + offset, ZERO, + +#if 1 + LASWP_PLUS(min_jj, j + offset + 1, j + jb + offset, ZERO, #ifdef COMPLEX ZERO, #endif a + (- offset + jjs * lda) * COMPSIZE, lda, NULL, 0 , ipiv, 1); - + GEMM_ONCOPY (jb, min_jj, a + (j + jjs * lda) * COMPSIZE, lda, sbb + jb * (jjs - js) * COMPSIZE); #else - LASWP_NCOPY(min_jj, j + offset + 1, j + jb + offset, + LASWP_NCOPY(min_jj, j + offset + 1, j + jb + offset, a + (- offset + jjs * lda) * COMPSIZE, lda, ipiv, sbb + jb * (jjs - js) * COMPSIZE); #endif - - + + for (jc = 0; jc < jb; jc += GEMM_P) { jcmin = jb - jc; if (jcmin > GEMM_P) jcmin = GEMM_P; - + TRSM_KERNEL_LT(jcmin, min_jj, jb, dm1, #ifdef COMPLEX ZERO, #endif sb + jb * jc * COMPSIZE, - sbb + jb * (jjs - js) * COMPSIZE, + sbb + jb * (jjs - js) * COMPSIZE, a + (j + jc + jjs * lda) * COMPSIZE, lda, jc); } } for (is = j + jb; is < m; is += GEMM_P){ - + imin = m - is; if (imin > GEMM_P) imin = GEMM_P; GEMM_ITCOPY (jb, imin, offsetA + is * COMPSIZE, lda, sa); - + GEMM_KERNEL_N(imin, jmin, jb, dm1, #ifdef COMPLEX ZERO, @@ -158,7 +158,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, } } } - + for (j = 0; j < mn; j += jb) { jb = MIN(mn - j, blocking); LASWP_PLUS(jb, j + jb + offset + 1, mn + offset, ZERO, @@ -166,7 +166,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, ZERO, #endif a - (offset - j * lda) * COMPSIZE, lda, NULL, 0 , ipiv, 1); - + } return info; diff --git a/lapack/getrs/getrs_parallel.c b/lapack/getrs/getrs_parallel.c index 3a7e4260a..4b589fed3 100644 --- a/lapack/getrs/getrs_parallel.c +++ b/lapack/getrs/getrs_parallel.c @@ -51,14 +51,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } #ifndef TRANS - LASWP_PLUS(n, 1, args -> m, ZERO, + LASWP_PLUS(n, 1, args -> m, ZERO, (FLOAT *)args -> b + off * args -> ldb * COMPSIZE, args -> ldb, NULL, 0, args -> c, 1); TRSM_LNLU (args, range_m, range_n, sa, sb, 0); TRSM_LNUN (args, range_m, range_n, sa, sb, 0); #else TRSM_LTUN (args, range_m, range_n, sa, sb, 0); TRSM_LTLU (args, range_m, range_n, sa, sb, 0); - LASWP_MINUS(n, 1, args -> m, ZERO, + LASWP_MINUS(n, 1, args -> m, ZERO, (FLOAT *)args -> b + off * args -> ldb * COMPSIZE, args -> ldb, NULL, 0, args -> c, -1); #endif @@ -81,7 +81,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif gemm_thread_n(mode, args, NULL, NULL, inner_thread, sa, sb, args -> nthreads); } @@ -97,7 +97,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, mode = BLAS_DOUBLE | BLAS_REAL | (1 << BLAS_TRANSA_SHIFT); #else mode = BLAS_SINGLE | BLAS_REAL | (1 << BLAS_TRANSA_SHIFT); -#endif +#endif gemm_thread_n(mode, args, NULL, NULL, inner_thread, sa, sb, args -> nthreads); } diff --git a/lapack/getrs/zgetrs_parallel.c b/lapack/getrs/zgetrs_parallel.c index b0d3fb0c2..d4abc49d2 100644 --- a/lapack/getrs/zgetrs_parallel.c +++ b/lapack/getrs/zgetrs_parallel.c @@ -104,7 +104,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif gemm_thread_n(mode, args, NULL, NULL, inner_thread, sa, sb, args -> nthreads); } diff --git a/lapack/getrs/zgetrs_single.c b/lapack/getrs/zgetrs_single.c index 3910d0e63..ee4ac81e5 100644 --- a/lapack/getrs/zgetrs_single.c +++ b/lapack/getrs/zgetrs_single.c @@ -45,11 +45,11 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, LASWP_PLUS (args -> n, 1, args -> m, ZERO, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, 1); TRSM_LNLU(args, range_m, range_n, sa, sb, 0); - TRSM_LNUN(args, range_m, range_n, sa, sb, 0); + TRSM_LNUN(args, range_m, range_n, sa, sb, 0); #elif TRANS == 2 TRSM_LTUN(args, range_m, range_n, sa, sb, 0); TRSM_LTLU(args, range_m, range_n, sa, sb, 0); - + LASWP_MINUS(args -> n, 1, args -> m, ZERO, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, -1); #elif TRANS == 3 LASWP_PLUS (args -> n, 1, args -> m, ZERO, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, 1); diff --git a/lapack/laswp/generic/Makefile b/lapack/laswp/generic/Makefile index bc9ab80bd..8675e1a5f 100644 --- a/lapack/laswp/generic/Makefile +++ b/lapack/laswp/generic/Makefile @@ -12,7 +12,7 @@ ZLASWP = ../generic/zlaswp_k.c endif LASWP_DEPS = ../generic/laswp_k_1.c ../generic/laswp_k_2.c \ - ../generic/laswp_k_4.c ../generic/laswp_k_8.c + ../generic/laswp_k_4.c ../generic/laswp_k_8.c ZLASWP_DEPS = ../generic/zlaswp_k_1.c ../generic/zlaswp_k_2.c \ ../generic/zlaswp_k_4.c diff --git a/lapack/laswp/generic/laswp_k_1.c b/lapack/laswp/generic/laswp_k_1.c index 1b0db5f8c..88648cf29 100644 --- a/lapack/laswp/generic/laswp_k_1.c +++ b/lapack/laswp/generic/laswp_k_1.c @@ -45,7 +45,7 @@ #define a2 (a1 - 1) #endif -int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ BLASLONG i, j, ip1, ip2, rows; @@ -53,7 +53,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG FLOAT *a1; FLOAT *b1, *b2; FLOAT A1, A2, B1, B2; - + a--; k1 --; @@ -64,7 +64,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG #endif if (n <= 0) return 0; - + rows = k2-k1; if (rows <=0) return 0; if (rows == 1) { @@ -72,7 +72,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG ip1 = *ipiv; a1 = a + k1 + 1; b1 = a + ip1; - + if(a1 == b1) return 0; for(j=0; j> 1); i--; @@ -136,22 +136,22 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG A2 = *a2; B1 = *b1; B2 = *b2; - + ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; - + if (b1 == a1) { if (b2 == a1) { *a1 = A2; *a2 = A1; - } else + } else if (b2 != a2) { *a2 = B2; *b2 = A2; } - } else + } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { @@ -168,11 +168,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a1 = A2; *a2 = B1; *b1 = A1; - } else + } else if (b2 == a2) { *a1 = B1; *b1 = A1; - } else + } else if (b2 == b1) { *a1 = B1; *a2 = A1; @@ -184,10 +184,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *b2 = A2; } } - + b1 = a + ip1; b2 = a + ip2; - + #ifndef MINUS a1 += 2; #else @@ -205,12 +205,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG if (b2 == a1) { *a1 = A2; *a2 = A1; - } else + } else if (b2 != a2) { *a2 = B2; *b2 = A2; } - } else + } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { @@ -227,11 +227,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a1 = A2; *a2 = B1; *b1 = A1; - } else + } else if (b2 == a2) { *a1 = B1; *b1 = A1; - } else + } else if (b2 == b1) { *a1 = B1; *a2 = A1; @@ -249,26 +249,26 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG #else a1 -= 2; #endif - + //Remain i = (rows & 1); - + if (i > 0) { ip1 = *piv; - b1 = a + ip1; + b1 = a + ip1; A1 = *a1; B1 = *b1; *a1 = B1; *b1 = A1; } - + a += lda; - + j --; } while (j > 0); } return 0; -} +} diff --git a/lapack/laswp/generic/laswp_k_2.c b/lapack/laswp/generic/laswp_k_2.c index 8a8a89bd1..93b9a2c01 100644 --- a/lapack/laswp/generic/laswp_k_2.c +++ b/lapack/laswp/generic/laswp_k_2.c @@ -47,7 +47,7 @@ #define a4 (a3 - 1) #endif -int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ BLASLONG i, j, ip1, ip2, rows; @@ -55,7 +55,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG FLOAT *a1, *a3; FLOAT *b1, *b2, *b3, *b4; FLOAT A1, A2, B1, B2, A3, A4, B3, B4; - + a--; k1 --; @@ -66,7 +66,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG #endif if (n <= 0) return 0; - + j = (n >> 1); rows = k2-k1; if (rows <=0) return 0; @@ -75,7 +75,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG ip1 = *ipiv; a1 = a + k1 + 1; b1 = a + ip1; - + if(a1 == b1) return 0; for(j=0; j 0) { do { piv = ipiv; - + #ifndef MINUS a1 = a + k1 + 1; #else a1 = a + k2; #endif - + a3 = a1 + 1 * lda; - + ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; - + b1 = a + ip1; b2 = a + ip2; - + b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; - + i = ((rows) >> 1); - + // Loop pipeline i--; @@ -137,31 +137,31 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG B2 = *b2; B3 = *b3; B4 = *b4; - + A1 = *a1; A2 = *a2; A3 = *a3; A4 = *a4; - + ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; - + if (b1 == a1) { if (b2 == a1) { *a1 = A2; *a2 = A1; *a3 = A4; *a4 = A3; - } else + } else if (b2 != a2) { *a2 = B2; *b2 = A2; *a4 = B4; *b4 = A4; } - } else + } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { @@ -186,13 +186,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a3 = A4; *a4 = B3; *b3 = A3; - } else + } else if (b2 == a2) { *a1 = B1; *b1 = A1; *a3 = B3; *b3 = A3; - } else + } else if (b2 == b1) { *a1 = B1; *a2 = A1; @@ -211,13 +211,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *b4 = A4; } } - + b1 = a + ip1; b2 = a + ip2; - + b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; - + #ifndef MINUS a1 += 2; a3 += 2; @@ -233,7 +233,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG B2 = *b2; B3 = *b3; B4 = *b4; - + A1 = *a1; A2 = *a2; A3 = *a3; @@ -245,14 +245,14 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a2 = A1; *a3 = A4; *a4 = A3; - } else + } else if (b2 != a2) { *a2 = B2; *b2 = A2; *a4 = B4; *b4 = A4; } - } else + } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { @@ -277,13 +277,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a3 = A4; *a4 = B3; *b3 = A3; - } else + } else if (b2 == a2) { *a1 = B1; *b1 = A1; *a3 = B3; *b3 = A3; - } else + } else if (b2 == b1) { *a1 = B1; *a2 = A1; @@ -310,9 +310,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a3 -= 2; #endif - //Remain + //Remain i = ((rows) & 1); - + if (i > 0) { ip1 = *piv; b1 = a + ip1; @@ -328,7 +328,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a3 = B3; *b3 = A3; } - + a += 2 * lda; j --; } while (j > 0); @@ -342,15 +342,15 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG #else a1 = a + k2; #endif - + ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; - + b1 = a + ip1; b2 = a + ip2; - + i = ((rows) >> 1); i --; @@ -359,22 +359,22 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG A2 = *a2; B1 = *b1; B2 = *b2; - + ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; - + if (b1 == a1) { if (b2 == a1) { *a1 = A2; *a2 = A1; - } else + } else if (b2 != a2) { *a2 = B2; *b2 = A2; } - } else + } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { @@ -391,11 +391,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a1 = A2; *a2 = B1; *b1 = A1; - } else + } else if (b2 == a2) { *a1 = B1; *b1 = A1; - } else + } else if (b2 == b1) { *a1 = B1; *a2 = A1; @@ -407,10 +407,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *b2 = A2; } } - + b1 = a + ip1; b2 = a + ip2; - + #ifndef MINUS a1 += 2; #else @@ -418,7 +418,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG #endif i --; } - + //Loop Ending (n=1) A1 = *a1; A2 = *a2; @@ -428,12 +428,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG if (b2 == a1) { *a1 = A2; *a2 = A1; - } else + } else if (b2 != a2) { *a2 = B2; *b2 = A2; } - } else + } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { @@ -450,11 +450,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a1 = A2; *a2 = B1; *b1 = A1; - } else + } else if (b2 == a2) { *a1 = B1; *b1 = A1; - } else + } else if (b2 == b1) { *a1 = B1; *a2 = A1; @@ -472,13 +472,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG #else a1 -= 2; #endif - + //Remain i = (rows & 1); - + if (i > 0) { ip1 = *piv; - b1 = a + ip1; + b1 = a + ip1; A1 = *a1; B1 = *b1; @@ -488,5 +488,5 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG } return 0; -} +} diff --git a/lapack/laswp/generic/laswp_k_4.c b/lapack/laswp/generic/laswp_k_4.c index 86ee949c4..5cb65170a 100644 --- a/lapack/laswp/generic/laswp_k_4.c +++ b/lapack/laswp/generic/laswp_k_4.c @@ -51,7 +51,7 @@ #define a8 (a7 - 1) #endif -int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ BLASLONG i, j, ip1, ip2, rows; @@ -61,7 +61,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG FLOAT *b5, *b6, *b7, *b8; FLOAT A1, A2, B1, B2, A3, A4, B3, B4; FLOAT A5, A6, B5, B6, A7, A8, B7, B8; - + a--; k1 --; @@ -80,7 +80,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG ip1 = *ipiv; a1 = a + k1 + 1; b1 = a + ip1; - + if(a1 == b1) return 0; for(j=0; j> 1); - + i--; //Loop pipeline //Main Loop while (i > 0) { @@ -147,12 +147,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG B6 = *b6; B7 = *b7; B8 = *b8; - + ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; - + if (b1 == a1) { if (b2 == a1) { *a1 = A2; @@ -163,7 +163,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a6 = A5; *a7 = A8; *a8 = A7; - } else + } else if (b2 != a2) { *a2 = B2; *b2 = A2; @@ -174,7 +174,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a8 = B8; *b8 = A8; } - } else + } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { @@ -215,7 +215,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a7 = A8; *a8 = B7; *b7 = A7; - } else + } else if (b2 == a2) { *a1 = B1; *b1 = A1; @@ -225,7 +225,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *b5 = A5; *a7 = B7; *b7 = A7; - } else + } else if (b2 == b1) { *a1 = B1; *a2 = A1; @@ -258,17 +258,17 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *b8 = A8; } } - + b1 = a + ip1; b2 = a + ip2; - + b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; b5 = b1 + 2 * lda; b6 = b2 + 2 * lda; b7 = b1 + 3 * lda; b8 = b2 + 3 * lda; - + #ifndef MINUS a1 += 2; a3 += 2; @@ -312,7 +312,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a6 = A5; *a7 = A8; *a8 = A7; - } else + } else if (b2 != a2) { *a2 = B2; *b2 = A2; @@ -323,7 +323,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a8 = B8; *b8 = A8; } - } else + } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { @@ -364,7 +364,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a7 = A8; *a8 = B7; *b7 = A7; - } else + } else if (b2 == a2) { *a1 = B1; *b1 = A1; @@ -374,7 +374,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *b5 = A5; *a7 = B7; *b7 = A7; - } else + } else if (b2 == b1) { *a1 = B1; *a2 = A1; @@ -420,9 +420,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a7 -= 2; #endif - //Remain + //Remain i = ((rows) & 1); - + if (i > 0) { ip1 = *piv; b1 = a + ip1; @@ -449,9 +449,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a7 = B7; *b7 = A7; } - + a += 4 * lda; - + j --; } while (j > 0); } @@ -464,20 +464,20 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG #else a1 = a + k2; #endif - + a3 = a1 + 1 * lda; ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; - + b1 = a + ip1; b2 = a + ip2; - + b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; - + i = ((rows) >> 1); i--; @@ -486,31 +486,31 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG A2 = *a2; A3 = *a3; A4 = *a4; - + B1 = *b1; B2 = *b2; B3 = *b3; B4 = *b4; - + ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; - + if (b1 == a1) { if (b2 == a1) { *a1 = A2; *a2 = A1; *a3 = A4; *a4 = A3; - } else + } else if (b2 != a2) { *a2 = B2; *b2 = A2; *a4 = B4; *b4 = A4; } - } else + } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { @@ -535,13 +535,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a3 = A4; *a4 = B3; *b3 = A3; - } else + } else if (b2 == a2) { *a1 = B1; *b1 = A1; *a3 = B3; *b3 = A3; - } else + } else if (b2 == b1) { *a1 = B1; *a2 = A1; @@ -560,13 +560,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *b4 = A4; } } - + b1 = a + ip1; b2 = a + ip2; - + b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; - + #ifndef MINUS a1 += 2; a3 += 2; @@ -576,13 +576,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG #endif i --; } - + //Loop Ending B1 = *b1; B2 = *b2; B3 = *b3; B4 = *b4; - + A1 = *a1; A2 = *a2; A3 = *a3; @@ -594,14 +594,14 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a2 = A1; *a3 = A4; *a4 = A3; - } else + } else if (b2 != a2) { *a2 = B2; *b2 = A2; *a4 = B4; *b4 = A4; } - } else + } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { @@ -626,13 +626,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a3 = A4; *a4 = B3; *b3 = A3; - } else + } else if (b2 == a2) { *a1 = B1; *b1 = A1; *a3 = B3; *b3 = A3; - } else + } else if (b2 == b1) { *a1 = B1; *a2 = A1; @@ -660,7 +660,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG #endif i = ((rows) & 1); - + if (i > 0) { ip1 = *piv; b1 = a + ip1; @@ -675,7 +675,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a3 = B3; *b3 = A3; } - + a += 2 * lda; } @@ -687,15 +687,15 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG #else a1 = a + k2; #endif - + ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; - + b1 = a + ip1; b2 = a + ip2; - + i = ((rows) >> 1); i --; @@ -704,22 +704,22 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG A2 = *a2; B1 = *b1; B2 = *b2; - + ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; - + if (b1 == a1) { if (b2 == a1) { *a1 = A2; *a2 = A1; - } else + } else if (b2 != a2) { *a2 = B2; *b2 = A2; } - } else + } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { @@ -736,11 +736,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a1 = A2; *a2 = B1; *b1 = A1; - } else + } else if (b2 == a2) { *a1 = B1; *b1 = A1; - } else + } else if (b2 == b1) { *a1 = B1; *a2 = A1; @@ -752,10 +752,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *b2 = A2; } } - + b1 = a + ip1; b2 = a + ip2; - + #ifndef MINUS a1 += 2; #else @@ -763,7 +763,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG #endif i --; } - + //Loop Ending (n=1) A1 = *a1; A2 = *a2; @@ -773,12 +773,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG if (b2 == a1) { *a1 = A2; *a2 = A1; - } else + } else if (b2 != a2) { *a2 = B2; *b2 = A2; } - } else + } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { @@ -795,11 +795,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a1 = A2; *a2 = B1; *b1 = A1; - } else + } else if (b2 == a2) { *a1 = B1; *b1 = A1; - } else + } else if (b2 == b1) { *a1 = B1; *a2 = A1; @@ -817,13 +817,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG #else a1 -= 2; #endif - + //Remain i = (rows & 1); - + if (i > 0) { ip1 = *piv; - b1 = a + ip1; + b1 = a + ip1; A1 = *a1; B1 = *b1; @@ -833,5 +833,5 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG } return 0; -} +} diff --git a/lapack/laswp/generic/laswp_k_8.c b/lapack/laswp/generic/laswp_k_8.c index e3a05dbcc..947941839 100644 --- a/lapack/laswp/generic/laswp_k_8.c +++ b/lapack/laswp/generic/laswp_k_8.c @@ -59,7 +59,7 @@ #define a16 (a15 - 1) #endif -int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ BLASLONG i, j, ip1, ip2, rows; @@ -74,7 +74,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG FLOAT A5, A6, B5, B6, A7, A8, B7, B8; FLOAT A9, A10, B9, B10, A11, A12, B11, B12; FLOAT A13, A14, B13, B14, A15, A16, B15, B16; - + a--; k1 --; @@ -93,7 +93,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG ip1 = *ipiv; a1 = a + k1 + 1; b1 = a + ip1; - + if(a1 == b1) return 0; for(j=0; j 0) { ip1 = *piv; b1 = a + ip1; @@ -697,7 +697,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b5 = b1 + 2 * lda; b7 = b1 + 3 * lda; - + b9 = b1 + 4 * lda; b11 = b1 + 5 * lda; b13 = b1 + 6 * lda; @@ -740,9 +740,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a15 = B15; *b15 = A15; } - + a += 8 * lda; - + j --; } while (j > 0); } @@ -755,19 +755,19 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG #else a1 = a + k2; #endif - + a3 = a1 + 1 * lda; a5 = a1 + 2 * lda; a7 = a1 + 3 * lda; - + ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; - + b1 = a + ip1; b2 = a + ip2; - + b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; b5 = b1 + 2 * lda; @@ -787,7 +787,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG A6 = *a6; A7 = *a7; A8 = *a8; - + B1 = *b1; B2 = *b2; B3 = *b3; @@ -796,12 +796,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG B6 = *b6; B7 = *b7; B8 = *b8; - + ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; - + if (b1 == a1) { if (b2 == a1) { *a1 = A2; @@ -812,7 +812,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a6 = A5; *a7 = A8; *a8 = A7; - } else + } else if (b2 != a2) { *a2 = B2; *b2 = A2; @@ -823,7 +823,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a8 = B8; *b8 = A8; } - } else + } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { @@ -864,7 +864,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a7 = A8; *a8 = B7; *b7 = A7; - } else + } else if (b2 == a2) { *a1 = B1; *b1 = A1; @@ -874,7 +874,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *b5 = A5; *a7 = B7; *b7 = A7; - } else + } else if (b2 == b1) { *a1 = B1; *a2 = A1; @@ -907,17 +907,17 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *b8 = A8; } } - + b1 = a + ip1; b2 = a + ip2; - + b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; b5 = b1 + 2 * lda; b6 = b2 + 2 * lda; b7 = b1 + 3 * lda; b8 = b2 + 3 * lda; - + #ifndef MINUS a1 += 2; a3 += 2; @@ -959,7 +959,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a6 = A5; *a7 = A8; *a8 = A7; - } else + } else if (b2 != a2) { *a2 = B2; *b2 = A2; @@ -970,7 +970,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a8 = B8; *b8 = A8; } - } else + } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { @@ -1011,7 +1011,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a7 = A8; *a8 = B7; *b7 = A7; - } else + } else if (b2 == a2) { *a1 = B1; *b1 = A1; @@ -1021,7 +1021,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *b5 = A5; *a7 = B7; *b7 = A7; - } else + } else if (b2 == b1) { *a1 = B1; *a2 = A1; @@ -1068,7 +1068,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG #endif i = (rows & 1); - + if (i > 0) { ip1 = *piv; b1 = a + ip1; @@ -1094,7 +1094,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a7 = B7; *b7 = A7; } - + a += 4 * lda; } @@ -1106,20 +1106,20 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG #else a1 = a + k2; #endif - + a3 = a1 + 1 * lda; ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; - + b1 = a + ip1; b2 = a + ip2; - + b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; - + i = ((rows) >> 1); i--; @@ -1128,31 +1128,31 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG A2 = *a2; A3 = *a3; A4 = *a4; - + B1 = *b1; B2 = *b2; B3 = *b3; B4 = *b4; - + ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; - + if (b1 == a1) { if (b2 == a1) { *a1 = A2; *a2 = A1; *a3 = A4; *a4 = A3; - } else + } else if (b2 != a2) { *a2 = B2; *b2 = A2; *a4 = B4; *b4 = A4; } - } else + } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { @@ -1177,13 +1177,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a3 = A4; *a4 = B3; *b3 = A3; - } else + } else if (b2 == a2) { *a1 = B1; *b1 = A1; *a3 = B3; *b3 = A3; - } else + } else if (b2 == b1) { *a1 = B1; *a2 = A1; @@ -1202,13 +1202,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *b4 = A4; } } - + b1 = a + ip1; b2 = a + ip2; - + b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; - + #ifndef MINUS a1 += 2; a3 += 2; @@ -1218,13 +1218,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG #endif i --; } - + //Loop Ending B1 = *b1; B2 = *b2; B3 = *b3; B4 = *b4; - + A1 = *a1; A2 = *a2; A3 = *a3; @@ -1236,14 +1236,14 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a2 = A1; *a3 = A4; *a4 = A3; - } else + } else if (b2 != a2) { *a2 = B2; *b2 = A2; *a4 = B4; *b4 = A4; } - } else + } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { @@ -1268,13 +1268,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a3 = A4; *a4 = B3; *b3 = A3; - } else + } else if (b2 == a2) { *a1 = B1; *b1 = A1; *a3 = B3; *b3 = A3; - } else + } else if (b2 == b1) { *a1 = B1; *a2 = A1; @@ -1302,7 +1302,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG #endif i = ((rows) & 1); - + if (i > 0) { ip1 = *piv; b1 = a + ip1; @@ -1317,7 +1317,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a3 = B3; *b3 = A3; } - + a += 2 * lda; } @@ -1329,15 +1329,15 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG #else a1 = a + k2; #endif - + ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; - + b1 = a + ip1; b2 = a + ip2; - + i = ((rows) >> 1); i --; @@ -1346,22 +1346,22 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG A2 = *a2; B1 = *b1; B2 = *b2; - + ip1 = *piv; piv += incx; ip2 = *piv; piv += incx; - + if (b1 == a1) { if (b2 == a1) { *a1 = A2; *a2 = A1; - } else + } else if (b2 != a2) { *a2 = B2; *b2 = A2; } - } else + } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { @@ -1378,11 +1378,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a1 = A2; *a2 = B1; *b1 = A1; - } else + } else if (b2 == a2) { *a1 = B1; *b1 = A1; - } else + } else if (b2 == b1) { *a1 = B1; *a2 = A1; @@ -1394,10 +1394,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *b2 = A2; } } - + b1 = a + ip1; b2 = a + ip2; - + #ifndef MINUS a1 += 2; #else @@ -1405,7 +1405,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG #endif i --; } - + //Loop Ending (n=1) A1 = *a1; A2 = *a2; @@ -1415,12 +1415,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG if (b2 == a1) { *a1 = A2; *a2 = A1; - } else + } else if (b2 != a2) { *a2 = B2; *b2 = A2; } - } else + } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { @@ -1437,11 +1437,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG *a1 = A2; *a2 = B1; *b1 = A1; - } else + } else if (b2 == a2) { *a1 = B1; *b1 = A1; - } else + } else if (b2 == b1) { *a1 = B1; *a2 = A1; @@ -1459,13 +1459,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG #else a1 -= 2; #endif - + //Remain i = (rows & 1); - + if (i > 0) { ip1 = *piv; - b1 = a + ip1; + b1 = a + ip1; A1 = *a1; B1 = *b1; @@ -1475,5 +1475,5 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG } return 0; -} +} diff --git a/lapack/laswp/generic/zlaswp_k_1.c b/lapack/laswp/generic/zlaswp_k_1.c index 7a62dd9b8..d1204778a 100644 --- a/lapack/laswp/generic/zlaswp_k_1.c +++ b/lapack/laswp/generic/zlaswp_k_1.c @@ -45,8 +45,8 @@ #define a2 (a1 - 2) #endif -int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, - FLOAT *a, BLASLONG lda, +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, + FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ BLASLONG i, j, ip1, ip2, rows; @@ -79,7 +79,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, #endif b1 = a + ip1; - + if(a1 == b1) return 0; for(j=0; j> 1); i --; //Loop pipeline @@ -152,26 +152,26 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, B2 = *(b1 + 1); B3 = *(b2 + 0); B4 = *(b2 + 1); - + ip1 = *piv * 2; piv += incx; ip2 = *piv * 2; piv += incx; - + if (b1 == a1) { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; *(a2 + 1) = A2; - } else + } else if (b2 != a2) { *(a2 + 0) = B3; *(a2 + 1) = B4; *(b2 + 0) = A3; *(b2 + 1) = A4; } - } else + } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { @@ -196,13 +196,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(a2 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; - } else + } else if (b2 == a2) { *(a1 + 0) = B1; *(a1 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; - } else + } else if (b2 == b1) { *(a1 + 0) = B1; *(a1 + 1) = B2; @@ -221,10 +221,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(b2 + 1) = A4; } } - + b1 = a + ip1; b2 = a + ip2; - + #ifndef MINUS a1 += 4; #else @@ -243,22 +243,22 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, B2 = *(b1 + 1); B3 = *(b2 + 0); B4 = *(b2 + 1); - - + + if (b1 == a1) { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; *(a2 + 1) = A2; - } else + } else if (b2 != a2) { *(a2 + 0) = B3; *(a2 + 1) = B4; *(b2 + 0) = A3; *(b2 + 1) = A4; } - } else + } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { @@ -283,13 +283,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(a2 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; - } else + } else if (b2 == a2) { *(a1 + 0) = B1; *(a1 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; - } else + } else if (b2 == b1) { *(a1 + 0) = B1; *(a1 + 1) = B2; @@ -308,8 +308,8 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(b2 + 1) = A4; } } - - + + #ifndef MINUS a1 += 4; #else @@ -318,7 +318,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, //Remain i = (rows & 1); - + if (i > 0) { ip1 = *piv * 2; b1 = a + ip1; @@ -332,13 +332,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(b1 + 0) = A1; *(b1 + 1) = A2; } - + a += lda; - + j --; } while (j > 0); } return 0; -} +} diff --git a/lapack/laswp/generic/zlaswp_k_2.c b/lapack/laswp/generic/zlaswp_k_2.c index 0fa685859..c18ab4bee 100644 --- a/lapack/laswp/generic/zlaswp_k_2.c +++ b/lapack/laswp/generic/zlaswp_k_2.c @@ -45,8 +45,8 @@ #define a2 (a1 - 2) #endif -int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, - FLOAT *a, BLASLONG lda, +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, + FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ BLASLONG i, j, ip1, ip2, rows; @@ -81,7 +81,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, #endif b1 = a + ip1; - + if(a1 == b1) return 0; for(j=0; j> 1); i--; @@ -154,7 +154,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, B2 = *(b1 + 1); B3 = *(b2 + 0); B4 = *(b2 + 1); - + B5 = *(b1 + 0 + lda); B6 = *(b1 + 1 + lda); B7 = *(b2 + 0 + lda); @@ -164,7 +164,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, piv += incx; ip2 = *piv * 2; piv += incx; - + if (b1 == a1) { if (b2 == a1) { *(a1 + 0) = A3; @@ -175,7 +175,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(a1 + 1 + lda) = A8; *(a2 + 0 + lda) = A5; *(a2 + 1 + lda) = A6; - } else + } else if (b2 != a2) { *(a2 + 0) = B3; *(a2 + 1) = B4; @@ -186,7 +186,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(b2 + 0 + lda) = A7; *(b2 + 1 + lda) = A8; } - } else + } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { @@ -227,7 +227,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(a2 + 1 + lda) = B6; *(b1 + 0 + lda) = A5; *(b1 + 1 + lda) = A6; - } else + } else if (b2 == a2) { *(a1 + 0) = B1; *(a1 + 1) = B2; @@ -237,7 +237,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(a1 + 1 + lda) = B6; *(b1 + 0 + lda) = A5; *(b1 + 1 + lda) = A6; - } else + } else if (b2 == b1) { *(a1 + 0) = B1; *(a1 + 1) = B2; @@ -270,10 +270,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(b2 + 1 + lda) = A8; } } - + b1 = a + ip1; b2 = a + ip2; - + #ifndef MINUS a1 += 4; #else @@ -296,7 +296,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, B2 = *(b1 + 1); B3 = *(b2 + 0); B4 = *(b2 + 1); - + B5 = *(b1 + 0 + lda); B6 = *(b1 + 1 + lda); B7 = *(b2 + 0 + lda); @@ -312,7 +312,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(a1 + 1 + lda) = A8; *(a2 + 0 + lda) = A5; *(a2 + 1 + lda) = A6; - } else + } else if (b2 != a2) { *(a2 + 0) = B3; *(a2 + 1) = B4; @@ -323,7 +323,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(b2 + 0 + lda) = A7; *(b2 + 1 + lda) = A8; } - } else + } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { @@ -364,7 +364,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(a2 + 1 + lda) = B6; *(b1 + 0 + lda) = A5; *(b1 + 1 + lda) = A6; - } else + } else if (b2 == a2) { *(a1 + 0) = B1; *(a1 + 1) = B2; @@ -374,7 +374,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(a1 + 1 + lda) = B6; *(b1 + 0 + lda) = A5; *(b1 + 1 + lda) = A6; - } else + } else if (b2 == b1) { *(a1 + 0) = B1; *(a1 + 1) = B2; @@ -407,9 +407,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(b2 + 1 + lda) = A8; } } - - + + #ifndef MINUS a1 += 4; #else @@ -418,7 +418,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, //Remain i = (rows & 1); - + if (i > 0) { ip1 = *piv * 2; b1 = a + ip1; @@ -440,30 +440,30 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(b1 + 0 + lda) = A3; *(b1 + 1 + lda) = A4; } - + a += 2 * lda; - + j --; } while (j > 0); } if (n & 1) { piv = ipiv; - + #ifndef MINUS a1 = a + (k1 + 1) * 2; #else a1 = a + k2 * 2; #endif - + ip1 = *piv * 2; piv += incx; ip2 = *piv * 2; piv += incx; - + b1 = a + ip1; b2 = a + ip2; - + i = (rows >> 1); i--; @@ -478,26 +478,26 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, B2 = *(b1 + 1); B3 = *(b2 + 0); B4 = *(b2 + 1); - + ip1 = *piv * 2; piv += incx; ip2 = *piv * 2; piv += incx; - + if (b1 == a1) { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; *(a2 + 1) = A2; - } else + } else if (b2 != a2) { *(a2 + 0) = B3; *(a2 + 1) = B4; *(b2 + 0) = A3; *(b2 + 1) = A4; } - } else + } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { @@ -522,13 +522,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(a2 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; - } else + } else if (b2 == a2) { *(a1 + 0) = B1; *(a1 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; - } else + } else if (b2 == b1) { *(a1 + 0) = B1; *(a1 + 1) = B2; @@ -547,10 +547,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(b2 + 1) = A4; } } - + b1 = a + ip1; b2 = a + ip2; - + #ifndef MINUS a1 += 4; #else @@ -567,21 +567,21 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, B2 = *(b1 + 1); B3 = *(b2 + 0); B4 = *(b2 + 1); - + if (b1 == a1) { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; *(a2 + 1) = A2; - } else + } else if (b2 != a2) { *(a2 + 0) = B3; *(a2 + 1) = B4; *(b2 + 0) = A3; *(b2 + 1) = A4; } - } else + } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { @@ -606,13 +606,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(a2 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; - } else + } else if (b2 == a2) { *(a1 + 0) = B1; *(a1 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; - } else + } else if (b2 == b1) { *(a1 + 0) = B1; *(a1 + 1) = B2; @@ -631,16 +631,16 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(b2 + 1) = A4; } } - + #ifndef MINUS a1 += 4; #else a1 -= 4; #endif - + //Remain i = (rows & 1); - + if (i > 0) { ip1 = *piv * 2; b1 = a + ip1; @@ -657,5 +657,5 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, } return 0; -} +} diff --git a/lapack/laswp/generic/zlaswp_k_4.c b/lapack/laswp/generic/zlaswp_k_4.c index c63a8e2e0..45e1bf01e 100644 --- a/lapack/laswp/generic/zlaswp_k_4.c +++ b/lapack/laswp/generic/zlaswp_k_4.c @@ -51,8 +51,8 @@ #define a8 (a7 - 2) #endif -int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, - FLOAT *a, BLASLONG lda, +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, + FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ BLASLONG i, j, ip1, ip2, rows; @@ -89,7 +89,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, #endif b1 = a + ip1; - + if(a1 == b1) return 0; for(j=0; j 0) { ip1 = *piv * 2; b1 = a + ip1; @@ -688,9 +688,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(b7 + 0) = A7; *(b7 + 1) = A8; } - + a += 4 * lda; - + j --; } while (j > 0); } @@ -705,18 +705,18 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, #endif a3 = a1 + lda; - + ip1 = *piv * 2; piv += incx; ip2 = *piv * 2; piv += incx; - + b1 = a + ip1; b2 = a + ip2; - + b3 = b1 + lda; b4 = b2 + lda; - + i = (rows >> 1); i--; @@ -727,7 +727,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, A2 = *(a1 + 1); A3 = *(a2 + 0); A4 = *(a2 + 1); - + A5 = *(a3 + 0); A6 = *(a3 + 1); A7 = *(a4 + 0); @@ -737,17 +737,17 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, B2 = *(b1 + 1); B3 = *(b2 + 0); B4 = *(b2 + 1); - + B5 = *(b3 + 0); B6 = *(b3 + 1); B7 = *(b4 + 0); B8 = *(b4 + 1); - + ip1 = *piv * 2; piv += incx; ip2 = *piv * 2; piv += incx; - + if (b1 == a1) { if (b2 == a1) { *(a1 + 0) = A3; @@ -758,7 +758,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(a3 + 1) = A8; *(a4 + 0) = A5; *(a4 + 1) = A6; - } else + } else if (b2 != a2) { *(a2 + 0) = B3; *(a2 + 1) = B4; @@ -769,7 +769,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(b4 + 0) = A7; *(b4 + 1) = A8; } - } else + } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { @@ -810,7 +810,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(a4 + 1) = B6; *(b3 + 0) = A5; *(b3 + 1) = A6; - } else + } else if (b2 == a2) { *(a1 + 0) = B1; *(a1 + 1) = B2; @@ -820,7 +820,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(a3 + 1) = B6; *(b3 + 0) = A5; *(b3 + 1) = A6; - } else + } else if (b2 == b1) { *(a1 + 0) = B1; *(a1 + 1) = B2; @@ -853,13 +853,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(b4 + 1) = A8; } } - + b1 = a + ip1; b2 = a + ip2; - + b3 = b1 + lda; b4 = b2 + lda; - + #ifndef MINUS a1 += 4; a3 += 4; @@ -874,7 +874,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, A2 = *(a1 + 1); A3 = *(a2 + 0); A4 = *(a2 + 1); - + A5 = *(a3 + 0); A6 = *(a3 + 1); A7 = *(a4 + 0); @@ -884,13 +884,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, B2 = *(b1 + 1); B3 = *(b2 + 0); B4 = *(b2 + 1); - + B5 = *(b3 + 0); B6 = *(b3 + 1); B7 = *(b4 + 0); B8 = *(b4 + 1); - - + + if (b1 == a1) { if (b2 == a1) { *(a1 + 0) = A3; @@ -901,7 +901,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(a3 + 1) = A8; *(a4 + 0) = A5; *(a4 + 1) = A6; - } else + } else if (b2 != a2) { *(a2 + 0) = B3; *(a2 + 1) = B4; @@ -912,7 +912,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(b4 + 0) = A7; *(b4 + 1) = A8; } - } else + } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { @@ -953,7 +953,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(a4 + 1) = B6; *(b3 + 0) = A5; *(b3 + 1) = A6; - } else + } else if (b2 == a2) { *(a1 + 0) = B1; *(a1 + 1) = B2; @@ -963,7 +963,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(a3 + 1) = B6; *(b3 + 0) = A5; *(b3 + 1) = A6; - } else + } else if (b2 == b1) { *(a1 + 0) = B1; *(a1 + 1) = B2; @@ -996,7 +996,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(b4 + 1) = A8; } } - + #ifndef MINUS a1 += 4; a3 += 4; @@ -1007,7 +1007,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, //Remain i = (rows & 1); - + if (i > 0) { ip1 = *piv * 2; @@ -1031,28 +1031,28 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(b3 + 0) = A3; *(b3 + 1) = A4; } - + a += 2 * lda; - + } if (n & 1) { piv = ipiv; - + #ifndef MINUS a1 = a + (k1 + 1) * 2; #else a1 = a + k2 * 2; #endif - + ip1 = *piv * 2; piv += incx; ip2 = *piv * 2; piv += incx; - + b1 = a + ip1; b2 = a + ip2; - + i = (rows >> 1); i--; @@ -1067,26 +1067,26 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, B2 = *(b1 + 1); B3 = *(b2 + 0); B4 = *(b2 + 1); - + ip1 = *piv * 2; piv += incx; ip2 = *piv * 2; piv += incx; - + if (b1 == a1) { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; *(a2 + 1) = A2; - } else + } else if (b2 != a2) { *(a2 + 0) = B3; *(a2 + 1) = B4; *(b2 + 0) = A3; *(b2 + 1) = A4; } - } else + } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { @@ -1111,13 +1111,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(a2 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; - } else + } else if (b2 == a2) { *(a1 + 0) = B1; *(a1 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; - } else + } else if (b2 == b1) { *(a1 + 0) = B1; *(a1 + 1) = B2; @@ -1136,10 +1136,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(b2 + 1) = A4; } } - + b1 = a + ip1; b2 = a + ip2; - + #ifndef MINUS a1 += 4; #else @@ -1156,21 +1156,21 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, B2 = *(b1 + 1); B3 = *(b2 + 0); B4 = *(b2 + 1); - + if (b1 == a1) { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; *(a2 + 0) = A1; *(a2 + 1) = A2; - } else + } else if (b2 != a2) { *(a2 + 0) = B3; *(a2 + 1) = B4; *(b2 + 0) = A3; *(b2 + 1) = A4; } - } else + } else if (b1 == a2) { if (b2 != a1) { if (b2 == a2) { @@ -1195,13 +1195,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(a2 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; - } else + } else if (b2 == a2) { *(a1 + 0) = B1; *(a1 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; - } else + } else if (b2 == b1) { *(a1 + 0) = B1; *(a1 + 1) = B2; @@ -1220,16 +1220,16 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, *(b2 + 1) = A4; } } - + #ifndef MINUS a1 += 4; #else a1 -= 4; #endif - + //Remain i = (rows & 1); - + if (i > 0) { ip1 = *piv * 2; b1 = a + ip1; @@ -1246,5 +1246,5 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, } return 0; -} +} diff --git a/lapack/lauu2/lauu2_L.c b/lapack/lauu2/lauu2_L.c index aedb966ff..ccb299ee4 100644 --- a/lapack/lauu2/lauu2_L.c +++ b/lapack/lauu2/lauu2_L.c @@ -52,7 +52,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; - + if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; @@ -61,13 +61,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, for (i = 0; i < n; i++) { SCAL_K(i + 1, 0, 0, *(a + i + i * lda), a + i, lda, NULL, 0, NULL, 0); - + if (i < n - 1) { aii = DOTU_K(n - i - 1, a + i + 1 + i * lda, 1, a + i + 1 + i * lda, 1); - + *(a + i + i * lda) += aii; - - GEMV_T(n - i - 1, i, 0, dp1, + + GEMV_T(n - i - 1, i, 0, dp1, a + (i + 1) , lda, a + (i + 1) + i * lda, 1, a + i , lda, sb); diff --git a/lapack/lauu2/lauu2_U.c b/lapack/lauu2/lauu2_U.c index f9a718641..c097c81d0 100644 --- a/lapack/lauu2/lauu2_U.c +++ b/lapack/lauu2/lauu2_U.c @@ -52,7 +52,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; - + if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; @@ -61,13 +61,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, for (i = 0; i < n; i++) { SCAL_K(i + 1, 0, 0, *(a + i + i * lda), a + i * lda, 1, NULL, 0, NULL, 0); - + if (i < n - 1) { aii = DOTU_K(n - i - 1, a + i + (i + 1)* lda, lda, a + i + (i + 1) * lda, lda); - + *(a + i + i * lda) += aii; - - GEMV_N(i, n - i - 1, 0, dp1, + + GEMV_N(i, n - i - 1, 0, dp1, a + (i + 1) * lda, lda, a + i + (i + 1) * lda, lda, a + i * lda, 1, sb); diff --git a/lapack/lauu2/zlauu2_L.c b/lapack/lauu2/zlauu2_L.c index 8a892d974..84baeca67 100644 --- a/lapack/lauu2/zlauu2_L.c +++ b/lapack/lauu2/zlauu2_L.c @@ -52,7 +52,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; - + if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; @@ -62,16 +62,16 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, SCAL_K(i + 1, 0, 0, *(a + (i + i * lda) * COMPSIZE + 0), ZERO, a + i * COMPSIZE, lda, NULL, 0, NULL, 0); - + if (i < n - 1) { temp[0] = DOTC_K(n - i - 1, a + (i + 1 + i * lda) * COMPSIZE, 1, a + (i + 1 + i * lda) * COMPSIZE, 1); GET_IMAGE(temp[1]); - + *(a + (i + i * lda) * COMPSIZE + 0) += temp[0]; *(a + (i + i * lda) * COMPSIZE + 1) = ZERO; - + GEMV_U(n - i - 1, i, 0, dp1, ZERO, a + ((i + 1) ) * COMPSIZE, lda, a + ((i + 1) + i * lda) * COMPSIZE, 1, diff --git a/lapack/lauu2/zlauu2_U.c b/lapack/lauu2/zlauu2_U.c index b20ea994a..fd0a15f1a 100644 --- a/lapack/lauu2/zlauu2_U.c +++ b/lapack/lauu2/zlauu2_U.c @@ -52,24 +52,24 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; - + if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } for (i = 0; i < n; i++) { - SCAL_K(i + 1, 0, 0, + SCAL_K(i + 1, 0, 0, *(a + (i + i * lda) * COMPSIZE + 0), ZERO, a + i * lda * COMPSIZE, 1, NULL, 0, NULL, 0); - + if (i < n - 1) { temp[0] = DOTC_K(n - i - 1, a + (i + (i + 1) * lda) * COMPSIZE, lda, a + (i + (i + 1) * lda) * COMPSIZE, lda); GET_IMAGE(temp[1]); - + *(a + (i + i * lda) * COMPSIZE + 0) += temp[0]; *(a + (i + i * lda) * COMPSIZE + 1) = ZERO; - + GEMV_O(i, n - i - 1, 0, dp1, ZERO, a + ( (i + 1) * lda) * COMPSIZE, lda, a + (i + (i + 1) * lda) * COMPSIZE, lda, diff --git a/lapack/lauum/lauum_L_parallel.c b/lapack/lauum/lauum_L_parallel.c index 8d9cde9f7..c93c4a861 100644 --- a/lapack/lauum/lauum_L_parallel.c +++ b/lapack/lauum/lauum_L_parallel.c @@ -54,7 +54,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -62,11 +62,11 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif if (args -> nthreads == 1) { - LAUUM_L_SINGLE(args, NULL, NULL, sa, sb, 0); + LAUUM_L_SINGLE(args, NULL, NULL, sa, sb, 0); return 0; } @@ -87,7 +87,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.alpha = alpha; newarg.beta = NULL; newarg.nthreads = args -> nthreads; - + blocking = (n / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); if (blocking > GEMM_Q) blocking = GEMM_Q; @@ -95,7 +95,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, bk = n - i; if (bk > blocking) bk = blocking; - + newarg.n = i; newarg.k = bk; newarg.a = a + i * COMPSIZE; @@ -118,6 +118,6 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, CNAME(&newarg, NULL, NULL, sa, sb, 0); } - + return 0; } diff --git a/lapack/lauum/lauum_L_single.c b/lapack/lauum/lauum_L_single.c index 65e8f0446..dead857fb 100644 --- a/lapack/lauum/lauum_L_single.c +++ b/lapack/lauum/lauum_L_single.c @@ -91,7 +91,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; - + if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; @@ -107,11 +107,11 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, for (j = 0; j < n; j += blocking) { bk = MIN(blocking, n - j); - + if (j > 0 ){ TRMM_ILNCOPY(bk, bk, a + (j + j * lda) * COMPSIZE, lda, 0, 0, sb); - + for (ls = 0; ls < j; ls += REAL_GEMM_R) { min_l = j - ls; if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R; @@ -127,97 +127,97 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, } else { aa = sb2; } - + for (jjs = ls; jjs < ls + min_l; jjs += GEMM_P){ min_jj = ls + min_l - jjs; if (min_jj > GEMM_P) min_jj = GEMM_P; - + GEMM_ONCOPY(bk, min_jj, a + (j + jjs * lda) * COMPSIZE, lda, sb2 + (jjs - ls) * bk * COMPSIZE); - - SYRK_KERNEL(min_i, min_jj, bk, dp1, - aa, - sb2 + (jjs - ls) * bk * COMPSIZE, - a + (ls + jjs * lda) * COMPSIZE, lda, + + SYRK_KERNEL(min_i, min_jj, bk, dp1, + aa, + sb2 + (jjs - ls) * bk * COMPSIZE, + a + (ls + jjs * lda) * COMPSIZE, lda, ls - jjs); } - + for(is = ls + min_i; is < j ; is += GEMM_P){ min_i = j - is; if (min_i > GEMM_P) min_i = GEMM_P; - + GEMM_INCOPY(bk, min_i, a + (j + is * lda)* COMPSIZE, lda, sa); - - SYRK_KERNEL(min_i, min_l, bk, dp1, - sa, - sb2, - a + (is + ls * lda) * COMPSIZE, lda, + + SYRK_KERNEL(min_i, min_l, bk, dp1, + sa, + sb2, + a + (is + ls * lda) * COMPSIZE, lda, is - ls); } - + for (ks = 0; ks < bk; ks += GEMM_P) { min_k = bk - ks; if (min_k > GEMM_P) min_k = GEMM_P; - + TRMM_KERNEL(min_k, min_l, bk, dp1, #ifdef COMPLEX ZERO, #endif sb + ks * bk * COMPSIZE, sb2, - a + (ks + j + ls * lda) * COMPSIZE, lda, ks); + a + (ks + j + ls * lda) * COMPSIZE, lda, ks); } #else min_i = j - ls; if (min_i > GEMM_P) min_i = GEMM_P; - + GEMM_INCOPY(bk, min_i, a + (j + ls * lda)* COMPSIZE, lda, sa); - + for (jjs = ls; jjs < ls + min_l; jjs += GEMM_P){ min_jj = ls + min_l - jjs; if (min_jj > GEMM_P) min_jj = GEMM_P; - + GEMM_ONCOPY(bk, min_jj, a + (j + jjs * lda) * COMPSIZE, lda, sb2 + (jjs - ls) * bk * COMPSIZE); - - SYRK_KERNEL(min_i, min_jj, bk, dp1, - sa, - sb2 + (jjs - ls) * bk * COMPSIZE, - a + (ls + jjs * lda) * COMPSIZE, lda, + + SYRK_KERNEL(min_i, min_jj, bk, dp1, + sa, + sb2 + (jjs - ls) * bk * COMPSIZE, + a + (ls + jjs * lda) * COMPSIZE, lda, ls - jjs); } - + for(is = ls + min_i; is < j ; is += GEMM_P){ min_i = j - is; if (min_i > GEMM_P) min_i = GEMM_P; - + GEMM_INCOPY(bk, min_i, a + (j + is * lda)* COMPSIZE, lda, sa); - - SYRK_KERNEL(min_i, min_l, bk, dp1, - sa, - sb2, - a + (is + ls * lda) * COMPSIZE, lda, + + SYRK_KERNEL(min_i, min_l, bk, dp1, + sa, + sb2, + a + (is + ls * lda) * COMPSIZE, lda, is - ls); } for (ks = 0; ks < bk; ks += GEMM_P) { min_k = bk - ks; if (min_k > GEMM_P) min_k = GEMM_P; - + TRMM_KERNEL(min_k, min_l, bk, dp1, #ifdef COMPLEX ZERO, #endif sb + ks * bk * COMPSIZE, sb2, - a + (ks + j + ls * lda) * COMPSIZE, lda, ks); + a + (ks + j + ls * lda) * COMPSIZE, lda, ks); } #endif } } - + if (!range_n) { range_N[0] = j; range_N[1] = j + bk; @@ -225,9 +225,9 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, range_N[0] = range_n[0] + j; range_N[1] = range_n[0] + j + bk; } - + CNAME(args, NULL, range_N, sa, sb, 0); - + } return 0; diff --git a/lapack/lauum/lauum_U_parallel.c b/lapack/lauum/lauum_U_parallel.c index d68d12bd3..e4a2792b2 100644 --- a/lapack/lauum/lauum_U_parallel.c +++ b/lapack/lauum/lauum_U_parallel.c @@ -54,7 +54,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -62,11 +62,11 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif if (args -> nthreads == 1) { - LAUUM_U_SINGLE(args, NULL, NULL, sa, sb, 0); + LAUUM_U_SINGLE(args, NULL, NULL, sa, sb, 0); return 0; } @@ -95,7 +95,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, bk = n - i; if (bk > blocking) bk = blocking; - + newarg.n = i; newarg.k = bk; newarg.a = a + ( i * lda) * COMPSIZE; @@ -118,6 +118,6 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, CNAME(&newarg, NULL, NULL, sa, sb, 0); } - + return 0; } diff --git a/lapack/lauum/lauum_U_single.c b/lapack/lauum/lauum_U_single.c index 14cf0ad2b..1ce62c0dc 100644 --- a/lapack/lauum/lauum_U_single.c +++ b/lapack/lauum/lauum_U_single.c @@ -91,7 +91,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; - + if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; @@ -117,74 +117,74 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, min_l = j - ls; #if 0 - - + + if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R; min_i = ls + min_l; if (min_i > GEMM_P) min_i = GEMM_P; - + if (ls > 0) { GEMM_ITCOPY(bk, min_i, a + (j * lda) * COMPSIZE, lda, sa); aa = sa; } else { aa = sb2; } - + for (jjs = ls; jjs < ls + min_l; jjs += GEMM_P){ min_jj = ls + min_l - jjs; if (min_jj > GEMM_P) min_jj = GEMM_P; - + GEMM_OTCOPY(bk, min_jj, a + (jjs + j * lda) * COMPSIZE, lda, sb2 + (jjs - ls) * bk * COMPSIZE); - - SYRK_KERNEL(min_i, min_jj, bk, dp1, - aa, - sb2 + (jjs - ls) * bk * COMPSIZE, + + SYRK_KERNEL(min_i, min_jj, bk, dp1, + aa, + sb2 + (jjs - ls) * bk * COMPSIZE, a + (jjs * lda) * COMPSIZE, lda, - jjs); } - + if (ls + REAL_GEMM_R >= j ) { for (ks = 0; ks < bk; ks += GEMM_P) { min_k = bk - ks; if (min_k > GEMM_P) min_k = GEMM_P; - + TRMM_KERNEL(min_i, min_k, bk, dp1, #ifdef COMPLEX ZERO, #endif aa, sb + ks * bk * COMPSIZE, - a + ((ks + j) * lda) * COMPSIZE, lda, -ks); + a + ((ks + j) * lda) * COMPSIZE, lda, -ks); } } - + for(is = min_i; is < ls + min_l ; is += GEMM_P){ min_i = ls + min_l - is; if (min_i > GEMM_P) min_i = GEMM_P; - + if (is < ls) { GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa); aa = sa; } else { aa = sb2 + (is - ls) * bk * COMPSIZE; } - - SYRK_KERNEL(min_i, min_l, bk, dp1, - aa, - sb2, + + SYRK_KERNEL(min_i, min_l, bk, dp1, + aa, + sb2, a + (is + ls * lda) * COMPSIZE, lda, is - ls); - + if (ls + REAL_GEMM_R >= j ) { for (ks = 0; ks < bk; ks += GEMM_P) { min_k = bk - ks; if (min_k > GEMM_P) min_k = GEMM_P; - + TRMM_KERNEL(min_i, min_k, bk, dp1, #ifdef COMPLEX ZERO, #endif aa, sb + ks * bk * COMPSIZE, - a + (is + (ks + j) * lda) * COMPSIZE, lda, -ks); + a + (is + (ks + j) * lda) * COMPSIZE, lda, -ks); } } } @@ -198,12 +198,12 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, for (jjs = ls; jjs < ls + min_l; jjs += GEMM_P){ min_jj = ls + min_l - jjs; if (min_jj > GEMM_P) min_jj = GEMM_P; - + GEMM_OTCOPY(bk, min_jj, a + (jjs + j * lda) * COMPSIZE, lda, sb2 + (jjs - ls) * bk * COMPSIZE); - - SYRK_KERNEL(min_i, min_jj, bk, dp1, - sa, - sb2 + (jjs - ls) * bk * COMPSIZE, + + SYRK_KERNEL(min_i, min_jj, bk, dp1, + sa, + sb2 + (jjs - ls) * bk * COMPSIZE, a + (jjs * lda) * COMPSIZE, lda, - jjs); } @@ -211,40 +211,40 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, for (ks = 0; ks < bk; ks += GEMM_P) { min_k = bk - ks; if (min_k > GEMM_P) min_k = GEMM_P; - + TRMM_KERNEL(min_i, min_k, bk, dp1, #ifdef COMPLEX ZERO, #endif sa, sb + ks * bk * COMPSIZE, - a + ((ks + j) * lda) * COMPSIZE, lda, -ks); + a + ((ks + j) * lda) * COMPSIZE, lda, -ks); } } for(is = min_i; is < ls + min_l ; is += GEMM_P){ min_i = ls + min_l - is; if (min_i > GEMM_P) min_i = GEMM_P; - + GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa); - - SYRK_KERNEL(min_i, min_l, bk, dp1, - sa, - sb2, + + SYRK_KERNEL(min_i, min_l, bk, dp1, + sa, + sb2, a + (is + ls * lda) * COMPSIZE, lda, is - ls); - + if (ls + REAL_GEMM_R >= j ) { for (ks = 0; ks < bk; ks += GEMM_P) { min_k = bk - ks; if (min_k > GEMM_P) min_k = GEMM_P; - + TRMM_KERNEL(min_i, min_k, bk, dp1, #ifdef COMPLEX ZERO, #endif sa, sb + ks * bk * COMPSIZE, - a + (is + (ks + j) * lda) * COMPSIZE, lda, -ks); + a + (is + (ks + j) * lda) * COMPSIZE, lda, -ks); } } } @@ -259,7 +259,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, range_N[0] = range_n[0] + j; range_N[1] = range_n[0] + j + bk; } - + CNAME(args, NULL, range_N, sa, sb, 0); } diff --git a/lapack/potf2/potf2_L.c b/lapack/potf2/potf2_L.c index 23aa97c51..8cd094ac2 100644 --- a/lapack/potf2/potf2_L.c +++ b/lapack/potf2/potf2_L.c @@ -59,7 +59,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; - + if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; @@ -81,11 +81,11 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, i = n - j - 1; if (i > 0) { - GEMV_N(i, j, 0, dm1, + GEMV_N(i, j, 0, dm1, a + j + 1, lda, a + j, lda, aoffset + j + 1, 1, sb); - + SCAL_K(i, 0, 0, dp1 / ajj, aoffset + j + 1, 1, NULL, 0, NULL, 0); } diff --git a/lapack/potf2/potf2_U.c b/lapack/potf2/potf2_U.c index 755bf8d51..9f908c14c 100644 --- a/lapack/potf2/potf2_U.c +++ b/lapack/potf2/potf2_U.c @@ -58,7 +58,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; - + if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; @@ -78,11 +78,11 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, i = n - j - 1; if (i > 0) { - GEMV_T(j, i, 0, dm1, + GEMV_T(j, i, 0, dm1, a + lda, lda, a, 1, a + j + lda, lda, sb); - + SCAL_K(i, 0, 0, dp1 / ajj, a + j + lda, lda, NULL, 0, NULL, 0); } diff --git a/lapack/potf2/zpotf2_L.c b/lapack/potf2/zpotf2_L.c index 8ce0d4e07..33e9b6044 100644 --- a/lapack/potf2/zpotf2_L.c +++ b/lapack/potf2/zpotf2_L.c @@ -58,7 +58,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; - + if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; @@ -89,7 +89,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, a + (j + 1) * 2, lda, a + j * 2, lda, aoffset + (j + 1) * 2, 1, sb); - + SCAL_K(i, 0, 0, ONE / ajj[0], ZERO, aoffset + (j + 1) * 2, 1, NULL, 0, NULL, 0); } diff --git a/lapack/potf2/zpotf2_U.c b/lapack/potf2/zpotf2_U.c index c1f5156aa..e0ccd461c 100644 --- a/lapack/potf2/zpotf2_U.c +++ b/lapack/potf2/zpotf2_U.c @@ -57,7 +57,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; - + if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; @@ -68,7 +68,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, ajj[0] = DOTC_K(j, a, 1, a, 1); GET_IMAGE(ajj[1]); - ajj[0] = *(a + j * 2) - ajj[0]; + ajj[0] = *(a + j * 2) - ajj[0]; if (ajj[0] <= 0){ *(a + j * 2 + 0) = ajj[0]; @@ -87,7 +87,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, a + lda * 2, lda, a, 1, a + (j + lda) * 2, lda, sb); - + SCAL_K(i, 0, 0, ONE / ajj[0], ZERO, a + (j + lda) * 2, lda, NULL, 0, NULL, 0); } diff --git a/lapack/potrf/potrf_L_parallel.c b/lapack/potrf/potrf_L_parallel.c index 1ebcad82f..52a383a15 100644 --- a/lapack/potrf/potrf_L_parallel.c +++ b/lapack/potrf/potrf_L_parallel.c @@ -55,7 +55,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -63,11 +63,11 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif if (args -> nthreads == 1) { - info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0); + info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0); return info; } @@ -91,7 +91,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, blocking = (n / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); if (blocking > GEMM_Q) blocking = GEMM_Q; - + for (i = 0; i < n; i += blocking) { bk = n - i; if (bk > blocking) bk = blocking; @@ -108,15 +108,15 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.n = bk; newarg.a = a + (i + i * lda) * COMPSIZE; newarg.b = a + (i + bk + i * lda) * COMPSIZE; - + gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO, &newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads); - + newarg.n = n - i - bk; newarg.k = bk; newarg.a = a + (i + bk + i * lda) * COMPSIZE; newarg.c = a + (i + bk + (i + bk) * lda) * COMPSIZE; - + #ifndef USE_SIMPLE_THREADED_LEVEL3 HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0); #else diff --git a/lapack/potrf/potrf_L_single.c b/lapack/potrf/potrf_L_single.c index d6d143623..0edadf321 100644 --- a/lapack/potrf/potrf_L_single.c +++ b/lapack/potrf/potrf_L_single.c @@ -100,7 +100,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; - + if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; @@ -129,7 +129,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, if (info) return info + j; if (n - j - bk > 0) { - + TRSM_OLTCOPY(bk, bk, a + (j + j * lda) * COMPSIZE, lda, 0, sb); /* First tile */ @@ -147,9 +147,9 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, } else { aa = sa; } - + GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, aa); - + TRSM_KERNEL(min_i, bk, bk, dm1, #ifdef COMPLEX ZERO, @@ -157,7 +157,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, aa, sb, a + (is + j * lda) * COMPSIZE, lda, 0); - + SYRK_KERNEL_L(min_i, min_j, bk, dm1, aa, sb2, @@ -172,7 +172,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, #ifdef COMPLEX ZERO, #endif - + sa, sb, a + (is + j * lda) * COMPSIZE, lda, 0); @@ -188,17 +188,17 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, is - j - bk); #endif } - + for(js = j + bk + min_j; js < n; js += REAL_GEMM_R){ min_j = n - js; if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R; GEMM_OTCOPY(bk, min_j, a + (js + j * lda) * COMPSIZE, lda, sb2); - + for (is = js; is < n; is += GEMM_P) { min_i = n - is; if (min_i > GEMM_P) min_i = GEMM_P; - + #ifdef SHARED_ARRAY if (is + min_i < js + min_j) { @@ -207,7 +207,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa); aa = sa; } - + SYRK_KERNEL_L(min_i, min_j, bk, dm1, aa, sb2, @@ -217,7 +217,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, #else GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa); - + SYRK_KERNEL_L(min_i, min_j, bk, dm1, sa, sb2, @@ -229,7 +229,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, } } - + } return 0; diff --git a/lapack/potrf/potrf_U_parallel.c b/lapack/potrf/potrf_U_parallel.c index 31da14101..d9b7a8818 100644 --- a/lapack/potrf/potrf_U_parallel.c +++ b/lapack/potrf/potrf_U_parallel.c @@ -55,7 +55,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -63,11 +63,11 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif if (args -> nthreads == 1) { - info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0); + info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0); return info; } @@ -91,7 +91,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, blocking = (n / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); if (blocking > GEMM_Q) blocking = GEMM_Q; - + for (i = 0; i < n; i += blocking) { bk = n - i; if (bk > blocking) bk = blocking; @@ -108,15 +108,15 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.n = n - i - bk; newarg.a = a + (i + i * lda) * COMPSIZE; newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; - + gemm_thread_n(mode | BLAS_TRANSA_T, &newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads); - + newarg.n = n - i - bk; newarg.k = bk; newarg.a = a + ( i + (i + bk) * lda) * COMPSIZE; newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE; - + #ifndef USE_SIMPLE_THREADED_LEVEL3 HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0); #else diff --git a/lapack/potrf/potrf_U_single.c b/lapack/potrf/potrf_U_single.c index aa445c527..7bdeb494d 100644 --- a/lapack/potrf/potrf_U_single.c +++ b/lapack/potrf/potrf_U_single.c @@ -91,7 +91,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, #ifdef SHARED_ARRAY FLOAT *aa; #endif - + FLOAT *sb2 = (FLOAT *)((((BLASLONG)sb + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); @@ -109,14 +109,14 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, info = POTF2_U(args, NULL, range_n, sa, sb, 0); return info; } - + blocking = GEMM_Q; if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4; - + for (j = 0; j < n; j += blocking) { bk = n - j; if (bk > blocking) bk = blocking; - + if (!range_n) { range_N[0] = j; range_N[1] = j + bk; @@ -124,29 +124,29 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, range_N[0] = range_n[0] + j; range_N[1] = range_n[0] + j + bk; } - + info = CNAME(args, NULL, range_N, sa, sb, 0); if (info) return info + j; - + if (n - j - bk > 0) { - + TRSM_IUNCOPY(bk, bk, a + (j + j * lda) * COMPSIZE, lda, 0, sb); - + for(js = j + bk; js < n; js += REAL_GEMM_R) { min_j = n - js; if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R; - + for(jjs = js; jjs < js + min_j; jjs += GEMM_UNROLL_N){ min_jj = min_j + js - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - + GEMM_ONCOPY(bk, min_jj, a + (j + jjs * lda) * COMPSIZE, lda, sb2 + bk * (jjs - js) * COMPSIZE); - + for (is = 0; is < bk; is += GEMM_P) { min_i = bk - is; if (min_i > GEMM_P) min_i = GEMM_P; - - TRSM_KERNEL (min_i, min_jj, bk, dm1, + + TRSM_KERNEL (min_i, min_jj, bk, dm1, #ifdef COMPLEX ZERO, #endif @@ -158,14 +158,14 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, for (is = j + bk; is < js + min_j; is += min_i) { min_i = js + min_j - is; - + if (min_i >= GEMM_P * 2) { min_i = GEMM_P; - } else + } else if (min_i > GEMM_P) { min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); } - + #ifdef SHARED_ARRAY if ((is >= js) && (is + min_i <= js + min_j)) { aa = sb2 + bk * (is - js) * COMPSIZE; @@ -176,18 +176,18 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, #else GEMM_INCOPY(bk, min_i, a + (j + is * lda) * COMPSIZE, lda, sa); #endif - + SYRK_KERNEL_U(min_i, min_j, bk, - dm1, + dm1, SA, sb2, a + (is + js * lda) * COMPSIZE, lda, is - js); - + } } } - + } - + return 0; } diff --git a/lapack/potrf/potrf_parallel.c b/lapack/potrf/potrf_parallel.c index 11f7f533c..c3a7ced2c 100644 --- a/lapack/potrf/potrf_parallel.c +++ b/lapack/potrf/potrf_parallel.c @@ -42,7 +42,7 @@ #ifndef USE_SIMPLE_THREADED_LEVEL3 //The array of job_t may overflow the stack. -//Instead, use malloc to alloc job_t. +//Instead, use malloc to alloc job_t. #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD #define USE_ALLOC_HEAP #endif @@ -189,19 +189,19 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for (i = 1; i < DIVIDE_RATE; i++) { buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE; } - + #ifndef LOWER TRSM_IUNCOPY(k, k, (FLOAT *)S, lda, 0, sb); #else TRSM_OLTCOPY(k, k, (FLOAT *)S, lda, 0, sb); #endif - + for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) { - + for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){ - + min_jj = MIN(m_to, xxx + div_n) - jjs; - + #ifndef LOWER if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; #else @@ -211,7 +211,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #ifndef LOWER OCOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE); - TRSM_KERNEL (k, min_jj, k, dm1, + TRSM_KERNEL (k, min_jj, k, dm1, #ifdef COMPLEX ZERO, #endif @@ -230,7 +230,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, a + jjs * COMPSIZE, lda, 0); #endif } - + #ifndef LOWER for (i = 0; i <= mypos; i++) job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; @@ -238,25 +238,25 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for (i = mypos; i < args -> nthreads; i++) job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; #endif - + WMB; } - + min_i = m_to - m_from; - + if (min_i >= GEMM_P * 2) { min_i = GEMM_P; - } else + } else if (min_i > GEMM_P) { min_i = ((min_i + 1) / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); } - + #ifndef LOWER ICOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa); #else OCOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa); #endif - + current = mypos; #ifndef LOWER @@ -266,47 +266,47 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #endif { div_n = ((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); - + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { - + /* thread has to wait */ if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; - + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, lda, m_from, xxx); - + if (m_from + min_i >= m_to) { job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; WMB; } } - + #ifndef LOWER current ++; #else current --; #endif } - + for(is = m_from + min_i; is < m_to; is += min_i){ min_i = m_to - is; - + if (min_i >= GEMM_P * 2) { min_i = GEMM_P; - } else + } else if (min_i > GEMM_P) { min_i = ((min_i + 1) / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); } - + #ifndef LOWER ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa); #else OCOPY_OPERATION(k, min_i, a, lda, 0, is, sa); #endif - + current = mypos; - + #ifndef LOWER while (current < args -> nthreads) #else @@ -314,18 +314,18 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #endif { div_n = ((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); - + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { - + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, lda, is, xxx); - + if (is + min_i >= m_to) { job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; WMB; } - } + } #ifndef LOWER current ++; #else @@ -333,7 +333,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #endif } } - + for (i = 0; i < args -> nthreads; i++) { if (i != mypos) { for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { @@ -341,7 +341,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } } } - + return 0; } @@ -378,7 +378,7 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ #else mode = BLAS_SINGLE | BLAS_REAL; mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; -#endif +#endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -389,7 +389,7 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ #else mode = BLAS_SINGLE | BLAS_COMPLEX; mask = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1; -#endif +#endif #endif newarg.m = args -> m; @@ -409,7 +409,7 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ #endif newarg.common = (void *)job; - + n_from = 0; n_to = args -> m; @@ -424,17 +424,17 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ dnum = (double)n * (double)n /(double)nthreads; while (i < n){ - + if (nthreads - num_cpu > 1) { - + double di = (double)i; - + width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask); - + if (num_cpu == 0) width = n - ((n - width) & ~mask); - + if ((width > n - i) || (width < mask)) width = n - i; - + } else { width = n - i; } @@ -449,7 +449,7 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - + num_cpu ++; i += width; } @@ -466,21 +466,21 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ dnum = (double)n * (double)n /(double)nthreads; while (i < n){ - + if (nthreads - num_cpu > 1) { - + double di = (double)i; - + width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask); - + if ((width > n - i) || (width < mask)) width = n - i; - + } else { width = n - i; } range[num_cpu + 1] = range[num_cpu] + width; - + queue[num_cpu].mode = mode; queue[num_cpu].routine = inner_thread; queue[num_cpu].args = &newarg; @@ -489,7 +489,7 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; - + num_cpu ++; i += width; } @@ -507,14 +507,14 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ } } } - + queue[0].sa = sa; queue[0].sb = sb; queue[num_cpu - 1].next = NULL; - + exec_blas(num_cpu, queue); } - + #ifdef USE_ALLOC_HEAP free(job); #endif @@ -540,7 +540,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -548,14 +548,14 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif if (args -> nthreads == 1) { #ifndef LOWER - info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0); + info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0); #else - info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0); + info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0); #endif return info; } @@ -584,7 +584,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, blocking = (n / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); if (blocking > GEMM_Q) blocking = GEMM_Q; - + for (i = 0; i < n; i += blocking) { bk = n - i; if (bk > blocking) bk = blocking; @@ -643,7 +643,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.k = bk; newarg.a = a + (i + bk + i * lda) * COMPSIZE; newarg.c = a + (i + bk + (i + bk) * lda) * COMPSIZE; - + #if 0 HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0); #else diff --git a/lapack/trti2/trti2_L.c b/lapack/trti2/trti2_L.c index 47fb53d09..f1c0ddf31 100644 --- a/lapack/trti2/trti2_L.c +++ b/lapack/trti2/trti2_L.c @@ -56,7 +56,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; - + if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; @@ -77,7 +77,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, 1, sb); SCAL_K(n - j - 1, 0, 0, - -ajj, + -ajj, a + (j + 1) + j * lda, 1, NULL, 0, NULL, 0); } diff --git a/lapack/trti2/trti2_U.c b/lapack/trti2/trti2_U.c index f43cecdf2..376be731f 100644 --- a/lapack/trti2/trti2_U.c +++ b/lapack/trti2/trti2_U.c @@ -56,7 +56,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; - + if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; @@ -72,12 +72,12 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, #endif TRMV (j, - a , lda, + a , lda, a + j * lda, 1, sb); - SCAL_K(j, 0, 0, - -ajj, + SCAL_K(j, 0, 0, + -ajj, a + j * lda, 1, NULL, 0, NULL, 0); diff --git a/lapack/trti2/ztrti2_L.c b/lapack/trti2/ztrti2_L.c index fd19be284..819bff261 100644 --- a/lapack/trti2/ztrti2_L.c +++ b/lapack/trti2/ztrti2_L.c @@ -59,7 +59,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; - + if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; @@ -92,9 +92,9 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, ZTRMV (n - j - 1, a + ((j + 1) + (j + 1) * lda) * COMPSIZE, lda, - a + ((j + 1) + j * lda) * COMPSIZE, 1, + a + ((j + 1) + j * lda) * COMPSIZE, 1, sb); - + SCAL_K(n - j - 1, 0, 0, -ajj_r, -ajj_i, a + ((j + 1) + j * lda) * COMPSIZE, 1, diff --git a/lapack/trti2/ztrti2_U.c b/lapack/trti2/ztrti2_U.c index d85b327eb..972329acd 100644 --- a/lapack/trti2/ztrti2_U.c +++ b/lapack/trti2/ztrti2_U.c @@ -59,7 +59,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; - + if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; @@ -92,15 +92,15 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, #endif ZTRMV (j, - a , lda, + a , lda, a + j * lda * COMPSIZE, 1, sb); - - SCAL_K(j, 0, 0, + + SCAL_K(j, 0, 0, -ajj_r, -ajj_i, a + j * lda * COMPSIZE, 1, NULL, 0, NULL, 0); - + } return 0; diff --git a/lapack/trtri/trtri_L_parallel.c b/lapack/trtri/trtri_L_parallel.c index 5969eb671..5dc60b862 100644 --- a/lapack/trtri/trtri_L_parallel.c +++ b/lapack/trtri/trtri_L_parallel.c @@ -67,7 +67,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -75,7 +75,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif n = args -> n; @@ -99,7 +99,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, for (i = start_i; i >= 0; i -= blocking) { bk = n - i; if (bk > blocking) bk = blocking; - + range_N[0] = i; range_N[1] = i + bk; @@ -124,7 +124,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.a = a + (i + i * lda) * COMPSIZE; CNAME (&newarg, NULL, NULL, sa, sb, 0); - + newarg.m = n - bk - i; newarg.n = i; newarg.k = bk; diff --git a/lapack/trtri/trtri_U_parallel.c b/lapack/trtri/trtri_U_parallel.c index 8761a40c2..fc48a33f1 100644 --- a/lapack/trtri/trtri_U_parallel.c +++ b/lapack/trtri/trtri_U_parallel.c @@ -67,7 +67,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; -#endif +#endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; @@ -75,7 +75,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif +#endif #endif n = args -> n; @@ -120,7 +120,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.a = a + (i + i * lda) * COMPSIZE; CNAME (&newarg, NULL, NULL, sa, sb, 0); - + newarg.m = i; newarg.n = n - i - bk; newarg.k = bk; @@ -142,6 +142,6 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, gemm_thread_n(mode, &newarg, NULL, NULL, TRMM, sa, sb, args -> nthreads); } - + return 0; } diff --git a/param.h b/param.h index 94a2d1115..71af5c664 100644 --- a/param.h +++ b/param.h @@ -13,19 +13,19 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ @@ -196,7 +196,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DGEMM_DEFAULT_P 224 #define QGEMM_DEFAULT_P 112 #define CGEMM_DEFAULT_P 224 -#define ZGEMM_DEFAULT_P 112 +#define ZGEMM_DEFAULT_P 112 #define XGEMM_DEFAULT_P 56 #define SGEMM_DEFAULT_Q 224 @@ -274,7 +274,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #define QGEMM_DEFAULT_P 112 #define CGEMM_DEFAULT_P 224 -#define ZGEMM_DEFAULT_P 112 +#define ZGEMM_DEFAULT_P 112 #define XGEMM_DEFAULT_P 56 #if defined(ARCH_X86_64) @@ -351,7 +351,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #define SGEMM_DEFAULT_P 448 #define DGEMM_DEFAULT_P 480 -#define ZGEMM_DEFAULT_P 112 +#define ZGEMM_DEFAULT_P 112 #define CGEMM_DEFAULT_P 224 #endif #define QGEMM_DEFAULT_P 112 @@ -1148,7 +1148,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_Q 384 #define DGEMM_DEFAULT_Q 256 #define QGEMM_DEFAULT_Q 128 -#define CGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 256 #define ZGEMM_DEFAULT_Q 192 #define XGEMM_DEFAULT_Q 128 @@ -1225,7 +1225,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_Q 256 #define DGEMM_DEFAULT_Q 256 #define QGEMM_DEFAULT_Q 128 -#define CGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 256 #define ZGEMM_DEFAULT_Q 192 #define XGEMM_DEFAULT_Q 128 @@ -1242,7 +1242,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #define DGEMM_DEFAULT_Q 256 #endif -#define CGEMM_DEFAULT_Q 192 +#define CGEMM_DEFAULT_Q 192 #define ZGEMM_DEFAULT_Q 128 #define SGEMM_DEFAULT_R sgemm_r @@ -1826,7 +1826,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif -#ifdef LOONGSON3A +#ifdef LOONGSON3A ////Copy from SICORTEX #define SNUMOPT 2 #define DNUMOPT 2 @@ -1848,7 +1848,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 64 -#define DGEMM_DEFAULT_P 44 +#define DGEMM_DEFAULT_P 44 #define CGEMM_DEFAULT_P 64 #define ZGEMM_DEFAULT_P 32 @@ -1857,8 +1857,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 80 -#define SGEMM_DEFAULT_R 640 -#define DGEMM_DEFAULT_R dgemm_r +#define SGEMM_DEFAULT_R 640 +#define DGEMM_DEFAULT_R dgemm_r #define CGEMM_DEFAULT_R 640 #define ZGEMM_DEFAULT_R 640 @@ -1899,7 +1899,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_Q 64 #define SGEMM_DEFAULT_R 512 -#define DGEMM_DEFAULT_R 512 +#define DGEMM_DEFAULT_R 512 #define CGEMM_DEFAULT_R 512 #define ZGEMM_DEFAULT_R 512 diff --git a/reference/Makefile b/reference/Makefile index d6368dcda..fb52c860d 100644 --- a/reference/Makefile +++ b/reference/Makefile @@ -37,7 +37,7 @@ SBLAS2OBJS = \ SBLAS3OBJS = \ sgemmf.$(SUFFIX) ssymmf.$(SUFFIX) strmmf.$(SUFFIX) \ - strsmf.$(SUFFIX) ssyrkf.$(SUFFIX) ssyr2kf.$(SUFFIX) + strsmf.$(SUFFIX) ssyrkf.$(SUFFIX) ssyr2kf.$(SUFFIX) DBLAS1OBJS = \ daxpyf.$(SUFFIX) dswapf.$(SUFFIX) \ @@ -59,7 +59,7 @@ DBLAS2OBJS = \ DBLAS3OBJS = \ dgemmf.$(SUFFIX) dsymmf.$(SUFFIX) dtrmmf.$(SUFFIX) \ - dtrsmf.$(SUFFIX) dsyrkf.$(SUFFIX) dsyr2kf.$(SUFFIX) + dtrsmf.$(SUFFIX) dsyrkf.$(SUFFIX) dsyr2kf.$(SUFFIX) CBLAS1OBJS = \ caxpyf.$(SUFFIX) caxpycf.$(SUFFIX) cswapf.$(SUFFIX) \ @@ -140,7 +140,7 @@ DBLASOBJS += \ dpotf2f.$(SUFFIX) dpotrff.$(SUFFIX) dtrti2f.$(SUFFIX) dtrtrif.$(SUFFIX) \ dlaswpf.$(SUFFIX) dgetrsf.$(SUFFIX) dgesvf.$(SUFFIX) dpotrif.$(SUFFIX) \ -QBLASOBJS += +QBLASOBJS += # \ qgetf2f.$(SUFFIX) qgetrff.$(SUFFIX) qlauu2f.$(SUFFIX) qlauumf.$(SUFFIX) \ qpotf2f.$(SUFFIX) qpotrff.$(SUFFIX) qtrti2f.$(SUFFIX) qtrtrif.$(SUFFIX) \ @@ -156,7 +156,7 @@ ZBLASOBJS += \ zpotf2f.$(SUFFIX) zpotrff.$(SUFFIX) ztrti2f.$(SUFFIX) ztrtrif.$(SUFFIX) \ zlaswpf.$(SUFFIX) zgetrsf.$(SUFFIX) zgesvf.$(SUFFIX) zpotrif.$(SUFFIX) \ -XBLASOBJS += +XBLASOBJS += # \ xgetf2f.$(SUFFIX) xgetrff.$(SUFFIX) xlauu2f.$(SUFFIX) xlauumf.$(SUFFIX) \ xpotf2f.$(SUFFIX) xpotrff.$(SUFFIX) xtrti2f.$(SUFFIX) xtrtrif.$(SUFFIX) \ diff --git a/reference/cspmvf.f b/reference/cspmvf.f index 7f357c685..e32b4904d 100644 --- a/reference/cspmvf.f +++ b/reference/cspmvf.f @@ -78,7 +78,7 @@ * supplied as zero then Y need not be set on input. * Unchanged on exit. * -* Y (input/output) COMPLEX array, dimension at least +* Y (input/output) COMPLEX array, dimension at least * ( 1 + ( N - 1 )*abs( INCY ) ). * Before entry, the incremented array Y must contain the n * element vector y. On exit, Y is overwritten by the updated diff --git a/reference/ctpmvf.f b/reference/ctpmvf.f index cd29ec572..340234270 100644 --- a/reference/ctpmvf.f +++ b/reference/ctpmvf.f @@ -140,7 +140,7 @@ IF( N.EQ.0 ) $ RETURN * - NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) NOUNIT = LSAME( DIAG , 'N' ) * * Set up the start point in X if the increment is not unity. This diff --git a/reference/sgetrff.f b/reference/sgetrff.f index 139e7dee7..892386949 100644 --- a/reference/sgetrff.f +++ b/reference/sgetrff.f @@ -3,7 +3,7 @@ * -- LAPACK routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University -* March 31, 1993 +* March 31, 1993 * * .. Scalar Arguments .. INTEGER INFO, LDA, M, N diff --git a/reference/sgetrsf.f b/reference/sgetrsf.f index f00921868..0f14aedd9 100644 --- a/reference/sgetrsf.f +++ b/reference/sgetrsf.f @@ -3,7 +3,7 @@ * -- LAPACK routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University -* March 31, 1993 +* March 31, 1993 * * .. Scalar Arguments .. CHARACTER TRANS diff --git a/reference/spotrff.f b/reference/spotrff.f index 0a4925138..7297c81bc 100644 --- a/reference/spotrff.f +++ b/reference/spotrff.f @@ -3,7 +3,7 @@ * -- LAPACK routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University -* March 31, 1993 +* March 31, 1993 * * .. Scalar Arguments .. CHARACTER UPLO diff --git a/reference/strtrif.f b/reference/strtrif.f index 27e3234bb..39919e9c5 100644 --- a/reference/strtrif.f +++ b/reference/strtrif.f @@ -3,7 +3,7 @@ * -- LAPACK routine (version 3.0) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University -* March 31, 1993 +* March 31, 1993 * * .. Scalar Arguments .. CHARACTER DIAG, UPLO diff --git a/reference/ztpmvf.f b/reference/ztpmvf.f index d05027216..7e52ef74e 100644 --- a/reference/ztpmvf.f +++ b/reference/ztpmvf.f @@ -140,7 +140,7 @@ IF( N.EQ.0 ) $ RETURN * - NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) NOUNIT = LSAME( DIAG , 'N' ) * * Set up the start point in X if the increment is not unity. This diff --git a/reference/ztrmvf.f b/reference/ztrmvf.f index db0f9ca04..9e4f85380 100644 --- a/reference/ztrmvf.f +++ b/reference/ztrmvf.f @@ -174,7 +174,7 @@ X( I ) = X( I ) + TEMP*A( I, J ) ELSE X( I ) = X( I ) + TEMP*DCONJG(A( I, J )) - ENDIF + ENDIF 10 CONTINUE IF (NOCONJ) THEN IF( NOUNIT ) diff --git a/segfaults.patch b/segfaults.patch index 375ab766c..0087f7723 100644 --- a/segfaults.patch +++ b/segfaults.patch @@ -9,4 +9,4 @@ diff -ruN common_linux.h.orig common_linux.h + return 0; #endif } - + diff --git a/symcopy.h b/symcopy.h index ed6e5b417..48ccbd369 100644 --- a/symcopy.h +++ b/symcopy.h @@ -61,11 +61,11 @@ static inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda + 2; - + bb1 = b1 + 0 * m; bb2 = b1 + 1 * m; b1 += 2 * m + 2; - + cc1 = b2 + 0 * m; cc2 = b2 + 1 * m; b2 += 2 * m + 2; @@ -74,9 +74,9 @@ static inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); - + a22 = *(aa2 + 1); - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb2 + 0) = a21; @@ -85,7 +85,7 @@ static inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa2 += 2; bb1 += 2; bb2 += 2; - + cc1 += 2 * m; cc2 += 2 * m; @@ -96,10 +96,10 @@ static inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ a21 = *(aa1 + 1); a12 = *(aa2 + 0); a22 = *(aa2 + 1); - + aa1 += 2; aa2 += 2; - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb2 + 0) = a12; @@ -124,7 +124,7 @@ static inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ if (is == 1){ a11 = *(aa1 + 0); a12 = *(aa2 + 0); - + *(bb1 + 0) = a11; *(bb2 + 0) = a12; @@ -132,7 +132,7 @@ static inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ *(cc1 + 1) = a12; } } - + if (m - js == 1){ a11 = *(aa1 + 0); *(bb1 + 0) = a11; @@ -159,11 +159,11 @@ static inline void SYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda; - + bb1 = b1 + 0 * m; bb2 = b1 + 1 * m; b1 += 2 * m; - + cc1 = b2 + 0 * m; cc2 = b2 + 1 * m; b2 += 2; @@ -171,7 +171,7 @@ static inline void SYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ if (m - js >= 2){ for (is = 0; is < js; is += 2){ - + a11 = *(aa1 + 0); a21 = *(aa1 + 1); a12 = *(aa2 + 0); @@ -179,48 +179,48 @@ static inline void SYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa1 += 2; aa2 += 2; - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb2 + 0) = a12; *(bb2 + 1) = a22; - + *(cc1 + 0) = a11; *(cc1 + 1) = a12; *(cc2 + 0) = a21; *(cc2 + 1) = a22; - + bb1 += 2; bb2 += 2; - + cc1 += 2 * m; cc2 += 2 * m; } a11 = *(aa1 + 0); - + a12 = *(aa2 + 0); a22 = *(aa2 + 1); - + *(bb1 + 0) = a11; *(bb1 + 1) = a12; *(bb2 + 0) = a12; *(bb2 + 1) = a22; } - + if (m - js == 1){ for (is = 0; is < js; is += 2){ - + a11 = *(aa1 + 0); a21 = *(aa1 + 1); aa1 += 2; - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(cc1 + 0) = a11; *(cc2 + 0) = a21; bb1 += 2; - + cc1 += 2 * m; cc2 += 2 * m; } @@ -252,11 +252,11 @@ static inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda + 4; - + bb1 = b1 + 0 * m; bb2 = b1 + 2 * m; b1 += 4 * m + 4; - + cc1 = b2 + 0 * m; cc2 = b2 + 2 * m; b2 += 4 * m + 4; @@ -267,10 +267,10 @@ static inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ a21 = *(aa1 + 1); a31 = *(aa1 + 2); a41 = *(aa1 + 3); - + a12 = *(aa2 + 2); a22 = *(aa2 + 3); - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a31; @@ -285,7 +285,7 @@ static inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa2 += 4; bb1 += 4; bb2 += 4; - + cc1 += 4 * m; cc2 += 4 * m; @@ -301,10 +301,10 @@ static inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ a22 = *(aa2 + 1); a32 = *(aa2 + 2); a42 = *(aa2 + 3); - + aa1 += 4; aa2 += 4; - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a31; @@ -339,7 +339,7 @@ static inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ a21 = *(aa1 + 1); a12 = *(aa2 + 0); a22 = *(aa2 + 1); - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb2 + 0) = a12; @@ -351,7 +351,7 @@ static inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ *(cc1 + 3) = a22; } } - + if (m - js == 1){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); @@ -382,11 +382,11 @@ static inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda; - + bb1 = b1 + 0 * m; bb2 = b1 + 2 * m; b1 += 4 * m; - + cc1 = b2 + 0 * m; cc2 = b2 + 2 * m; b2 += 4; @@ -394,7 +394,7 @@ static inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ if (m - js >= 2){ for (is = 0; is < js; is += 2){ - + a11 = *(aa1 + 0); a21 = *(aa1 + 1); a31 = *(aa1 + 2); @@ -407,7 +407,7 @@ static inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa1 += 4; aa2 += 4; - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a31; @@ -417,7 +417,7 @@ static inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ *(bb2 + 1) = a22; *(bb2 + 2) = a32; *(bb2 + 3) = a42; - + *(cc1 + 0) = a11; *(cc1 + 1) = a21; *(cc1 + 2) = a12; @@ -427,22 +427,22 @@ static inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ *(cc2 + 1) = a41; *(cc2 + 2) = a32; *(cc2 + 3) = a42; - + bb1 += 4; bb2 += 4; - + cc1 += 4 * m; cc2 += 4 * m; } a11 = *(aa1 + 0); a21 = *(aa1 + 1); - + a12 = *(aa2 + 0); a22 = *(aa2 + 1); a32 = *(aa2 + 2); a42 = *(aa2 + 3); - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a12; @@ -453,16 +453,16 @@ static inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ *(bb2 + 2) = a32; *(bb2 + 3) = a42; } - + if (m - js == 1){ for (is = 0; is < js; is += 2){ - + a11 = *(aa1 + 0); a21 = *(aa1 + 1); a31 = *(aa1 + 2); a41 = *(aa1 + 3); aa1 += 4; - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a31; @@ -473,7 +473,7 @@ static inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ *(cc2 + 0) = a31; *(cc2 + 1) = a41; bb1 += 4; - + cc1 += 4 * m; cc2 += 4 * m; } @@ -506,11 +506,11 @@ static inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda + 4; - + bb1 = b1 + 0 * m; bb2 = b1 + 2 * m; b1 += 4 * m + 4; - + cc1 = b2 + 0 * m; cc2 = b2 + 2 * m; b2 += 4 * m + 4; @@ -520,9 +520,9 @@ static inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ a11 = *(aa1 + 0); a31 = *(aa1 + 2); a41 = *(aa1 + 3); - + a12 = *(aa2 + 2); - + *(bb1 + 0) = a11; *(bb1 + 1) = 0.; *(bb1 + 2) = a31; @@ -537,7 +537,7 @@ static inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa2 += 4; bb1 += 4; bb2 += 4; - + cc1 += 4 * m; cc2 += 4 * m; @@ -553,10 +553,10 @@ static inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ a22 = *(aa2 + 1); a32 = *(aa2 + 2); a42 = *(aa2 + 3); - + aa1 += 4; aa2 += 4; - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a31; @@ -591,7 +591,7 @@ static inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ a21 = *(aa1 + 1); a12 = *(aa2 + 0); a22 = *(aa2 + 1); - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb2 + 0) = a12; @@ -603,7 +603,7 @@ static inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ *(cc1 + 3) = -a22; } } - + if (m - js == 1){ a11 = *(aa1 + 0); *(bb1 + 0) = a11; @@ -633,11 +633,11 @@ static inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda; - + bb1 = b1 + 0 * m; bb2 = b1 + 2 * m; b1 += 4 * m; - + cc1 = b2 + 0 * m; cc2 = b2 + 2 * m; b2 += 4; @@ -645,7 +645,7 @@ static inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ if (m - js >= 2){ for (is = 0; is < js; is += 2){ - + a11 = *(aa1 + 0); a21 = *(aa1 + 1); a31 = *(aa1 + 2); @@ -658,7 +658,7 @@ static inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa1 += 4; aa2 += 4; - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a31; @@ -668,7 +668,7 @@ static inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ *(bb2 + 1) = a22; *(bb2 + 2) = a32; *(bb2 + 3) = a42; - + *(cc1 + 0) = a11; *(cc1 + 1) = -a21; *(cc1 + 2) = a12; @@ -678,20 +678,20 @@ static inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ *(cc2 + 1) = -a41; *(cc2 + 2) = a32; *(cc2 + 3) = -a42; - + bb1 += 4; bb2 += 4; - + cc1 += 4 * m; cc2 += 4 * m; } a11 = *(aa1 + 0); - + a12 = *(aa2 + 0); a22 = *(aa2 + 1); a32 = *(aa2 + 2); - + *(bb1 + 0) = a11; *(bb1 + 1) = 0.; *(bb1 + 2) = a12; @@ -702,16 +702,16 @@ static inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ *(bb2 + 2) = a32; *(bb2 + 3) = 0.; } - + if (m - js == 1){ for (is = 0; is < js; is += 2){ - + a11 = *(aa1 + 0); a21 = *(aa1 + 1); a31 = *(aa1 + 2); a41 = *(aa1 + 3); aa1 += 4; - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a31; @@ -722,7 +722,7 @@ static inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ *(cc2 + 0) = a31; *(cc2 + 1) = -a41; bb1 += 4; - + cc1 += 4 * m; cc2 += 4 * m; } @@ -755,11 +755,11 @@ static inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda + 4; - + bb1 = b1 + 0 * m; bb2 = b1 + 2 * m; b1 += 4 * m + 4; - + cc1 = b2 + 0 * m; cc2 = b2 + 2 * m; b2 += 4 * m + 4; @@ -769,9 +769,9 @@ static inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ a11 = *(aa1 + 0); a31 = *(aa1 + 2); a41 = *(aa1 + 3); - + a12 = *(aa2 + 2); - + *(bb1 + 0) = a11; *(bb1 + 1) = 0.; *(bb1 + 2) = a31; @@ -786,7 +786,7 @@ static inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa2 += 4; bb1 += 4; bb2 += 4; - + cc1 += 4 * m; cc2 += 4 * m; @@ -802,10 +802,10 @@ static inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ a22 = *(aa2 + 1); a32 = *(aa2 + 2); a42 = *(aa2 + 3); - + aa1 += 4; aa2 += 4; - + *(bb1 + 0) = a11; *(bb1 + 1) = -a21; *(bb1 + 2) = a31; @@ -840,7 +840,7 @@ static inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ a21 = *(aa1 + 1); a12 = *(aa2 + 0); a22 = *(aa2 + 1); - + *(bb1 + 0) = a11; *(bb1 + 1) = -a21; *(bb2 + 0) = a12; @@ -852,7 +852,7 @@ static inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ *(cc1 + 3) = a22; } } - + if (m - js == 1){ a11 = *(aa1 + 0); *(bb1 + 0) = a11; @@ -882,11 +882,11 @@ static inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda; - + bb1 = b1 + 0 * m; bb2 = b1 + 2 * m; b1 += 4 * m; - + cc1 = b2 + 0 * m; cc2 = b2 + 2 * m; b2 += 4; @@ -894,7 +894,7 @@ static inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ if (m - js >= 2){ for (is = 0; is < js; is += 2){ - + a11 = *(aa1 + 0); a21 = *(aa1 + 1); a31 = *(aa1 + 2); @@ -907,7 +907,7 @@ static inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa1 += 4; aa2 += 4; - + *(bb1 + 0) = a11; *(bb1 + 1) = -a21; *(bb1 + 2) = a31; @@ -917,7 +917,7 @@ static inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ *(bb2 + 1) = -a22; *(bb2 + 2) = a32; *(bb2 + 3) = -a42; - + *(cc1 + 0) = a11; *(cc1 + 1) = a21; *(cc1 + 2) = a12; @@ -927,20 +927,20 @@ static inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ *(cc2 + 1) = a41; *(cc2 + 2) = a32; *(cc2 + 3) = a42; - + bb1 += 4; bb2 += 4; - + cc1 += 4 * m; cc2 += 4 * m; } a11 = *(aa1 + 0); - + a12 = *(aa2 + 0); a22 = *(aa2 + 1); a32 = *(aa2 + 2); - + *(bb1 + 0) = a11; *(bb1 + 1) = 0.; *(bb1 + 2) = a12; @@ -951,16 +951,16 @@ static inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ *(bb2 + 2) = a32; *(bb2 + 3) = 0.; } - + if (m - js == 1){ for (is = 0; is < js; is += 2){ - + a11 = *(aa1 + 0); a21 = *(aa1 + 1); a31 = *(aa1 + 2); a41 = *(aa1 + 3); aa1 += 4; - + *(bb1 + 0) = a11; *(bb1 + 1) = -a21; *(bb1 + 2) = a31; @@ -971,7 +971,7 @@ static inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ *(cc2 + 0) = a31; *(cc2 + 1) = a41; bb1 += 4; - + cc1 += 4 * m; cc2 += 4 * m; } @@ -1002,11 +1002,11 @@ static inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda + 2; - + bb1 = b1 + 0 * m; bb2 = b1 + 1 * m; b1 += 2 * m + 2; - + cc1 = b2 + 0 * m; cc2 = b2 + 1 * m; b2 += 2 * m + 2; @@ -1015,9 +1015,9 @@ static inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); - + a22 = *(aa2 + 1); - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb2 + 0) = a21; @@ -1026,7 +1026,7 @@ static inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa2 += 2; bb1 += 2; bb2 += 2; - + cc1 += 2 * m; cc2 += 2 * m; @@ -1037,10 +1037,10 @@ static inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ a21 = *(aa1 + 1); a12 = *(aa2 + 0); a22 = *(aa2 + 1); - + aa1 += 2; aa2 += 2; - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb2 + 0) = a12; @@ -1065,7 +1065,7 @@ static inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ if (is == 1){ a11 = *(aa1 + 0); a12 = *(aa2 + 0); - + *(bb1 + 0) = a11; *(bb2 + 0) = a12; @@ -1073,7 +1073,7 @@ static inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ *(cc1 + 1) = a12; } } - + if (m - js == 1){ a11 = *(aa1 + 0); *(bb1 + 0) = a11; @@ -1100,11 +1100,11 @@ static inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda + 2; - + bb1 = b1 + 0 * m; bb2 = b1 + 1 * m; b1 += 2 * m + 2; - + cc1 = b2 + 0 * m; cc2 = b2 + 1 * m; b2 += 2 * m + 2; @@ -1113,9 +1113,9 @@ static inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); - + a22 = *(aa2 + 1); - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb2 + 0) = a21; @@ -1124,7 +1124,7 @@ static inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa2 += 2; bb1 += 2; bb2 += 2; - + cc1 += 2 * m; cc2 += 2 * m; @@ -1135,10 +1135,10 @@ static inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ a21 = *(aa1 + 1); a12 = *(aa2 + 0); a22 = *(aa2 + 1); - + aa1 += 2; aa2 += 2; - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb2 + 0) = a12; @@ -1163,7 +1163,7 @@ static inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ if (is == 1){ a11 = *(aa1 + 0); a12 = *(aa2 + 0); - + *(bb1 + 0) = a11; *(bb2 + 0) = a12; @@ -1171,7 +1171,7 @@ static inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ *(cc1 + 1) = a12; } } - + if (m - js == 1){ a11 = *(aa1 + 0); *(bb1 + 0) = a11; @@ -1198,11 +1198,11 @@ static inline void TRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda; - + bb1 = b1 + 0 * m; bb2 = b1 + 1 * m; b1 += 2 * m; - + cc1 = b2 + 0 * m; cc2 = b2 + 1 * m; b2 += 2; @@ -1210,7 +1210,7 @@ static inline void TRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ if (m - js >= 2){ for (is = 0; is < js; is += 2){ - + a11 = *(aa1 + 0); a21 = *(aa1 + 1); a12 = *(aa2 + 0); @@ -1218,48 +1218,48 @@ static inline void TRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa1 += 2; aa2 += 2; - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb2 + 0) = a12; *(bb2 + 1) = a22; - + *(cc1 + 0) = a11; *(cc1 + 1) = a12; *(cc2 + 0) = a21; *(cc2 + 1) = a22; - + bb1 += 2; bb2 += 2; - + cc1 += 2 * m; cc2 += 2 * m; } a11 = *(aa1 + 0); - + a12 = *(aa2 + 0); a22 = *(aa2 + 1); - + *(bb1 + 0) = a11; *(bb1 + 1) = a12; *(bb2 + 0) = a12; *(bb2 + 1) = a22; } - + if (m - js == 1){ for (is = 0; is < js; is += 2){ - + a11 = *(aa1 + 0); a21 = *(aa1 + 1); aa1 += 2; - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(cc1 + 0) = a11; *(cc2 + 0) = a21; bb1 += 2; - + cc1 += 2 * m; cc2 += 2 * m; } @@ -1288,11 +1288,11 @@ static inline void TRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda; - + bb1 = b1 + 0 * m; bb2 = b1 + 1 * m; b1 += 2 * m; - + cc1 = b2 + 0 * m; cc2 = b2 + 1 * m; b2 += 2; @@ -1300,7 +1300,7 @@ static inline void TRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ if (m - js >= 2){ for (is = 0; is < js; is += 2){ - + a11 = *(aa1 + 0); a21 = *(aa1 + 1); a12 = *(aa2 + 0); @@ -1308,48 +1308,48 @@ static inline void TRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa1 += 2; aa2 += 2; - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb2 + 0) = a12; *(bb2 + 1) = a22; - + *(cc1 + 0) = a11; *(cc1 + 1) = a12; *(cc2 + 0) = a21; *(cc2 + 1) = a22; - + bb1 += 2; bb2 += 2; - + cc1 += 2 * m; cc2 += 2 * m; } a11 = *(aa1 + 0); - + a12 = *(aa2 + 0); a22 = *(aa2 + 1); - + *(bb1 + 0) = a11; *(bb1 + 1) = a12; *(bb2 + 0) = a12; *(bb2 + 1) = a22; } - + if (m - js == 1){ for (is = 0; is < js; is += 2){ - + a11 = *(aa1 + 0); a21 = *(aa1 + 1); aa1 += 2; - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(cc1 + 0) = a11; *(cc2 + 0) = a21; bb1 += 2; - + cc1 += 2 * m; cc2 += 2 * m; } @@ -1380,11 +1380,11 @@ static inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda + 4; - + bb1 = b1 + 0 * m; bb2 = b1 + 2 * m; b1 += 4 * m + 4; - + cc1 = b2 + 0 * m; cc2 = b2 + 2 * m; b2 += 4 * m + 4; @@ -1395,10 +1395,10 @@ static inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ a21 = *(aa1 + 1); a31 = *(aa1 + 2); a41 = *(aa1 + 3); - + a12 = *(aa2 + 2); a22 = *(aa2 + 3); - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a31; @@ -1413,7 +1413,7 @@ static inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa2 += 4; bb1 += 4; bb2 += 4; - + cc1 += 4 * m; cc2 += 4 * m; @@ -1429,10 +1429,10 @@ static inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ a22 = *(aa2 + 1); a32 = *(aa2 + 2); a42 = *(aa2 + 3); - + aa1 += 4; aa2 += 4; - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a31; @@ -1467,7 +1467,7 @@ static inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ a21 = *(aa1 + 1); a12 = *(aa2 + 0); a22 = *(aa2 + 1); - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb2 + 0) = a12; @@ -1479,7 +1479,7 @@ static inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ *(cc1 + 3) = a22; } } - + if (m - js == 1){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); @@ -1510,11 +1510,11 @@ static inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda + 4; - + bb1 = b1 + 0 * m; bb2 = b1 + 2 * m; b1 += 4 * m + 4; - + cc1 = b2 + 0 * m; cc2 = b2 + 2 * m; b2 += 4 * m + 4; @@ -1525,10 +1525,10 @@ static inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ a21 = *(aa1 + 1); a31 = *(aa1 + 2); a41 = *(aa1 + 3); - + a12 = *(aa2 + 2); a22 = *(aa2 + 3); - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a31; @@ -1543,7 +1543,7 @@ static inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa2 += 4; bb1 += 4; bb2 += 4; - + cc1 += 4 * m; cc2 += 4 * m; @@ -1559,10 +1559,10 @@ static inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ a22 = *(aa2 + 1); a32 = *(aa2 + 2); a42 = *(aa2 + 3); - + aa1 += 4; aa2 += 4; - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a31; @@ -1597,7 +1597,7 @@ static inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ a21 = *(aa1 + 1); a12 = *(aa2 + 0); a22 = *(aa2 + 1); - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb2 + 0) = a12; @@ -1609,7 +1609,7 @@ static inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ *(cc1 + 3) = a22; } } - + if (m - js == 1){ a11 = *(aa1 + 0); a21 = *(aa1 + 1); @@ -1640,11 +1640,11 @@ static inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda; - + bb1 = b1 + 0 * m; bb2 = b1 + 2 * m; b1 += 4 * m; - + cc1 = b2 + 0 * m; cc2 = b2 + 2 * m; b2 += 4; @@ -1652,7 +1652,7 @@ static inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ if (m - js >= 2){ for (is = 0; is < js; is += 2){ - + a11 = *(aa1 + 0); a21 = *(aa1 + 1); a31 = *(aa1 + 2); @@ -1665,7 +1665,7 @@ static inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa1 += 4; aa2 += 4; - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a31; @@ -1675,7 +1675,7 @@ static inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ *(bb2 + 1) = a22; *(bb2 + 2) = a32; *(bb2 + 3) = a42; - + *(cc1 + 0) = a11; *(cc1 + 1) = a21; *(cc1 + 2) = a12; @@ -1685,22 +1685,22 @@ static inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ *(cc2 + 1) = a41; *(cc2 + 2) = a32; *(cc2 + 3) = a42; - + bb1 += 4; bb2 += 4; - + cc1 += 4 * m; cc2 += 4 * m; } a11 = *(aa1 + 0); a21 = *(aa1 + 1); - + a12 = *(aa2 + 0); a22 = *(aa2 + 1); a32 = *(aa2 + 2); a42 = *(aa2 + 3); - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a12; @@ -1711,16 +1711,16 @@ static inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ *(bb2 + 2) = a32; *(bb2 + 3) = a42; } - + if (m - js == 1){ for (is = 0; is < js; is += 2){ - + a11 = *(aa1 + 0); a21 = *(aa1 + 1); a31 = *(aa1 + 2); a41 = *(aa1 + 3); aa1 += 4; - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a31; @@ -1731,7 +1731,7 @@ static inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ *(cc2 + 0) = a31; *(cc2 + 1) = a41; bb1 += 4; - + cc1 += 4 * m; cc2 += 4 * m; } @@ -1764,11 +1764,11 @@ static inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa1 = a + 0 * lda; aa2 = a + 1 * lda; a += 2 * lda; - + bb1 = b1 + 0 * m; bb2 = b1 + 2 * m; b1 += 4 * m; - + cc1 = b2 + 0 * m; cc2 = b2 + 2 * m; b2 += 4; @@ -1776,7 +1776,7 @@ static inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ if (m - js >= 2){ for (is = 0; is < js; is += 2){ - + a11 = *(aa1 + 0); a21 = *(aa1 + 1); a31 = *(aa1 + 2); @@ -1789,7 +1789,7 @@ static inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ aa1 += 4; aa2 += 4; - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a31; @@ -1799,7 +1799,7 @@ static inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ *(bb2 + 1) = a22; *(bb2 + 2) = a32; *(bb2 + 3) = a42; - + *(cc1 + 0) = a11; *(cc1 + 1) = a21; *(cc1 + 2) = a12; @@ -1809,22 +1809,22 @@ static inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ *(cc2 + 1) = a41; *(cc2 + 2) = a32; *(cc2 + 3) = a42; - + bb1 += 4; bb2 += 4; - + cc1 += 4 * m; cc2 += 4 * m; } a11 = *(aa1 + 0); a21 = *(aa1 + 1); - + a12 = *(aa2 + 0); a22 = *(aa2 + 1); a32 = *(aa2 + 2); a42 = *(aa2 + 3); - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a12; @@ -1835,16 +1835,16 @@ static inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ *(bb2 + 2) = a32; *(bb2 + 3) = a42; } - + if (m - js == 1){ for (is = 0; is < js; is += 2){ - + a11 = *(aa1 + 0); a21 = *(aa1 + 1); a31 = *(aa1 + 2); a41 = *(aa1 + 3); aa1 += 4; - + *(bb1 + 0) = a11; *(bb1 + 1) = a21; *(bb1 + 2) = a31; @@ -1855,7 +1855,7 @@ static inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ *(cc2 + 0) = a31; *(cc2 + 1) = a41; bb1 += 4; - + cc1 += 4 * m; cc2 += 4 * m; } diff --git a/test/Makefile b/test/Makefile index 0bc06e85f..801efe244 100644 --- a/test/Makefile +++ b/test/Makefile @@ -89,7 +89,7 @@ endif endif FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) -CEXTRALIB = +CEXTRALIB = sblat1 : sblat1.$(SUFFIX) ../$(LIBNAME) diff --git a/test/get_threading_model.c b/test/get_threading_model.c index 9a6835bad..3f34a333a 100644 --- a/test/get_threading_model.c +++ b/test/get_threading_model.c @@ -1,18 +1,18 @@ -#include "../cblas.h" +#include "../cblas.h" int main() { - int th_model = openblas_get_parallel(); + int th_model = openblas_get_parallel(); switch(th_model) { - case OPENBLAS_SEQUENTIAL: - printf("OpenBLAS is compiled sequentially.\n"); - break; - case OPENBLAS_THREAD: - printf("OpenBLAS is compiled using the normal threading model\n"); - break; - case OPENBLAS_OPENMP: - printf("OpenBLAS is compiled using OpenMP\n"); - break; + case OPENBLAS_SEQUENTIAL: + printf("OpenBLAS is compiled sequentially.\n"); + break; + case OPENBLAS_THREAD: + printf("OpenBLAS is compiled using the normal threading model\n"); + break; + case OPENBLAS_OPENMP: + printf("OpenBLAS is compiled using OpenMP\n"); + break; } - return 0; + return 0; } diff --git a/test/sblat2.f b/test/sblat2.f index 057a85429..a1074be52 100644 --- a/test/sblat2.f +++ b/test/sblat2.f @@ -2886,7 +2886,7 @@ WRITE( NOUT, FMT = 9998 )I, YT( I ), $ YY( 1 + ( I - 1 )*ABS( INCY ) ) ELSE - WRITE( NOUT, FMT = 9998 )I, + WRITE( NOUT, FMT = 9998 )I, $ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT(I) END IF 60 CONTINUE diff --git a/utest/Makefile b/utest/Makefile index 38ebb03df..31cb93176 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -45,7 +45,7 @@ run_test: $(TARGET) ./$(TARGET) clean: - -rm -f *.o $(TARGET) + -rm -f *.o $(TARGET) -rm -rf $(CUNIT_DIR) libs: diff --git a/utest/common_utest.h b/utest/common_utest.h index f3841c58e..51f04cac7 100644 --- a/utest/common_utest.h +++ b/utest/common_utest.h @@ -13,19 +13,19 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ diff --git a/utest/main.c b/utest/main.c index ca50e473e..7fb5811f8 100644 --- a/utest/main.c +++ b/utest/main.c @@ -13,19 +13,19 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ @@ -81,29 +81,29 @@ int main() CU_ErrorCode error; if (CUE_SUCCESS != CU_initialize_registry()) return CU_get_error(); - + error=CU_register_suites(suites); - + if (error != CUE_SUCCESS) { perror(CU_get_error_msg()); CU_cleanup_registry(); return CU_get_error(); - + } - - + + printf("Seting OK\n"); fflush(stdout); - + /* Run all tests using the CUnit Basic interface */ CU_basic_set_mode(CU_BRM_VERBOSE); - + CU_basic_run_tests(); - + CU_cleanup_registry(); - + return CU_get_error(); - + } diff --git a/utest/test_amax.c b/utest/test_amax.c index 8d163853a..fcc9343cf 100644 --- a/utest/test_amax.c +++ b/utest/test_amax.c @@ -13,19 +13,19 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ @@ -41,6 +41,6 @@ void test_samax() te_max=BLASFUNC(samax)(&N, x, &inc); tr_max=BLASFUNC_REF(samax)(&N, x, &inc); - + CU_ASSERT_DOUBLE_EQUAL(te_max, tr_max, CHECK_EPS); } diff --git a/utest/test_axpy.c b/utest/test_axpy.c index a141d7a11..0355973f5 100644 --- a/utest/test_axpy.c +++ b/utest/test_axpy.c @@ -13,19 +13,19 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ diff --git a/utest/test_dotu.c b/utest/test_dotu.c index 60bb3a6da..aef1005dc 100644 --- a/utest/test_dotu.c +++ b/utest/test_dotu.c @@ -13,19 +13,19 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ @@ -50,7 +50,7 @@ void test_zdotu_n_1(void) CU_ASSERT_DOUBLE_EQUAL(creal(result1), creal(result2), CHECK_EPS); CU_ASSERT_DOUBLE_EQUAL(cimag(result1), cimag(result2), CHECK_EPS); // printf("\%lf,%lf\n",creal(result1),cimag(result1)); - + } void test_zdotu_offset_1(void) @@ -70,6 +70,6 @@ void test_zdotu_offset_1(void) CU_ASSERT_DOUBLE_EQUAL(creal(result1), creal(result2), CHECK_EPS); CU_ASSERT_DOUBLE_EQUAL(cimag(result1), cimag(result2), CHECK_EPS); // printf("\%lf,%lf\n",creal(result1),cimag(result1)); - + } diff --git a/utest/test_dsdot.c b/utest/test_dsdot.c index 8df7380be..41b62c2ea 100644 --- a/utest/test_dsdot.c +++ b/utest/test_dsdot.c @@ -13,19 +13,19 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ @@ -39,12 +39,12 @@ void test_dsdot_n_1() int incx=1; int incy=1; int n=1; - + double res1=0.0f, res2=0.0f; res1=BLASFUNC(dsdot)(&n, &x, &incx, &y, &incy); res2=BLASFUNC_REF(dsdot)(&n, &x, &incx, &y, &incy); CU_ASSERT_DOUBLE_EQUAL(res1, res2, CHECK_EPS); - + } diff --git a/utest/test_fork.c b/utest/test_fork.c index 1d8804ac6..6e99d1444 100644 --- a/utest/test_fork.c +++ b/utest/test_fork.c @@ -61,20 +61,20 @@ void test_fork_safety(void) { int n = 1000; int i; - + double *a, *b, *c, *d; size_t n_bytes; - + pid_t fork_pid; pid_t fork_pid_nested; n_bytes = sizeof(*a) * n * n; - + a = xmalloc(n_bytes); b = xmalloc(n_bytes); c = xmalloc(n_bytes); d = xmalloc(n_bytes); - + // Put ones in a and b for(i = 0; i < n * n; ++i) { a[i] = 1; diff --git a/utest/test_rot.c b/utest/test_rot.c index f5332d486..988f54e9c 100644 --- a/utest/test_rot.c +++ b/utest/test_rot.c @@ -13,19 +13,19 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ diff --git a/utest/test_rotmg.c b/utest/test_rotmg.c index 9a1a3d084..bb03c278a 100644 --- a/utest/test_rotmg.c +++ b/utest/test_rotmg.c @@ -13,19 +13,19 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ @@ -54,7 +54,7 @@ void test_drotmg() BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); //reference BLASFUNC_REF(drotmg)(&tr_d1, &tr_d2, &tr_x1, &tr_y1, tr_param); - + CU_ASSERT_DOUBLE_EQUAL(te_d1, tr_d1, CHECK_EPS); CU_ASSERT_DOUBLE_EQUAL(te_d2, tr_d2, CHECK_EPS); CU_ASSERT_DOUBLE_EQUAL(te_x1, tr_x1, CHECK_EPS); diff --git a/version.h b/version.h index 5c621e686..213faae00 100644 --- a/version.h +++ b/version.h @@ -13,19 +13,19 @@ met: notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - 3. Neither the name of the ISCAS nor the names of its contributors may - be used to endorse or promote products derived from this software + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ -- cgit v1.2.3