summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Changelog.txt17
-rw-r--r--Makefile.rule2
-rw-r--r--Makefile.system3
-rw-r--r--README.md19
-rw-r--r--TargetList.txt6
-rw-r--r--benchmark/Makefile211
-rw-r--r--benchmark/axpy.c201
-rw-r--r--benchmark/dot.c195
-rw-r--r--benchmark/gemm.c4
-rw-r--r--benchmark/gemm3m.c212
-rw-r--r--benchmark/gemv.c110
-rw-r--r--benchmark/hemv.c208
-rw-r--r--benchmark/tplot-header42
-rw-r--r--cblas.h23
-rw-r--r--cblas_noconst.h22
-rw-r--r--common_param.h109
-rw-r--r--common_x86_64.h8
-rw-r--r--cpuid_x86.c9
-rw-r--r--ctest/Makefile20
-rw-r--r--ctest/c_c3chke.c234
-rw-r--r--ctest/c_cblas3.c82
-rw-r--r--ctest/c_cblat3_3m.f2786
-rw-r--r--ctest/c_z3chke.c238
-rw-r--r--ctest/c_zblas3.c79
-rw-r--r--ctest/c_zblat3_3m.f2791
-rw-r--r--ctest/cblas_test.h6
-rw-r--r--ctest/cin3_3m22
-rw-r--r--ctest/zin3_3m22
-rw-r--r--driver/level3/Makefile4
-rw-r--r--driver/others/parameter.c4
-rw-r--r--exports/gensymbol6
-rw-r--r--interface/Makefile63
-rw-r--r--interface/gemm.c41
-rw-r--r--interface/gemv.c2
-rw-r--r--interface/imatcopy.c23
-rw-r--r--interface/omatcopy.c21
-rw-r--r--interface/zgemv.c2
-rw-r--r--interface/zimatcopy.c26
-rw-r--r--interface/zomatcopy.c21
-rw-r--r--kernel/arm/symv_L.c70
-rw-r--r--kernel/arm/symv_U.c71
-rw-r--r--kernel/setparam-ref.c90
-rw-r--r--kernel/x86_64/KERNEL.BULLDOZER18
-rw-r--r--kernel/x86_64/KERNEL.HASWELL16
-rw-r--r--kernel/x86_64/KERNEL.NEHALEM16
-rw-r--r--kernel/x86_64/KERNEL.PILEDRIVER7
-rw-r--r--kernel/x86_64/KERNEL.SANDYBRIDGE6
-rw-r--r--kernel/x86_64/caxpy.c131
-rw-r--r--kernel/x86_64/caxpy_microk_bulldozer-2.c135
-rw-r--r--kernel/x86_64/cgemm_kernel_8x2_haswell.S16
-rw-r--r--kernel/x86_64/cgemv_n.c255
-rw-r--r--kernel/x86_64/cgemv_n_4.c623
-rw-r--r--kernel/x86_64/cgemv_n_microk_haswell-2.c137
-rw-r--r--kernel/x86_64/cgemv_n_microk_haswell-4.c542
-rw-r--r--kernel/x86_64/cgemv_t.c265
-rw-r--r--kernel/x86_64/cgemv_t_4.c579
-rw-r--r--kernel/x86_64/cgemv_t_microk_haswell-2.c171
-rw-r--r--kernel/x86_64/cgemv_t_microk_haswell-4.c539
-rw-r--r--kernel/x86_64/daxpy.c105
-rw-r--r--kernel/x86_64/daxpy_microk_bulldozer-2.c82
-rw-r--r--kernel/x86_64/daxpy_microk_nehalem-2.c91
-rw-r--r--kernel/x86_64/ddot.c110
-rw-r--r--kernel/x86_64/ddot_microk_bulldozer-2.c (renamed from kernel/x86_64/sgemv_n_microk_haswell-2.c)64
-rw-r--r--kernel/x86_64/ddot_microk_nehalem-2.c94
-rw-r--r--kernel/x86_64/dgemv_n.c206
-rw-r--r--kernel/x86_64/dgemv_n_4.c548
-rw-r--r--kernel/x86_64/dgemv_n_microk_haswell-4.c247
-rw-r--r--kernel/x86_64/dgemv_n_microk_nehalem-4.c265
-rw-r--r--kernel/x86_64/dgemv_t.c191
-rw-r--r--kernel/x86_64/dgemv_t_4.c615
-rw-r--r--kernel/x86_64/dgemv_t_microk_haswell-4.c (renamed from kernel/x86_64/dgemv_t_microk_haswell-2.c)48
-rw-r--r--kernel/x86_64/dsymv_L.c299
-rw-r--r--kernel/x86_64/dsymv_L_microk_bulldozer-2.c137
-rw-r--r--kernel/x86_64/dsymv_L_microk_nehalem-2.c132
-rw-r--r--kernel/x86_64/dsymv_U.c273
-rw-r--r--kernel/x86_64/dsymv_U_microk_bulldozer-2.c130
-rw-r--r--kernel/x86_64/dsymv_U_microk_nehalem-2.c125
-rw-r--r--kernel/x86_64/saxpy.c103
-rw-r--r--kernel/x86_64/saxpy_microk_nehalem-2.c91
-rw-r--r--kernel/x86_64/sdot.c109
-rw-r--r--kernel/x86_64/sdot_microk_bulldozer-2.c (renamed from kernel/x86_64/dgemv_n_microk_haswell-2.c)66
-rw-r--r--kernel/x86_64/sdot_microk_nehalem-2.c94
-rw-r--r--kernel/x86_64/sgemm_kernel_16x4_haswell.S20
-rw-r--r--kernel/x86_64/sgemv_n.c11
-rw-r--r--kernel/x86_64/sgemv_n_4.c591
-rw-r--r--kernel/x86_64/sgemv_n_avx.c218
-rw-r--r--kernel/x86_64/sgemv_n_microk_bulldozer-2.c99
-rw-r--r--kernel/x86_64/sgemv_n_microk_bulldozer-4.c269
-rw-r--r--kernel/x86_64/sgemv_n_microk_bulldozer.c451
-rw-r--r--kernel/x86_64/sgemv_n_microk_haswell-4.c299
-rw-r--r--kernel/x86_64/sgemv_n_microk_haswell.c461
-rw-r--r--kernel/x86_64/sgemv_n_microk_nehalem-2.c144
-rw-r--r--kernel/x86_64/sgemv_n_microk_nehalem-4.c204
-rw-r--r--kernel/x86_64/sgemv_n_microk_sandy-4.c370
-rw-r--r--kernel/x86_64/sgemv_n_microk_sandy.c473
-rw-r--r--kernel/x86_64/sgemv_t.c10
-rw-r--r--kernel/x86_64/sgemv_t_4.c624
-rw-r--r--kernel/x86_64/sgemv_t_avx.c232
-rw-r--r--kernel/x86_64/sgemv_t_microk_bulldozer-4.c (renamed from kernel/x86_64/sgemv_t_microk_bulldozer-2.c)68
-rw-r--r--kernel/x86_64/sgemv_t_microk_bulldozer.c99
-rw-r--r--kernel/x86_64/sgemv_t_microk_haswell-4.c (renamed from kernel/x86_64/sgemv_t_microk_haswell-2.c)42
-rw-r--r--kernel/x86_64/sgemv_t_microk_haswell.c100
-rw-r--r--kernel/x86_64/sgemv_t_microk_nehalem-2.c159
-rw-r--r--kernel/x86_64/sgemv_t_microk_nehalem-4.c (renamed from kernel/x86_64/sgemv_n_microk_sandy-2.c)78
-rw-r--r--kernel/x86_64/sgemv_t_microk_sandy-4.c (renamed from kernel/x86_64/sgemv_t_microk_sandy-2.c)68
-rw-r--r--kernel/x86_64/sgemv_t_microk_sandy.c106
-rw-r--r--kernel/x86_64/ssymv_L.c299
-rw-r--r--kernel/x86_64/ssymv_L_microk_bulldozer-2.c122
-rw-r--r--kernel/x86_64/ssymv_L_microk_nehalem-2.c137
-rw-r--r--kernel/x86_64/ssymv_U.c273
-rw-r--r--kernel/x86_64/ssymv_U_microk_bulldozer-2.c114
-rw-r--r--kernel/x86_64/ssymv_U_microk_nehalem-2.c130
-rw-r--r--kernel/x86_64/zaxpy.c131
-rw-r--r--kernel/x86_64/zaxpy_microk_bulldozer-2.c135
-rw-r--r--kernel/x86_64/zgemm_kernel_4x2_haswell.S12
-rw-r--r--kernel/x86_64/zgemv_n.c258
-rw-r--r--kernel/x86_64/zgemv_n_4.c626
-rw-r--r--kernel/x86_64/zgemv_n_microk_haswell-2.c137
-rw-r--r--kernel/x86_64/zgemv_n_microk_haswell-4.c400
-rw-r--r--kernel/x86_64/zgemv_n_microk_sandy-2.c149
-rw-r--r--kernel/x86_64/zgemv_n_microk_sandy-4.c417
-rw-r--r--kernel/x86_64/zgemv_t.c272
-rw-r--r--kernel/x86_64/zgemv_t_4.c583
-rw-r--r--kernel/x86_64/zgemv_t_microk_bulldozer-2.c180
-rw-r--r--kernel/x86_64/zgemv_t_microk_bulldozer-4.c457
-rw-r--r--kernel/x86_64/zgemv_t_microk_haswell-2.c162
-rw-r--r--kernel/x86_64/zgemv_t_microk_haswell-4.c428
-rw-r--r--lapack-netlib/TESTING/dstest.in2
-rw-r--r--param.h63
-rw-r--r--test/Makefile36
-rw-r--r--test/cblat3_3m.dat23
-rw-r--r--test/cblat3_3m.f3442
-rw-r--r--test/zblat3_3m.dat23
-rw-r--r--test/zblat3_3m.f3448
134 files changed, 28183 insertions, 5278 deletions
diff --git a/Changelog.txt b/Changelog.txt
index d33cffc7b..e9fe824ca 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,6 +1,21 @@
OpenBLAS ChangeLog
====================================================================
-Version 0.2.10
+Version 0.2.12
+13-Oct-2014
+common:
+ * Added CBLAS interface for ?omatcopy and ?imatcopy.
+ * Enable ?gemm3m functions.
+ * Added benchmark for ?gemm3m.
+ * Optimized multithreading lower limits.
+ * Disabled SYMM3M and HEMM3M functions
+ because of segment violations.
+
+x86/x86-64:
+ * Improved axpy and symv performance on AMD Bulldozer.
+ * Improved gemv performance on modern Intel and AMD CPUs.
+
+====================================================================
+Version 0.2.11
18-Aug-2014
common:
* Added some benchmark codes.
diff --git a/Makefile.rule b/Makefile.rule
index 7430320b7..7f0356fff 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
#
# This library's version
-VERSION = 0.2.11
+VERSION = 0.2.12
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
diff --git a/Makefile.system b/Makefile.system
index ccde8e9ce..d2ff74146 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -339,7 +339,7 @@ FCOMMON_OPT += -m128bit-long-double
endif
ifeq ($(C_COMPILER), CLANG)
EXPRECISION = 1
-CCOMMON_OPT += -DEXPRECISION
+CCOMMON_OPT += -DEXPRECISION
FCOMMON_OPT += -m128bit-long-double
endif
endif
@@ -350,6 +350,7 @@ ifeq ($(C_COMPILER), INTEL)
CCOMMON_OPT += -wd981
endif
+
ifeq ($(USE_OPENMP), 1)
# ifeq logical or. GCC or LSB
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB))
diff --git a/README.md b/README.md
index 2e85117c9..f4c547701 100644
--- a/README.md
+++ b/README.md
@@ -55,16 +55,23 @@ Please read GotoBLAS_01Readme.txt
#### x86/x86-64:
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
-- **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64.
-- **Intel Haswell**: Optimized Level-3 BLAS with AVX on x86-64 (identical to Sandy Bridge).
+- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
+- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
-- **AMD Bulldozer**: x86-64 S/DGEMM AVX kernels. (Thank Werner Saar)
-- **AMD PILEDRIVER**: Used Bulldozer codes.
+- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thank Werner Saar)
+- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
#### MIPS64:
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
- **ICT Loongson 3B**: Experimental
+#### ARM:
+- **ARMV6**: Optimized BLAS for vfpv2 and vfpv3-d16 ( e.g. BCM2835, Cortex M0+ )
+- **ARMV7**: Optimized BLAS for vfpv3-d32 ( e.g. Cortex A8, A9 and A15 )
+
+#### ARM64:
+- **ARMV8**: Experimental
+
### Support OS:
- **GNU/Linux**
- **MingWin/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
@@ -116,8 +123,8 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve
* Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first.
* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD.
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code.
-* The number of CPUs/Cores should less than or equal to 256.
-* On Linux, OpenBLAS sets the processor affinity by default. This may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). You can build the library with NO_AFFINITY=1.
+* The number of CPUs/Cores should less than or equal to 256. On Linux x86_64(amd64), there is experimental support for up to 1024 CPUs/Cores and 128 numa nodes if you build the library with BIGNUMA=1.
+* OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting the line NO_AFFINITY=1 in Makefile.rule. But this may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html).
* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell.
## Contributing
diff --git a/TargetList.txt b/TargetList.txt
index ce35a3faa..97661fdcf 100644
--- a/TargetList.txt
+++ b/TargetList.txt
@@ -19,6 +19,7 @@ PENRYN
DUNNINGTON
NEHALEM
SANDYBRIDGE
+HASWELL
ATOM
b)AMD CPU:
@@ -30,6 +31,7 @@ SHANGHAI
ISTANBUL
BOBCAT
BULLDOZER
+PILEDRIVER
c)VIA CPU:
SSE_GENERIC
@@ -59,3 +61,7 @@ ITANIUM2
SPARC
SPARCV7
+6.ARM CPU:
+ARMV7
+ARMV6
+ARMV5
diff --git a/benchmark/Makefile b/benchmark/Makefile
index de94dcc59..cf219cef1 100644
--- a/benchmark/Makefile
+++ b/benchmark/Makefile
@@ -35,7 +35,10 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \
ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \
sger.goto dger.goto \
- ssymv.goto dsymv.goto \
+ sdot.goto ddot.goto \
+ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \
+ ssymv.goto dsymv.goto csymv.goto zsymv.goto \
+ chemv.goto zhemv.goto \
chemm.goto zhemm.goto \
cherk.goto zherk.goto \
cher2k.goto zher2k.goto \
@@ -53,7 +56,10 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \
ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \
sger.acml dger.acml \
- ssymv.acml dsymv.acml \
+ sdot.acml ddot.acml \
+ saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \
+ ssymv.acml dsymv.acml csymv.acml zsymv.acml \
+ chemv.acml zhemv.acml \
chemm.acml zhemm.acml \
cherk.acml zherk.acml \
cher2k.acml zher2k.acml \
@@ -71,7 +77,10 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \
ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \
ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \
sger.atlas dger.atlas \
- ssymv.atlas dsymv.atlas \
+ sdot.atlas ddot.atlas \
+ saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \
+ ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \
+ chemv.atlas zhemv.atlas \
chemm.acml zhemm.acml \
chemm.atlas zhemm.atlas \
cherk.atlas zherk.atlas \
@@ -90,7 +99,10 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \
ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \
sger.mkl dger.mkl \
- ssymv.mkl dsymv.mkl \
+ sdot.mkl ddot.mkl \
+ saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \
+ ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \
+ chemv.mkl zhemv.mkl \
chemm.mkl zhemm.mkl \
cherk.mkl zherk.mkl \
cher2k.mkl zher2k.mkl \
@@ -100,7 +112,12 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \
ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl
-all :: goto atlas acml mkl
+
+goto_3m :: cgemm3m.goto zgemm3m.goto
+
+mkl_3m :: cgemm3m.mkl zgemm3m.mkl
+
+all :: goto mkl atlas acml
##################################### Slinpack ####################################################
slinpack.goto : slinpack.$(SUFFIX) ../$(LIBNAME)
@@ -732,6 +749,32 @@ dsymv.atlas : dsymv.$(SUFFIX)
dsymv.mkl : dsymv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+##################################### Csymv ####################################################
+csymv.goto : csymv.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+csymv.acml : csymv.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+csymv.atlas : csymv.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+csymv.mkl : csymv.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Zsymv ####################################################
+zsymv.goto : zsymv.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+zsymv.acml : zsymv.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+zsymv.atlas : zsymv.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+zsymv.mkl : zsymv.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
##################################### Sgeev ####################################################
sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
@@ -896,6 +939,131 @@ zpotrf.atlas : zpotrf.$(SUFFIX)
zpotrf.mkl : zpotrf.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+##################################### Chemv ####################################################
+
+chemv.goto : chemv.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+chemv.acml : chemv.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+chemv.atlas : chemv.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+chemv.mkl : chemv.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Zhemv ####################################################
+
+zhemv.goto : zhemv.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+zhemv.acml : zhemv.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+zhemv.atlas : zhemv.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+zhemv.mkl : zhemv.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Sdot ####################################################
+sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+sdot.acml : sdot.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+sdot.atlas : sdot.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+sdot.mkl : sdot.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Ddot ####################################################
+ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+ddot.acml : ddot.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+ddot.atlas : ddot.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+ddot.mkl : ddot.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Saxpy ####################################################
+saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+saxpy.acml : saxpy.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+saxpy.atlas : saxpy.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+saxpy.mkl : saxpy.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Daxpy ####################################################
+daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+daxpy.acml : daxpy.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+daxpy.atlas : daxpy.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+daxpy.mkl : daxpy.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Caxpy ####################################################
+
+caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+caxpy.acml : caxpy.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+caxpy.atlas : caxpy.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+caxpy.mkl : caxpy.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Zaxpy ####################################################
+
+zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+zaxpy.acml : zaxpy.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+zaxpy.atlas : zaxpy.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+zaxpy.mkl : zaxpy.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+
+##################################### Cgemm3m ####################################################
+
+cgemm3m.goto : cgemm3m.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+cgemm3m.mkl : cgemm3m.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+##################################### Zgemm3m ####################################################
+
+zgemm3m.goto : zgemm3m.$(SUFFIX) ../$(LIBNAME)
+ $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm
+
+zgemm3m.mkl : zgemm3m.$(SUFFIX)
+ -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
###################################################################################################
@@ -1037,6 +1205,12 @@ ssymv.$(SUFFIX) : symv.c
dsymv.$(SUFFIX) : symv.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
+csymv.$(SUFFIX) : symv.c
+ $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
+
+zsymv.$(SUFFIX) : symv.c
+ $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+
sgeev.$(SUFFIX) : geev.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
@@ -1073,8 +1247,35 @@ cpotrf.$(SUFFIX) : potrf.c
zpotrf.$(SUFFIX) : potrf.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+chemv.$(SUFFIX) : hemv.c
+ $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
+
+zhemv.$(SUFFIX) : hemv.c
+ $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+
+sdot.$(SUFFIX) : dot.c
+ $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
+
+ddot.$(SUFFIX) : dot.c
+ $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
+
+saxpy.$(SUFFIX) : axpy.c
+ $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
+
+daxpy.$(SUFFIX) : axpy.c
+ $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
+
+caxpy.$(SUFFIX) : axpy.c
+ $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
+zaxpy.$(SUFFIX) : axpy.c
+ $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+cgemm3m.$(SUFFIX) : gemm3m.c
+ $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
+
+zgemm3m.$(SUFFIX) : gemm3m.c
+ $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
clean ::
diff --git a/benchmark/axpy.c b/benchmark/axpy.c
new file mode 100644
index 000000000..ef3b5ae4f
--- /dev/null
+++ b/benchmark/axpy.c
@@ -0,0 +1,201 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef __CYGWIN32__
+#include <sys/time.h>
+#endif
+#include "common.h"
+
+
+#undef AXPY
+
+#ifdef COMPLEX
+#ifdef DOUBLE
+#define AXPY BLASFUNC(zaxpy)
+#else
+#define AXPY BLASFUNC(caxpy)
+#endif
+#else
+#ifdef DOUBLE
+#define AXPY BLASFUNC(daxpy)
+#else
+#define AXPY BLASFUNC(saxpy)
+#endif
+#endif
+
+#if defined(__WIN32__) || defined(__WIN64__)
+
+#ifndef DELTA_EPOCH_IN_MICROSECS
+#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
+#endif
+
+int gettimeofday(struct timeval *tv, void *tz){
+
+ FILETIME ft;
+ unsigned __int64 tmpres = 0;
+ static int tzflag;
+
+ if (NULL != tv)
+ {
+ GetSystemTimeAsFileTime(&ft);
+
+ tmpres |= ft.dwHighDateTime;
+ tmpres <<= 32;
+ tmpres |= ft.dwLowDateTime;
+
+ /*converting file time to unix epoch*/
+ tmpres /= 10; /*convert into microseconds*/
+ tmpres -= DELTA_EPOCH_IN_MICROSECS;
+ tv->tv_sec = (long)(tmpres / 1000000UL);
+ tv->tv_usec = (long)(tmpres % 1000000UL);
+ }
+
+ return 0;
+}
+
+#endif
+
+#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
+
+static void *huge_malloc(BLASLONG size){
+ int shmid;
+ void *address;
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 04000
+#endif
+
+ if ((shmid =shmget(IPC_PRIVATE,
+ (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
+ SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
+ printf( "Memory allocation failed(shmget).\n");
+ exit(1);
+ }
+
+ address = shmat(shmid, NULL, SHM_RND);
+
+ if ((BLASLONG)address == -1){
+ printf( "Memory allocation failed(shmat).\n");
+ exit(1);
+ }
+
+ shmctl(shmid, IPC_RMID, 0);
+
+ return address;
+}
+
+#define malloc huge_malloc
+
+#endif
+
+int MAIN__(int argc, char *argv[]){
+
+ FLOAT *x, *y;
+ FLOAT alpha[2] = { 2.0, 2.0 };
+ blasint m, i;
+ blasint inc_x=1,inc_y=1;
+ int loops = 1;
+ int l;
+ char *p;
+
+ int from = 1;
+ int to = 200;
+ int step = 1;
+
+ struct timeval start, stop;
+ double time1,timeg;
+
+ argc--;argv++;
+
+ if (argc > 0) { from = atol(*argv); argc--; argv++;}
+ if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
+ if (argc > 0) { step = atol(*argv); argc--; argv++;}
+
+ if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
+ if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
+ if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
+
+ fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
+
+ if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+ if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+#ifdef linux
+ srandom(getpid());
+#endif
+
+ fprintf(stderr, " SIZE Flops\n");
+
+ for(m = from; m <= to; m += step)
+ {
+
+ timeg=0;
+
+ fprintf(stderr, " %6d : ", (int)m);
+
+
+ for (l=0; l<loops; l++)
+ {
+
+ for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
+ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ }
+
+ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
+ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ }
+ gettimeofday( &start, (struct timezone *)0);
+
+ AXPY (&m, alpha, x, &inc_x, y, &inc_y );
+
+ gettimeofday( &stop, (struct timezone *)0);
+
+ time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+
+ timeg += time1;
+
+ }
+
+ timeg /= loops;
+
+ fprintf(stderr,
+ " %10.2f MFlops\n",
+ COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
+
+ }
+
+ return 0;
+}
+
+int main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
diff --git a/benchmark/dot.c b/benchmark/dot.c
new file mode 100644
index 000000000..6132ed324
--- /dev/null
+++ b/benchmark/dot.c
@@ -0,0 +1,195 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef __CYGWIN32__
+#include <sys/time.h>
+#endif
+#include "common.h"
+
+
+#undef DOT
+
+
+#ifdef DOUBLE
+#define DOT BLASFUNC(ddot)
+#else
+#define DOT BLASFUNC(sdot)
+#endif
+
+
+#if defined(__WIN32__) || defined(__WIN64__)
+
+#ifndef DELTA_EPOCH_IN_MICROSECS
+#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
+#endif
+
+int gettimeofday(struct timeval *tv, void *tz){
+
+ FILETIME ft;
+ unsigned __int64 tmpres = 0;
+ static int tzflag;
+
+ if (NULL != tv)
+ {
+ GetSystemTimeAsFileTime(&ft);
+
+ tmpres |= ft.dwHighDateTime;
+ tmpres <<= 32;
+ tmpres |= ft.dwLowDateTime;
+
+ /*converting file time to unix epoch*/
+ tmpres /= 10; /*convert into microseconds*/
+ tmpres -= DELTA_EPOCH_IN_MICROSECS;
+ tv->tv_sec = (long)(tmpres / 1000000UL);
+ tv->tv_usec = (long)(tmpres % 1000000UL);
+ }
+
+ return 0;
+}
+
+#endif
+
+#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
+
+static void *huge_malloc(BLASLONG size){
+ int shmid;
+ void *address;
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 04000
+#endif
+
+ if ((shmid =shmget(IPC_PRIVATE,
+ (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
+ SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
+ printf( "Memory allocation failed(shmget).\n");
+ exit(1);
+ }
+
+ address = shmat(shmid, NULL, SHM_RND);
+
+ if ((BLASLONG)address == -1){
+ printf( "Memory allocation failed(shmat).\n");
+ exit(1);
+ }
+
+ shmctl(shmid, IPC_RMID, 0);
+
+ return address;
+}
+
+#define malloc huge_malloc
+
+#endif
+
+int MAIN__(int argc, char *argv[]){
+
+ FLOAT *x, *y;
+ FLOAT result;
+ blasint m, i;
+ blasint inc_x=1,inc_y=1;
+ int loops = 1;
+ int l;
+ char *p;
+
+ int from = 1;
+ int to = 200;
+ int step = 1;
+
+ struct timeval start, stop;
+ double time1,timeg;
+
+ argc--;argv++;
+
+ if (argc > 0) { from = atol(*argv); argc--; argv++;}
+ if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
+ if (argc > 0) { step = atol(*argv); argc--; argv++;}
+
+ if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
+ if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
+ if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
+
+ fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
+
+ if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+ if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+#ifdef linux
+ srandom(getpid());
+#endif
+
+ fprintf(stderr, " SIZE Flops\n");
+
+ for(m = from; m <= to; m += step)
+ {
+
+ timeg=0;
+
+ fprintf(stderr, " %6d : ", (int)m);
+
+
+ for (l=0; l<loops; l++)
+ {
+
+ for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
+ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ }
+
+ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
+ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ }
+ gettimeofday( &start, (struct timezone *)0);
+
+ result = DOT (&m, x, &inc_x, y, &inc_y );
+
+ gettimeofday( &stop, (struct timezone *)0);
+
+ time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+
+ timeg += time1;
+
+ }
+
+ timeg /= loops;
+
+ fprintf(stderr,
+ " %10.2f MFlops\n",
+ COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
+
+ }
+
+ return 0;
+}
+
+int main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
diff --git a/benchmark/gemm.c b/benchmark/gemm.c
index fc482c075..4f9a58825 100644
--- a/benchmark/gemm.c
+++ b/benchmark/gemm.c
@@ -142,7 +142,9 @@ int MAIN__(int argc, char *argv[]){
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
- fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step);
+ if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
+
+ fprintf(stderr, "From : %3d To : %3d Step=%d : Trans=%c\n", from, to, step, trans);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
diff --git a/benchmark/gemm3m.c b/benchmark/gemm3m.c
new file mode 100644
index 000000000..048d74be6
--- /dev/null
+++ b/benchmark/gemm3m.c
@@ -0,0 +1,212 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef __CYGWIN32__
+#include <sys/time.h>
+#endif
+#include "common.h"
+
+
+#undef GEMM
+
+#ifndef COMPLEX
+
+#ifdef DOUBLE
+#define GEMM BLASFUNC(dgemm)
+#else
+#define GEMM BLASFUNC(sgemm)
+#endif
+
+#else
+
+#ifdef DOUBLE
+#define GEMM BLASFUNC(zgemm3m)
+#else
+#define GEMM BLASFUNC(cgemm3m)
+#endif
+
+#endif
+
+#if defined(__WIN32__) || defined(__WIN64__)
+
+#ifndef DELTA_EPOCH_IN_MICROSECS
+#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
+#endif
+
+int gettimeofday(struct timeval *tv, void *tz){
+
+ FILETIME ft;
+ unsigned __int64 tmpres = 0;
+ static int tzflag;
+
+ if (NULL != tv)
+ {
+ GetSystemTimeAsFileTime(&ft);
+
+ tmpres |= ft.dwHighDateTime;
+ tmpres <<= 32;
+ tmpres |= ft.dwLowDateTime;
+
+ /*converting file time to unix epoch*/
+ tmpres /= 10; /*convert into microseconds*/
+ tmpres -= DELTA_EPOCH_IN_MICROSECS;
+ tv->tv_sec = (long)(tmpres / 1000000UL);
+ tv->tv_usec = (long)(tmpres % 1000000UL);
+ }
+
+ return 0;
+}
+
+#endif
+
+#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
+
+static void *huge_malloc(BLASLONG size){
+ int shmid;
+ void *address;
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 04000
+#endif
+
+ if ((shmid =shmget(IPC_PRIVATE,
+ (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
+ SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
+ printf( "Memory allocation failed(shmget).\n");
+ exit(1);
+ }
+
+ address = shmat(shmid, NULL, SHM_RND);
+
+ if ((BLASLONG)address == -1){
+ printf( "Memory allocation failed(shmat).\n");
+ exit(1);
+ }
+
+ shmctl(shmid, IPC_RMID, 0);
+
+ return address;
+}
+
+#define malloc huge_malloc
+
+#endif
+
+int MAIN__(int argc, char *argv[]){
+
+ FLOAT *a, *b, *c;
+ FLOAT alpha[] = {1.0, 1.0};
+ FLOAT beta [] = {1.0, 1.0};
+ char trans='N';
+ blasint m, i, j;
+ int loops = 1;
+ int l;
+ char *p;
+
+ int from = 1;
+ int to = 200;
+ int step = 1;
+
+ struct timeval start, stop;
+ double time1,timeg;
+
+ argc--;argv++;
+
+ if (argc > 0) { from = atol(*argv); argc--; argv++;}
+ if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
+ if (argc > 0) { step = atol(*argv); argc--; argv++;}
+
+ if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
+
+ fprintf(stderr, "From : %3d To : %3d Step=%d : Trans=%c\n", from, to, step, trans);
+
+ if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+ if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+ if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+ p = getenv("OPENBLAS_LOOPS");
+ if ( p != NULL )
+ loops = atoi(p);
+
+
+#ifdef linux
+ srandom(getpid());
+#endif
+
+ fprintf(stderr, " SIZE Flops\n");
+
+ for(m = from; m <= to; m += step)
+ {
+
+ timeg=0;
+
+ fprintf(stderr, " %6d : ", (int)m);
+
+ for (l=0; l<loops; l++)
+ {
+
+ for(j = 0; j < m; j++){
+ for(i = 0; i < m * COMPSIZE; i++){
+ a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ }
+ }
+
+ gettimeofday( &start, (struct timezone *)0);
+
+ GEMM (&trans, &trans, &m, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
+
+ gettimeofday( &stop, (struct timezone *)0);
+
+ time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+
+ timeg += time1;
+
+ }
+
+ timeg /= loops;
+ fprintf(stderr,
+ " %10.2f MFlops\n",
+ COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)m / timeg * 1.e-6);
+
+ }
+
+ return 0;
+}
+
+void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
diff --git a/benchmark/gemv.c b/benchmark/gemv.c
index e26a36ac1..e21868259 100644
--- a/benchmark/gemv.c
+++ b/benchmark/gemv.c
@@ -128,6 +128,7 @@ int MAIN__(int argc, char *argv[]){
blasint inc_x=1,inc_y=1;
blasint n=0;
int has_param_n = 0;
+ int has_param_m = 0;
int loops = 1;
int l;
char *p;
@@ -145,29 +146,38 @@ int MAIN__(int argc, char *argv[]){
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
+
+ int tomax = to;
+
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
if ((p = getenv("OPENBLAS_PARAM_N"))) {
n = atoi(p);
- if ((n>0) && (n<=to)) has_param_n = 1;
+ if ((n>0)) has_param_n = 1;
+ if ( n > tomax ) tomax = n;
}
+ if ( has_param_n == 0 )
+ if ((p = getenv("OPENBLAS_PARAM_M"))) {
+ m = atoi(p);
+ if ((m>0)) has_param_m = 1;
+ if ( m > tomax ) tomax = m;
+ }
- if ( has_param_n == 1 )
- fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' N = %d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,n,inc_x,inc_y,loops);
- else
- fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops);
- if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
+
+ fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops);
+
+ if (( a = (FLOAT *)malloc(sizeof(FLOAT) * tomax * tomax * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
- if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
+ if (( x = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
- if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
+ if (( y = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
@@ -177,50 +187,80 @@ int MAIN__(int argc, char *argv[]){
fprintf(stderr, " SIZE Flops\n");
- for(m = from; m <= to; m += step)
+ if (has_param_m == 0)
{
- timeg=0;
+ for(m = from; m <= to; m += step)
+ {
+ timeg=0;
+ if ( has_param_n == 0 ) n = m;
+ fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
+ for(j = 0; j < m; j++){
+ for(i = 0; i < n * COMPSIZE; i++){
+ a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ }
+ }
- if ( has_param_n == 0 ) n = m;
+ for (l=0; l<loops; l++)
+ {
- fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
+ for(i = 0; i < n * COMPSIZE * abs(inc_x); i++){
+ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ }
- for(j = 0; j < m; j++){
- for(i = 0; i < n * COMPSIZE; i++){
- a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
- }
- }
+ for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){
+ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ }
+ gettimeofday( &start, (struct timezone *)0);
+ GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
+ gettimeofday( &stop, (struct timezone *)0);
+ time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ timeg += time1;
+ }
- for (l=0; l<loops; l++)
- {
+ timeg /= loops;
- for(i = 0; i < n * COMPSIZE * abs(inc_x); i++){
- x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
- }
+ fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6);
- for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){
- y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
- }
- gettimeofday( &start, (struct timezone *)0);
+ }
+ }
+ else
+ {
- GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
+ for(n = from; n <= to; n += step)
+ {
+ timeg=0;
+ fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
+ for(j = 0; j < m; j++){
+ for(i = 0; i < n * COMPSIZE; i++){
+ a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ }
+ }
- gettimeofday( &stop, (struct timezone *)0);
+ for (l=0; l<loops; l++)
+ {
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ for(i = 0; i < n * COMPSIZE * abs(inc_x); i++){
+ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ }
- timeg += time1;
+ for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){
+ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ }
+ gettimeofday( &start, (struct timezone *)0);
+ GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
+ gettimeofday( &stop, (struct timezone *)0);
+ time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ timeg += time1;
- }
+ }
- timeg /= loops;
+ timeg /= loops;
- fprintf(stderr,
- " %10.2f MFlops\n",
- COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6);
+ fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6);
+ }
}
return 0;
diff --git a/benchmark/hemv.c b/benchmark/hemv.c
new file mode 100644
index 000000000..79b7679cc
--- /dev/null
+++ b/benchmark/hemv.c
@@ -0,0 +1,208 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef __CYGWIN32__
+#include <sys/time.h>
+#endif
+#include "common.h"
+
+
+#undef HEMV
+
+
+#ifdef DOUBLE
+#define HEMV BLASFUNC(zhemv)
+#else
+#define HEMV BLASFUNC(chemv)
+#endif
+
+
+#if defined(__WIN32__) || defined(__WIN64__)
+
+#ifndef DELTA_EPOCH_IN_MICROSECS
+#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
+#endif
+
+int gettimeofday(struct timeval *tv, void *tz){
+
+ FILETIME ft;
+ unsigned __int64 tmpres = 0;
+ static int tzflag;
+
+ if (NULL != tv)
+ {
+ GetSystemTimeAsFileTime(&ft);
+
+ tmpres |= ft.dwHighDateTime;
+ tmpres <<= 32;
+ tmpres |= ft.dwLowDateTime;
+
+ /*converting file time to unix epoch*/
+ tmpres /= 10; /*convert into microseconds*/
+ tmpres -= DELTA_EPOCH_IN_MICROSECS;
+ tv->tv_sec = (long)(tmpres / 1000000UL);
+ tv->tv_usec = (long)(tmpres % 1000000UL);
+ }
+
+ return 0;
+}
+
+#endif
+
+#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
+
+static void *huge_malloc(BLASLONG size){
+ int shmid;
+ void *address;
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 04000
+#endif
+
+ if ((shmid =shmget(IPC_PRIVATE,
+ (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
+ SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
+ printf( "Memory allocation failed(shmget).\n");
+ exit(1);
+ }
+
+ address = shmat(shmid, NULL, SHM_RND);
+
+ if ((BLASLONG)address == -1){
+ printf( "Memory allocation failed(shmat).\n");
+ exit(1);
+ }
+
+ shmctl(shmid, IPC_RMID, 0);
+
+ return address;
+}
+
+#define malloc huge_malloc
+
+#endif
+
+int MAIN__(int argc, char *argv[]){
+
+ FLOAT *a, *x, *y;
+ FLOAT alpha[] = {1.0, 1.0};
+ FLOAT beta [] = {1.0, 1.0};
+ char uplo='L';
+ blasint m, i, j;
+ blasint inc_x=1,inc_y=1;
+ int loops = 1;
+ int l;
+ char *p;
+
+ int from = 1;
+ int to = 200;
+ int step = 1;
+
+ struct timeval start, stop;
+ double time1,timeg;
+
+ argc--;argv++;
+
+ if (argc > 0) { from = atol(*argv); argc--; argv++;}
+ if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
+ if (argc > 0) { step = atol(*argv); argc--; argv++;}
+
+ if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
+ if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
+ if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
+ if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
+
+ fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops);
+
+ if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+ if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+ if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
+ fprintf(stderr,"Out of Memory!!\n");exit(1);
+ }
+
+#ifdef linux
+ srandom(getpid());
+#endif
+
+ fprintf(stderr, " SIZE Flops\n");
+
+ for(m = from; m <= to; m += step)
+ {
+
+ timeg=0;
+
+ fprintf(stderr, " %6dx%d : ", (int)m,(int)m);
+
+ for(j = 0; j < m; j++){
+ for(i = 0; i < m * COMPSIZE; i++){
+ a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ }
+ }
+
+
+ for (l=0; l<loops; l++)
+ {
+
+ for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
+ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ }
+
+ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
+ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+ }
+ gettimeofday( &start, (struct timezone *)0);
+
+ HEMV (&uplo, &m, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
+
+ gettimeofday( &stop, (struct timezone *)0);
+
+ time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+
+ timeg += time1;
+
+ }
+
+ timeg /= loops;
+
+ fprintf(stderr,
+ " %10.2f MFlops\n",
+ COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6);
+
+ }
+
+ return 0;
+}
+
+void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
diff --git a/benchmark/tplot-header b/benchmark/tplot-header
new file mode 100644
index 000000000..b7ce7f225
--- /dev/null
+++ b/benchmark/tplot-header
@@ -0,0 +1,42 @@
+# **********************************************************************************
+# Copyright (c) 2014, The OpenBLAS Project
+# All rights reserved.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# 3. Neither the name of the OpenBLAS project nor the names of
+# its contributors may be used to endorse or promote products
+# derived from this software without specific prior written permission.
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# **********************************************************************************
+
+set term x11 font sans;
+set ylabel "MFlops";
+set xlabel "Size";
+set grid xtics;
+set grid ytics;
+set key left;
+set timestamp "generated on %Y-%m-%d by `whoami`"
+set title "Sgemv\nTRANS=T\nBulldozer"
+plot '1-THREAD' smooth bezier, '2-THREADS' smooth bezier, '4-THREADS' smooth bezier;
+set output "print.png";
+show title;
+show plot;
+show output;
+
+
diff --git a/cblas.h b/cblas.h
index ef072e6ff..d1c029afa 100644
--- a/cblas.h
+++ b/cblas.h
@@ -243,8 +243,13 @@ void cblas_dgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLA
OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
void cblas_cgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float *beta, float *C, OPENBLAS_CONST blasint ldc);
+void cblas_cgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
+ OPENBLAS_CONST float *alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float *beta, float *C, OPENBLAS_CONST blasint ldc);
void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double *beta, double *C, OPENBLAS_CONST blasint ldc);
+void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
+ OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double *beta, double *C, OPENBLAS_CONST blasint ldc);
+
void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
@@ -318,6 +323,24 @@ void cblas_caxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *alpha, OPENBLA
void cblas_zaxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST double *beta, double *y, OPENBLAS_CONST blasint incy);
+void cblas_somatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, OPENBLAS_CONST float *a,
+ OPENBLAS_CONST blasint clda, float *b, OPENBLAS_CONST blasint cldb);
+void cblas_domatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, OPENBLAS_CONST double *a,
+ OPENBLAS_CONST blasint clda, double *b, OPENBLAS_CONST blasint cldb);
+void cblas_comatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float* calpha, OPENBLAS_CONST float* a,
+ OPENBLAS_CONST blasint clda, float*b, OPENBLAS_CONST blasint cldb);
+void cblas_zomatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, OPENBLAS_CONST double* a,
+ OPENBLAS_CONST blasint clda, double *b, OPENBLAS_CONST blasint cldb);
+
+void cblas_simatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, float *a,
+ OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb);
+void cblas_dimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, double *a,
+ OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb);
+void cblas_cimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float* calpha, float* a,
+ OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb);
+void cblas_zimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, double* a,
+ OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb);
+
#ifdef __cplusplus
}
#endif /* __cplusplus */
diff --git a/cblas_noconst.h b/cblas_noconst.h
index 1f79e8188..bc6382513 100644
--- a/cblas_noconst.h
+++ b/cblas_noconst.h
@@ -231,8 +231,12 @@ void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS
double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
+void cblas_cgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
+ float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
+void cblas_zgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
+ double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
@@ -306,7 +310,23 @@ void cblas_caxpby(blasint n, float *alpha, float *x, blasint incx,float *beta, f
void cblas_zaxpby(blasint n, double *alpha, double *x, blasint incx,double *beta, double *y, blasint incy);
-
+void cblas_somatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, float calpha, float *a,
+ blasint clda, float *b, blasint cldb);
+void cblas_domatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, double calpha, double *a,
+ blasint clda, double *b, blasint cldb);
+void cblas_comatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, void* calpha, void* a,
+ blasint clda, void *b, blasint cldb);
+void cblas_zomatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, void* calpha, void* a,
+ blasint clda, void *b, blasint cldb);
+
+void cblas_simatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, float calpha, float *a,
+ blasint clda, blasint cldb);
+void cblas_dimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, double calpha, double *a,
+ blasint clda, blasint cldb);
+void cblas_cimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, float* calpha, float* a,
+ blasint clda, blasint cldb);
+void cblas_zimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, double* calpha, double* a,
+ blasint clda, blasint cldb);
#ifdef __cplusplus
}
#endif /* __cplusplus */
diff --git a/common_param.h b/common_param.h
index 1c362e8cb..49c1bf73b 100644
--- a/common_param.h
+++ b/common_param.h
@@ -435,6 +435,9 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
int (*chemm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*chemm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
+ int cgemm3m_p, cgemm3m_q, cgemm3m_r;
+ int cgemm3m_unroll_m, cgemm3m_unroll_n, cgemm3m_unroll_mn;
+
int (*cgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);
int (*cgemm3m_incopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float *);
@@ -595,6 +598,9 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG);
int (*zhemm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
int (*zhemm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *);
+ int zgemm3m_p, zgemm3m_q, zgemm3m_r;
+ int zgemm3m_unroll_m, zgemm3m_unroll_n, zgemm3m_unroll_mn;
+
int (*zgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG);
int (*zgemm3m_incopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double *);
@@ -757,6 +763,9 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
int (*xhemm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
int (*xhemm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *);
+ int xgemm3m_p, xgemm3m_q, xgemm3m_r;
+ int xgemm3m_unroll_m, xgemm3m_unroll_n, xgemm3m_unroll_mn;
+
int (*xgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG);
int (*xgemm3m_incopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *);
@@ -900,6 +909,27 @@ extern gotoblas_t *gotoblas;
#define XGEMM_UNROLL_N gotoblas -> xgemm_unroll_n
#define XGEMM_UNROLL_MN gotoblas -> xgemm_unroll_mn
+#define CGEMM3M_P gotoblas -> cgemm3m_p
+#define CGEMM3M_Q gotoblas -> cgemm3m_q
+#define CGEMM3M_R gotoblas -> cgemm3m_r
+#define CGEMM3M_UNROLL_M gotoblas -> cgemm3m_unroll_m
+#define CGEMM3M_UNROLL_N gotoblas -> cgemm3m_unroll_n
+#define CGEMM3M_UNROLL_MN gotoblas -> cgemm3m_unroll_mn
+
+#define ZGEMM3M_P gotoblas -> zgemm3m_p
+#define ZGEMM3M_Q gotoblas -> zgemm3m_q
+#define ZGEMM3M_R gotoblas -> zgemm3m_r
+#define ZGEMM3M_UNROLL_M gotoblas -> zgemm3m_unroll_m
+#define ZGEMM3M_UNROLL_N gotoblas -> zgemm3m_unroll_n
+#define ZGEMM3M_UNROLL_MN gotoblas -> zgemm3m_unroll_mn
+
+#define XGEMM3M_P gotoblas -> xgemm3m_p
+#define XGEMM3M_Q gotoblas -> xgemm3m_q
+#define XGEMM3M_R gotoblas -> xgemm3m_r
+#define XGEMM3M_UNROLL_M gotoblas -> xgemm3m_unroll_m
+#define XGEMM3M_UNROLL_N gotoblas -> xgemm3m_unroll_n
+#define XGEMM3M_UNROLL_MN gotoblas -> xgemm3m_unroll_mn
+
#else
#define DTB_ENTRIES DTB_DEFAULT_ENTRIES
@@ -972,6 +1002,55 @@ extern gotoblas_t *gotoblas;
#define XGEMM_UNROLL_N XGEMM_DEFAULT_UNROLL_N
#define XGEMM_UNROLL_MN MAX((XGEMM_UNROLL_M), (XGEMM_UNROLL_N))
+#ifdef CGEMM3M_DEFAULT_UNROLL_N
+
+#define CGEMM3M_P CGEMM3M_DEFAULT_P
+#define CGEMM3M_Q CGEMM3M_DEFAULT_Q
+#define CGEMM3M_R CGEMM3M_DEFAULT_R
+#define CGEMM3M_UNROLL_M CGEMM3M_DEFAULT_UNROLL_M
+#define CGEMM3M_UNROLL_N CGEMM3M_DEFAULT_UNROLL_N
+#define CGEMM3M_UNROLL_MN MAX((CGEMM3M_UNROLL_M), (CGEMM3M_UNROLL_N))
+
+#else
+
+#define CGEMM3M_P SGEMM_DEFAULT_P
+#define CGEMM3M_Q SGEMM_DEFAULT_Q
+#define CGEMM3M_R SGEMM_DEFAULT_R
+#define CGEMM3M_UNROLL_M SGEMM_DEFAULT_UNROLL_M
+#define CGEMM3M_UNROLL_N SGEMM_DEFAULT_UNROLL_N
+#define CGEMM3M_UNROLL_MN MAX((CGEMM_UNROLL_M), (CGEMM_UNROLL_N))
+
+#endif
+
+
+#ifdef ZGEMM3M_DEFAULT_UNROLL_N
+
+#define ZGEMM3M_P ZGEMM3M_DEFAULT_P
+#define ZGEMM3M_Q ZGEMM3M_DEFAULT_Q
+#define ZGEMM3M_R ZGEMM3M_DEFAULT_R
+#define ZGEMM3M_UNROLL_M ZGEMM3M_DEFAULT_UNROLL_M
+#define ZGEMM3M_UNROLL_N ZGEMM3M_DEFAULT_UNROLL_N
+#define ZGEMM3M_UNROLL_MN MAX((ZGEMM_UNROLL_M), (ZGEMM_UNROLL_N))
+
+#else
+
+#define ZGEMM3M_P DGEMM_DEFAULT_P
+#define ZGEMM3M_Q DGEMM_DEFAULT_Q
+#define ZGEMM3M_R DGEMM_DEFAULT_R
+#define ZGEMM3M_UNROLL_M DGEMM_DEFAULT_UNROLL_M
+#define ZGEMM3M_UNROLL_N DGEMM_DEFAULT_UNROLL_N
+#define ZGEMM3M_UNROLL_MN MAX((ZGEMM_UNROLL_M), (ZGEMM_UNROLL_N))
+
+#endif
+
+#define XGEMM3M_P QGEMM_DEFAULT_P
+#define XGEMM3M_Q QGEMM_DEFAULT_Q
+#define XGEMM3M_R QGEMM_DEFAULT_R
+#define XGEMM3M_UNROLL_M QGEMM_DEFAULT_UNROLL_M
+#define XGEMM3M_UNROLL_N QGEMM_DEFAULT_UNROLL_N
+#define XGEMM3M_UNROLL_MN MAX((QGEMM_UNROLL_M), (QGEMM_UNROLL_N))
+
+
#endif
#endif
@@ -1054,14 +1133,14 @@ extern gotoblas_t *gotoblas;
#endif
#ifdef XDOUBLE
-#define GEMM3M_UNROLL_M QGEMM_UNROLL_M
-#define GEMM3M_UNROLL_N QGEMM_UNROLL_N
+#define GEMM3M_UNROLL_M XGEMM3M_UNROLL_M
+#define GEMM3M_UNROLL_N XGEMM3M_UNROLL_N
#elif defined(DOUBLE)
-#define GEMM3M_UNROLL_M DGEMM_UNROLL_M
-#define GEMM3M_UNROLL_N DGEMM_UNROLL_N
+#define GEMM3M_UNROLL_M ZGEMM3M_UNROLL_M
+#define GEMM3M_UNROLL_N ZGEMM3M_UNROLL_N
#else
-#define GEMM3M_UNROLL_M SGEMM_UNROLL_M
-#define GEMM3M_UNROLL_N SGEMM_UNROLL_N
+#define GEMM3M_UNROLL_M CGEMM3M_UNROLL_M
+#define GEMM3M_UNROLL_N CGEMM3M_UNROLL_N
#endif
@@ -1123,31 +1202,31 @@ extern gotoblas_t *gotoblas;
#ifndef GEMM3M_P
#ifdef XDOUBLE
-#define GEMM3M_P QGEMM_P
+#define GEMM3M_P XGEMM3M_P
#elif defined(DOUBLE)
-#define GEMM3M_P DGEMM_P
+#define GEMM3M_P ZGEMM3M_P
#else
-#define GEMM3M_P SGEMM_P
+#define GEMM3M_P CGEMM3M_P
#endif
#endif
#ifndef GEMM3M_Q
#ifdef XDOUBLE
-#define GEMM3M_Q QGEMM_Q
+#define GEMM3M_Q XGEMM3M_Q
#elif defined(DOUBLE)
-#define GEMM3M_Q DGEMM_Q
+#define GEMM3M_Q ZGEMM3M_Q
#else
-#define GEMM3M_Q SGEMM_Q
+#define GEMM3M_Q CGEMM3M_Q
#endif
#endif
#ifndef GEMM3M_R
#ifdef XDOUBLE
-#define GEMM3M_R QGEMM_R
+#define GEMM3M_R XGEMM3M_R
#elif defined(DOUBLE)
-#define GEMM3M_R DGEMM_R
+#define GEMM3M_R ZGEMM3M_R
#else
-#define GEMM3M_R SGEMM_R
+#define GEMM3M_R CGEMM3M_R
#endif
#endif
diff --git a/common_x86_64.h b/common_x86_64.h
index 0f842ee94..547614f74 100644
--- a/common_x86_64.h
+++ b/common_x86_64.h
@@ -46,6 +46,7 @@
#define __volatile__
#endif
+/*
#ifdef HAVE_SSE2
#define MB __asm__ __volatile__ ("mfence");
#define WMB __asm__ __volatile__ ("sfence");
@@ -53,6 +54,10 @@
#define MB
#define WMB
#endif
+*/
+
+#define MB
+#define WMB
static void __inline blas_lock(volatile BLASULONG *address){
@@ -99,7 +104,9 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
: "0" (op));
}
+/*
#define WHEREAMI
+*/
static inline int WhereAmI(void){
int eax, ebx, ecx, edx;
@@ -111,6 +118,7 @@ static inline int WhereAmI(void){
return apicid;
}
+
#ifdef CORE_BARCELONA
#define IFLUSH gotoblas_iflush()
#define IFLUSH_HALF gotoblas_iflush_half()
diff --git a/cpuid_x86.c b/cpuid_x86.c
index 53016e1e7..f9df7221b 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -59,9 +59,16 @@
void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx);
#else
static inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
+#if defined(__i386__) && defined(__PIC__)
+ __asm__ __volatile__
+ ("mov %%ebx, %%edi;"
+ "cpuid;"
+ "xchgl %%ebx, %%edi;"
+ : "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc");
+#else
__asm__ __volatile__
("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc");
-
+#endif
}
#endif
diff --git a/ctest/Makefile b/ctest/Makefile
index 70d3f9712..1d9567150 100644
--- a/ctest/Makefile
+++ b/ctest/Makefile
@@ -74,6 +74,18 @@ else
OPENBLAS_NUM_THREADS=2 ./xzcblat3 < zin3
endif
+all3_3m: xzcblat3_3m xccblat3_3m
+ifeq ($(USE_OPENMP), 1)
+ OMP_NUM_THREADS=2 ./xccblat3_3m < cin3_3m
+ OMP_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m
+else
+ OPENBLAS_NUM_THREADS=2 ./xccblat3_3m < cin3_3m
+ OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m
+endif
+
+
+
+
clean ::
rm -f x*
@@ -103,6 +115,9 @@ xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME)
xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
+xccblat3_3m: $(ctestl3o) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME)
+ $(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
+
# Double complex
xzcblat1: $(ztestl1o) c_zblat1.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xzcblat1 c_zblat1.o $(ztestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
@@ -111,4 +126,9 @@ xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME)
xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
+
+xzcblat3_3m: $(ztestl3o) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME)
+ $(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
+
+
include $(TOPDIR)/Makefile.tail
diff --git a/ctest/c_c3chke.c b/ctest/c_c3chke.c
index 1c133fb9b..4d5de5150 100644
--- a/ctest/c_c3chke.c
+++ b/ctest/c_c3chke.c
@@ -45,8 +45,238 @@ void F77_c3chke(char * rout) {
F77_xerbla(cblas_rout,&cblas_info);
}
- if (strncmp( sf,"cblas_cgemm" ,11)==0) {
- cblas_rout = "cblas_cgemm" ;
+
+ if (strncmp( sf,"cblas_cgemm3m" ,13)==0) {
+ cblas_rout = "cblas_cgemm3m" ;
+
+ cblas_info = 1;
+ cblas_cgemm3m( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 1;
+ cblas_cgemm3m( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 1;
+ cblas_cgemm3m( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 1;
+ cblas_cgemm3m( INVALID, CblasTrans, CblasTrans, 0, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 2; RowMajorStrg = FALSE;
+ cblas_cgemm3m( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 2; RowMajorStrg = FALSE;
+ cblas_cgemm3m( CblasColMajor, INVALID, CblasTrans, 0, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 3; RowMajorStrg = FALSE;
+ cblas_cgemm3m( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 3; RowMajorStrg = FALSE;
+ cblas_cgemm3m( CblasColMajor, CblasTrans, INVALID, 0, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 4; RowMajorStrg = FALSE;
+ cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 4; RowMajorStrg = FALSE;
+ cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 4; RowMajorStrg = FALSE;
+ cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 4; RowMajorStrg = FALSE;
+ cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 5; RowMajorStrg = FALSE;
+ cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 5; RowMajorStrg = FALSE;
+ cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 5; RowMajorStrg = FALSE;
+ cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 5; RowMajorStrg = FALSE;
+ cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 6; RowMajorStrg = FALSE;
+ cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 6; RowMajorStrg = FALSE;
+ cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 6; RowMajorStrg = FALSE;
+ cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 6; RowMajorStrg = FALSE;
+ cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 9; RowMajorStrg = FALSE;
+ cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 2 );
+ chkxer();
+ cblas_info = 9; RowMajorStrg = FALSE;
+ cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 2 );
+ chkxer();
+ cblas_info = 9; RowMajorStrg = FALSE;
+ cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2,
+ ALPHA, A, 1, B, 2, BETA, C, 1 );
+ chkxer();
+ cblas_info = 9; RowMajorStrg = FALSE;
+ cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 11; RowMajorStrg = FALSE;
+ cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 11; RowMajorStrg = FALSE;
+ cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2,
+ ALPHA, A, 2, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 11; RowMajorStrg = FALSE;
+ cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 11; RowMajorStrg = FALSE;
+ cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 14; RowMajorStrg = FALSE;
+ cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0,
+ ALPHA, A, 2, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 14; RowMajorStrg = FALSE;
+ cblas_cgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0,
+ ALPHA, A, 2, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 14; RowMajorStrg = FALSE;
+ cblas_cgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 14; RowMajorStrg = FALSE;
+ cblas_cgemm3m( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 4; RowMajorStrg = TRUE;
+ cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 4; RowMajorStrg = TRUE;
+ cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 4; RowMajorStrg = TRUE;
+ cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 4; RowMajorStrg = TRUE;
+ cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 5; RowMajorStrg = TRUE;
+ cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 5; RowMajorStrg = TRUE;
+ cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 5; RowMajorStrg = TRUE;
+ cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 5; RowMajorStrg = TRUE;
+ cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 6; RowMajorStrg = TRUE;
+ cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 6; RowMajorStrg = TRUE;
+ cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 6; RowMajorStrg = TRUE;
+ cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 6; RowMajorStrg = TRUE;
+ cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 9; RowMajorStrg = TRUE;
+ cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2,
+ ALPHA, A, 1, B, 1, BETA, C, 2 );
+ chkxer();
+ cblas_info = 9; RowMajorStrg = TRUE;
+ cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2,
+ ALPHA, A, 1, B, 2, BETA, C, 2 );
+ chkxer();
+ cblas_info = 9; RowMajorStrg = TRUE;
+ cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0,
+ ALPHA, A, 1, B, 2, BETA, C, 1 );
+ chkxer();
+ cblas_info = 9; RowMajorStrg = TRUE;
+ cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 11; RowMajorStrg = TRUE;
+ cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 11; RowMajorStrg = TRUE;
+ cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0,
+ ALPHA, A, 2, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 11; RowMajorStrg = TRUE;
+ cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2,
+ ALPHA, A, 2, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 11; RowMajorStrg = TRUE;
+ cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 14; RowMajorStrg = TRUE;
+ cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0,
+ ALPHA, A, 1, B, 2, BETA, C, 1 );
+ chkxer();
+ cblas_info = 14; RowMajorStrg = TRUE;
+ cblas_cgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 14; RowMajorStrg = TRUE;
+ cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0,
+ ALPHA, A, 1, B, 2, BETA, C, 1 );
+ chkxer();
+ cblas_info = 14; RowMajorStrg = TRUE;
+ cblas_cgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+
+ } else if (strncmp( sf,"cblas_cgemm" ,11)==0) {
+ cblas_rout = "cblas_cgemm" ;
+
cblas_info = 1;
cblas_cgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0,
diff --git a/ctest/c_cblas3.c b/ctest/c_cblas3.c
index 0b2f6b966..f1b108c64 100644
--- a/ctest/c_cblas3.c
+++ b/ctest/c_cblas3.c
@@ -88,6 +88,7 @@ void F77_cgemm(int *order, char *transpa, char *transpb, int *m, int *n,
cblas_cgemm( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda,
b, *ldb, beta, c, *ldc );
}
+
void F77_chemm(int *order, char *rtlf, char *uplow, int *m, int *n,
CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda,
CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta,
@@ -563,3 +564,84 @@ void F77_ctrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
cblas_ctrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha,
a, *lda, b, *ldb);
}
+
+
+
+void F77_cgemm3m(int *order, char *transpa, char *transpb, int *m, int *n,
+ int *k, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda,
+ CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta,
+ CBLAS_TEST_COMPLEX *c, int *ldc ) {
+
+ CBLAS_TEST_COMPLEX *A, *B, *C;
+ int i,j,LDA, LDB, LDC;
+ enum CBLAS_TRANSPOSE transa, transb;
+
+ get_transpose_type(transpa, &transa);
+ get_transpose_type(transpb, &transb);
+
+ if (*order == TEST_ROW_MJR) {
+ if (transa == CblasNoTrans) {
+ LDA = *k+1;
+ A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX));
+ for( i=0; i<*m; i++ )
+ for( j=0; j<*k; j++ ) {
+ A[i*LDA+j].real=a[j*(*lda)+i].real;
+ A[i*LDA+j].imag=a[j*(*lda)+i].imag;
+ }
+ }
+ else {
+ LDA = *m+1;
+ A=(CBLAS_TEST_COMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_COMPLEX));
+ for( i=0; i<*k; i++ )
+ for( j=0; j<*m; j++ ) {
+ A[i*LDA+j].real=a[j*(*lda)+i].real;
+ A[i*LDA+j].imag=a[j*(*lda)+i].imag;
+ }
+ }
+
+ if (transb == CblasNoTrans) {
+ LDB = *n+1;
+ B=(CBLAS_TEST_COMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_COMPLEX) );
+ for( i=0; i<*k; i++ )
+ for( j=0; j<*n; j++ ) {
+ B[i*LDB+j].real=b[j*(*ldb)+i].real;
+ B[i*LDB+j].imag=b[j*(*ldb)+i].imag;
+ }
+ }
+ else {
+ LDB = *k+1;
+ B=(CBLAS_TEST_COMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_COMPLEX));
+ for( i=0; i<*n; i++ )
+ for( j=0; j<*k; j++ ) {
+ B[i*LDB+j].real=b[j*(*ldb)+i].real;
+ B[i*LDB+j].imag=b[j*(*ldb)+i].imag;
+ }
+ }
+
+ LDC = *n+1;
+ C=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_COMPLEX));
+ for( j=0; j<*n; j++ )
+ for( i=0; i<*m; i++ ) {
+ C[i*LDC+j].real=c[j*(*ldc)+i].real;
+ C[i*LDC+j].imag=c[j*(*ldc)+i].imag;
+ }
+ cblas_cgemm3m( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA,
+ B, LDB, beta, C, LDC );
+ for( j=0; j<*n; j++ )
+ for( i=0; i<*m; i++ ) {
+ c[j*(*ldc)+i].real=C[i*LDC+j].real;
+ c[j*(*ldc)+i].imag=C[i*LDC+j].imag;
+ }
+ free(A);
+ free(B);
+ free(C);
+ }
+ else if (*order == TEST_COL_MJR)
+ cblas_cgemm3m( CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda,
+ b, *ldb, beta, c, *ldc );
+ else
+ cblas_cgemm3m( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda,
+ b, *ldb, beta, c, *ldc );
+}
+
+
diff --git a/ctest/c_cblat3_3m.f b/ctest/c_cblat3_3m.f
new file mode 100644
index 000000000..68dd49859
--- /dev/null
+++ b/ctest/c_cblat3_3m.f
@@ -0,0 +1,2786 @@
+ PROGRAM CBLAT3
+*
+* Test program for the COMPLEX Level 3 Blas.
+*
+* The program must be driven by a short data file. The first 13 records
+* of the file are read using list-directed input, the last 9 records
+* are read using the format ( A13, L2 ). An annotated example of a data
+* file can be obtained by deleting the first 3 characters from the
+* following 22 lines:
+* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
+* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+* F LOGICAL FLAG, T TO STOP ON FAILURES.
+* T LOGICAL FLAG, T TO TEST ERROR EXITS.
+* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
+* 16.0 THRESHOLD VALUE OF TEST RATIO
+* 6 NUMBER OF VALUES OF N
+* 0 1 2 3 5 9 VALUES OF N
+* 3 NUMBER OF VALUES OF ALPHA
+* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA
+* 3 NUMBER OF VALUES OF BETA
+* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA
+* cblas_cgemm3m T PUT F FOR NO TEST. SAME COLUMNS.
+* cblas_chemm T PUT F FOR NO TEST. SAME COLUMNS.
+* cblas_csymm T PUT F FOR NO TEST. SAME COLUMNS.
+* cblas_ctrmm T PUT F FOR NO TEST. SAME COLUMNS.
+* cblas_ctrsm T PUT F FOR NO TEST. SAME COLUMNS.
+* cblas_cherk T PUT F FOR NO TEST. SAME COLUMNS.
+* cblas_csyrk T PUT F FOR NO TEST. SAME COLUMNS.
+* cblas_cher2k T PUT F FOR NO TEST. SAME COLUMNS.
+* cblas_csyr2k T PUT F FOR NO TEST. SAME COLUMNS.
+*
+* See:
+*
+* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S.
+* A Set of Level 3 Basic Linear Algebra Subprograms.
+*
+* Technical Memorandum No.88 (Revision 1), Mathematics and
+* Computer Science Division, Argonne National Laboratory, 9700
+* South Cass Avenue, Argonne, Illinois 60439, US.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Parameters ..
+ INTEGER NIN, NOUT
+ PARAMETER ( NIN = 5, NOUT = 6 )
+ INTEGER NSUBS
+ PARAMETER ( NSUBS = 9 )
+ COMPLEX ZERO, ONE
+ PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) )
+ REAL RZERO, RHALF, RONE
+ PARAMETER ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 )
+ INTEGER NMAX
+ PARAMETER ( NMAX = 65 )
+ INTEGER NIDMAX, NALMAX, NBEMAX
+ PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 )
+* .. Local Scalars ..
+ REAL EPS, ERR, THRESH
+ INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NTRA,
+ $ LAYOUT
+ LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE,
+ $ TSTERR, CORDER, RORDER
+ CHARACTER*1 TRANSA, TRANSB
+ CHARACTER*13 SNAMET
+ CHARACTER*32 SNAPS
+* .. Local Arrays ..
+ COMPLEX AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ),
+ $ ALF( NALMAX ), AS( NMAX*NMAX ),
+ $ BB( NMAX*NMAX ), BET( NBEMAX ),
+ $ BS( NMAX*NMAX ), C( NMAX, NMAX ),
+ $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ),
+ $ W( 2*NMAX )
+ REAL G( NMAX )
+ INTEGER IDIM( NIDMAX )
+ LOGICAL LTEST( NSUBS )
+ CHARACTER*13 SNAMES( NSUBS )
+* .. External Functions ..
+ REAL SDIFF
+ LOGICAL LCE
+ EXTERNAL SDIFF, LCE
+* .. External Subroutines ..
+ EXTERNAL CCHK1, CCHK2, CCHK3, CCHK4, CCHK5, CMMCH
+* .. Intrinsic Functions ..
+ INTRINSIC MAX, MIN
+* .. Scalars in Common ..
+ INTEGER INFOT, NOUTC
+ LOGICAL LERR, OK
+ CHARACTER*13 SRNAMT
+* .. Common blocks ..
+ COMMON /INFOC/INFOT, NOUTC, OK, LERR
+ COMMON /SRNAMC/SRNAMT
+* .. Data statements ..
+ DATA SNAMES/'cblas_cgemm3m ', 'cblas_chemm ',
+ $ 'cblas_csymm ', 'cblas_ctrmm ', 'cblas_ctrsm ',
+ $ 'cblas_cherk ', 'cblas_csyrk ', 'cblas_cher2k',
+ $ 'cblas_csyr2k'/
+* .. Executable Statements ..
+*
+ NOUTC = NOUT
+*
+* Read name and unit number for snapshot output file and open file.
+*
+ READ( NIN, FMT = * )SNAPS
+ READ( NIN, FMT = * )NTRA
+ TRACE = NTRA.GE.0
+ IF( TRACE )THEN
+ OPEN( NTRA, FILE = SNAPS )
+ END IF
+* Read the flag that directs rewinding of the snapshot file.
+ READ( NIN, FMT = * )REWI
+ REWI = REWI.AND.TRACE
+* Read the flag that directs stopping on any failure.
+ READ( NIN, FMT = * )SFATAL
+* Read the flag that indicates whether error exits are to be tested.
+ READ( NIN, FMT = * )TSTERR
+* Read the flag that indicates whether row-major data layout to be tested.
+ READ( NIN, FMT = * )LAYOUT
+* Read the threshold value of the test ratio
+ READ( NIN, FMT = * )THRESH
+*
+* Read and check the parameter values for the tests.
+*
+* Values of N
+ READ( NIN, FMT = * )NIDIM
+ IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN
+ WRITE( NOUT, FMT = 9997 )'N', NIDMAX
+ GO TO 220
+ END IF
+ READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM )
+ DO 10 I = 1, NIDIM
+ IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN
+ WRITE( NOUT, FMT = 9996 )NMAX
+ GO TO 220
+ END IF
+ 10 CONTINUE
+* Values of ALPHA
+ READ( NIN, FMT = * )NALF
+ IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN
+ WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX
+ GO TO 220
+ END IF
+ READ( NIN, FMT = * )( ALF( I ), I = 1, NALF )
+* Values of BETA
+ READ( NIN, FMT = * )NBET
+ IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN
+ WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX
+ GO TO 220
+ END IF
+ READ( NIN, FMT = * )( BET( I ), I = 1, NBET )
+*
+* Report values of parameters.
+*
+ WRITE( NOUT, FMT = 9995 )
+ WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM )
+ WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF )
+ WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET )
+ IF( .NOT.TSTERR )THEN
+ WRITE( NOUT, FMT = * )
+ WRITE( NOUT, FMT = 9984 )
+ END IF
+ WRITE( NOUT, FMT = * )
+ WRITE( NOUT, FMT = 9999 )THRESH
+ WRITE( NOUT, FMT = * )
+
+ RORDER = .FALSE.
+ CORDER = .FALSE.
+ IF (LAYOUT.EQ.2) THEN
+ RORDER = .TRUE.
+ CORDER = .TRUE.
+ WRITE( *, FMT = 10002 )
+ ELSE IF (LAYOUT.EQ.1) THEN
+ RORDER = .TRUE.
+ WRITE( *, FMT = 10001 )
+ ELSE IF (LAYOUT.EQ.0) THEN
+ CORDER = .TRUE.
+ WRITE( *, FMT = 10000 )
+ END IF
+ WRITE( *, FMT = * )
+
+*
+* Read names of subroutines and flags which indicate
+* whether they are to be tested.
+*
+ DO 20 I = 1, NSUBS
+ LTEST( I ) = .FALSE.
+ 20 CONTINUE
+ 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT
+ DO 40 I = 1, NSUBS
+ IF( SNAMET.EQ.SNAMES( I ) )
+ $ GO TO 50
+ 40 CONTINUE
+ WRITE( NOUT, FMT = 9990 )SNAMET
+ STOP
+ 50 LTEST( I ) = LTESTT
+ GO TO 30
+*
+ 60 CONTINUE
+ CLOSE ( NIN )
+*
+* Compute EPS (the machine precision).
+*
+ EPS = RONE
+ 70 CONTINUE
+ IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO )
+ $ GO TO 80
+ EPS = RHALF*EPS
+ GO TO 70
+ 80 CONTINUE
+ EPS = EPS + EPS
+ WRITE( NOUT, FMT = 9998 )EPS
+*
+* Check the reliability of CMMCH using exact data.
+*
+ N = MIN( 32, NMAX )
+ DO 100 J = 1, N
+ DO 90 I = 1, N
+ AB( I, J ) = MAX( I - J + 1, 0 )
+ 90 CONTINUE
+ AB( J, NMAX + 1 ) = J
+ AB( 1, NMAX + J ) = J
+ C( J, 1 ) = ZERO
+ 100 CONTINUE
+ DO 110 J = 1, N
+ CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3
+ 110 CONTINUE
+* CC holds the exact result. On exit from CMMCH CT holds
+* the result computed by CMMCH.
+ TRANSA = 'N'
+ TRANSB = 'N'
+ CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+ $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+ $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+ SAME = LCE( CC, CT, N )
+ IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
+ WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+ STOP
+ END IF
+ TRANSB = 'C'
+ CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+ $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+ $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+ SAME = LCE( CC, CT, N )
+ IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
+ WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+ STOP
+ END IF
+ DO 120 J = 1, N
+ AB( J, NMAX + 1 ) = N - J + 1
+ AB( 1, NMAX + J ) = N - J + 1
+ 120 CONTINUE
+ DO 130 J = 1, N
+ CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 -
+ $ ( ( J + 1 )*J*( J - 1 ) )/3
+ 130 CONTINUE
+ TRANSA = 'C'
+ TRANSB = 'N'
+ CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+ $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+ $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+ SAME = LCE( CC, CT, N )
+ IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
+ WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+ STOP
+ END IF
+ TRANSB = 'C'
+ CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+ $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+ $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+ SAME = LCE( CC, CT, N )
+ IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
+ WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+ STOP
+ END IF
+*
+* Test each subroutine in turn.
+*
+ DO 200 ISNUM = 1, NSUBS
+ WRITE( NOUT, FMT = * )
+ IF( .NOT.LTEST( ISNUM ) )THEN
+* Subprogram is not to be tested.
+ WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM )
+ ELSE
+ SRNAMT = SNAMES( ISNUM )
+* Test error exits.
+ IF( TSTERR )THEN
+ CALL CC3CHKE( SNAMES( ISNUM ) )
+ WRITE( NOUT, FMT = * )
+ END IF
+* Test computations.
+ INFOT = 0
+ OK = .TRUE.
+ FATAL = .FALSE.
+ GO TO ( 140, 150, 150, 160, 160, 170, 170,
+ $ 180, 180 )ISNUM
+* Test CGEMM3M, 01.
+ 140 IF (CORDER) THEN
+ CALL CCHK1(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+ $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+ $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C,
+ $ CC, CS, CT, G, 0 )
+ END IF
+ IF (RORDER) THEN
+ CALL CCHK1(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+ $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+ $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C,
+ $ CC, CS, CT, G, 1 )
+ END IF
+ GO TO 190
+* Test CHEMM, 02, CSYMM, 03.
+ 150 IF (CORDER) THEN
+ CALL CCHK2(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+ $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+ $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C,
+ $ CC, CS, CT, G, 0 )
+ END IF
+ IF (RORDER) THEN
+ CALL CCHK2(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+ $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+ $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C,
+ $ CC, CS, CT, G, 1 )
+ END IF
+ GO TO 190
+* Test CTRMM, 04, CTRSM, 05.
+ 160 IF (CORDER) THEN
+ CALL CCHK3(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+ $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB,
+ $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C,
+ $ 0 )
+ END IF
+ IF (RORDER) THEN
+ CALL CCHK3(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+ $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB,
+ $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C,
+ $ 1 )
+ END IF
+ GO TO 190
+* Test CHERK, 06, CSYRK, 07.
+ 170 IF (CORDER) THEN
+ CALL CCHK4(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+ $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+ $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C,
+ $ CC, CS, CT, G, 0 )
+ END IF
+ IF (RORDER) THEN
+ CALL CCHK4(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+ $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+ $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C,
+ $ CC, CS, CT, G, 1 )
+ END IF
+ GO TO 190
+* Test CHER2K, 08, CSYR2K, 09.
+ 180 IF (CORDER) THEN
+ CALL CCHK5(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+ $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+ $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W,
+ $ 0 )
+ END IF
+ IF (RORDER) THEN
+ CALL CCHK5(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+ $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+ $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W,
+ $ 1 )
+ END IF
+ GO TO 190
+*
+ 190 IF( FATAL.AND.SFATAL )
+ $ GO TO 210
+ END IF
+ 200 CONTINUE
+ WRITE( NOUT, FMT = 9986 )
+ GO TO 230
+*
+ 210 CONTINUE
+ WRITE( NOUT, FMT = 9985 )
+ GO TO 230
+*
+ 220 CONTINUE
+ WRITE( NOUT, FMT = 9991 )
+*
+ 230 CONTINUE
+ IF( TRACE )
+ $ CLOSE ( NTRA )
+ CLOSE ( NOUT )
+ STOP
+*
+10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
+10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' )
+10000 FORMAT(' COLUMN-MAJOR DATA LAYOUT IS TESTED' )
+ 9999 FORMAT(' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES',
+ $ 'S THAN', F8.2 )
+ 9998 FORMAT(' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 )
+ 9997 FORMAT(' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ',
+ $ 'THAN ', I2 )
+ 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 )
+ 9995 FORMAT(' TESTS OF THE COMPLEX LEVEL 3 BLAS', //' THE F',
+ $ 'OLLOWING PARAMETER VALUES WILL BE USED:' )
+ 9994 FORMAT( ' FOR N ', 9I6 )
+ 9993 FORMAT( ' FOR ALPHA ',
+ $ 7( '(', F4.1, ',', F4.1, ') ', : ) )
+ 9992 FORMAT( ' FOR BETA ',
+ $ 7( '(', F4.1, ',', F4.1, ') ', : ) )
+ 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM',
+ $ /' ******* TESTS ABANDONED *******' )
+ 9990 FORMAT(' SUBPROGRAM NAME ', A13,' NOT RECOGNIZED', /' ******* T',
+ $ 'ESTS ABANDONED *******' )
+ 9989 FORMAT(' ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU',
+ $ 'ATED WRONGLY.', /' CMMCH WAS CALLED WITH TRANSA = ', A1,
+ $ 'AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ',
+ $ ' ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ',
+ $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ',
+ $ '*******' )
+ 9988 FORMAT( A13,L2 )
+ 9987 FORMAT( 1X, A13,' WAS NOT TESTED' )
+ 9986 FORMAT( /' END OF TESTS' )
+ 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' )
+ 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' )
+*
+* End of CBLAT3.
+*
+ END
+ SUBROUTINE CCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+ $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+ $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G,
+ $ IORDER )
+*
+* Tests CGEMM3M.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Parameters ..
+ COMPLEX ZERO
+ PARAMETER ( ZERO = ( 0.0, 0.0 ) )
+ REAL RZERO
+ PARAMETER ( RZERO = 0.0 )
+* .. Scalar Arguments ..
+ REAL EPS, THRESH
+ INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER
+ LOGICAL FATAL, REWI, TRACE
+ CHARACTER*13 SNAME
+* .. Array Arguments ..
+ COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+ $ AS( NMAX*NMAX ), B( NMAX, NMAX ),
+ $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ),
+ $ C( NMAX, NMAX ), CC( NMAX*NMAX ),
+ $ CS( NMAX*NMAX ), CT( NMAX )
+ REAL G( NMAX )
+ INTEGER IDIM( NIDIM )
+* .. Local Scalars ..
+ COMPLEX ALPHA, ALS, BETA, BLS
+ REAL ERR, ERRMAX
+ INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA,
+ $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M,
+ $ MA, MB, MS, N, NA, NARGS, NB, NC, NS
+ LOGICAL NULL, RESET, SAME, TRANA, TRANB
+ CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB
+ CHARACTER*3 ICH
+* .. Local Arrays ..
+ LOGICAL ISAME( 13 )
+* .. External Functions ..
+ LOGICAL LCE, LCERES
+ EXTERNAL LCE, LCERES
+* .. External Subroutines ..
+ EXTERNAL CCGEMM3M, CMAKE, CMMCH
+* .. Intrinsic Functions ..
+ INTRINSIC MAX
+* .. Scalars in Common ..
+ INTEGER INFOT, NOUTC
+ LOGICAL LERR, OK
+* .. Common blocks ..
+ COMMON /INFOC/INFOT, NOUTC, OK, LERR
+* .. Data statements ..
+ DATA ICH/'NTC'/
+* .. Executable Statements ..
+*
+ NARGS = 13
+ NC = 0
+ RESET = .TRUE.
+ ERRMAX = RZERO
+*
+ DO 110 IM = 1, NIDIM
+ M = IDIM( IM )
+*
+ DO 100 IN = 1, NIDIM
+ N = IDIM( IN )
+* Set LDC to 1 more than minimum value if room.
+ LDC = M
+ IF( LDC.LT.NMAX )
+ $ LDC = LDC + 1
+* Skip tests if not enough room.
+ IF( LDC.GT.NMAX )
+ $ GO TO 100
+ LCC = LDC*N
+ NULL = N.LE.0.OR.M.LE.0
+*
+ DO 90 IK = 1, NIDIM
+ K = IDIM( IK )
+*
+ DO 80 ICA = 1, 3
+ TRANSA = ICH( ICA: ICA )
+ TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C'
+*
+ IF( TRANA )THEN
+ MA = K
+ NA = M
+ ELSE
+ MA = M
+ NA = K
+ END IF
+* Set LDA to 1 more than minimum value if room.
+ LDA = MA
+ IF( LDA.LT.NMAX )
+ $ LDA = LDA + 1
+* Skip tests if not enough room.
+ IF( LDA.GT.NMAX )
+ $ GO TO 80
+ LAA = LDA*NA
+*
+* Generate the matrix A.
+*
+ CALL CMAKE( 'ge', ' ', ' ', MA, NA, A, NMAX, AA, LDA,
+ $ RESET, ZERO )
+*
+ DO 70 ICB = 1, 3
+ TRANSB = ICH( ICB: ICB )
+ TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C'
+*
+ IF( TRANB )THEN
+ MB = N
+ NB = K
+ ELSE
+ MB = K
+ NB = N
+ END IF
+* Set LDB to 1 more than minimum value if room.
+ LDB = MB
+ IF( LDB.LT.NMAX )
+ $ LDB = LDB + 1
+* Skip tests if not enough room.
+ IF( LDB.GT.NMAX )
+ $ GO TO 70
+ LBB = LDB*NB
+*
+* Generate the matrix B.
+*
+ CALL CMAKE( 'ge', ' ', ' ', MB, NB, B, NMAX, BB,
+ $ LDB, RESET, ZERO )
+*
+ DO 60 IA = 1, NALF
+ ALPHA = ALF( IA )
+*
+ DO 50 IB = 1, NBET
+ BETA = BET( IB )
+*
+* Generate the matrix C.
+*
+ CALL CMAKE( 'ge', ' ', ' ', M, N, C, NMAX,
+ $ CC, LDC, RESET, ZERO )
+*
+ NC = NC + 1
+*
+* Save every datum before calling the
+* subroutine.
+*
+ TRANAS = TRANSA
+ TRANBS = TRANSB
+ MS = M
+ NS = N
+ KS = K
+ ALS = ALPHA
+ DO 10 I = 1, LAA
+ AS( I ) = AA( I )
+ 10 CONTINUE
+ LDAS = LDA
+ DO 20 I = 1, LBB
+ BS( I ) = BB( I )
+ 20 CONTINUE
+ LDBS = LDB
+ BLS = BETA
+ DO 30 I = 1, LCC
+ CS( I ) = CC( I )
+ 30 CONTINUE
+ LDCS = LDC
+*
+* Call the subroutine.
+*
+ IF( TRACE )
+ $ CALL CPRCN1(NTRA, NC, SNAME, IORDER,
+ $ TRANSA, TRANSB, M, N, K, ALPHA, LDA,
+ $ LDB, BETA, LDC)
+ IF( REWI )
+ $ REWIND NTRA
+ CALL CCGEMM3M( IORDER, TRANSA, TRANSB, M, N,
+ $ K, ALPHA, AA, LDA, BB, LDB,
+ $ BETA, CC, LDC )
+*
+* Check if error-exit was taken incorrectly.
+*
+ IF( .NOT.OK )THEN
+ WRITE( NOUT, FMT = 9994 )
+ FATAL = .TRUE.
+ GO TO 120
+ END IF
+*
+* See what data changed inside subroutines.
+*
+ ISAME( 1 ) = TRANSA.EQ.TRANAS
+ ISAME( 2 ) = TRANSB.EQ.TRANBS
+ ISAME( 3 ) = MS.EQ.M
+ ISAME( 4 ) = NS.EQ.N
+ ISAME( 5 ) = KS.EQ.K
+ ISAME( 6 ) = ALS.EQ.ALPHA
+ ISAME( 7 ) = LCE( AS, AA, LAA )
+ ISAME( 8 ) = LDAS.EQ.LDA
+ ISAME( 9 ) = LCE( BS, BB, LBB )
+ ISAME( 10 ) = LDBS.EQ.LDB
+ ISAME( 11 ) = BLS.EQ.BETA
+ IF( NULL )THEN
+ ISAME( 12 ) = LCE( CS, CC, LCC )
+ ELSE
+ ISAME( 12 ) = LCERES( 'ge', ' ', M, N, CS,
+ $ CC, LDC )
+ END IF
+ ISAME( 13 ) = LDCS.EQ.LDC
+*
+* If data was incorrectly changed, report
+* and return.
+*
+ SAME = .TRUE.
+ DO 40 I = 1, NARGS
+ SAME = SAME.AND.ISAME( I )
+ IF( .NOT.ISAME( I ) )
+ $ WRITE( NOUT, FMT = 9998 )I
+ 40 CONTINUE
+ IF( .NOT.SAME )THEN
+ FATAL = .TRUE.
+ GO TO 120
+ END IF
+*
+ IF( .NOT.NULL )THEN
+*
+* Check the result.
+*
+ CALL CMMCH( TRANSA, TRANSB, M, N, K,
+ $ ALPHA, A, NMAX, B, NMAX, BETA,
+ $ C, NMAX, CT, G, CC, LDC, EPS,
+ $ ERR, FATAL, NOUT, .TRUE. )
+ ERRMAX = MAX( ERRMAX, ERR )
+* If got really bad answer, report and
+* return.
+ IF( FATAL )
+ $ GO TO 120
+ END IF
+*
+ 50 CONTINUE
+*
+ 60 CONTINUE
+*
+ 70 CONTINUE
+*
+ 80 CONTINUE
+*
+ 90 CONTINUE
+*
+ 100 CONTINUE
+*
+ 110 CONTINUE
+*
+* Report result.
+*
+ IF( ERRMAX.LT.THRESH )THEN
+ IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC
+ IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC
+ ELSE
+ IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX
+ IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX
+ END IF
+ GO TO 130
+*
+ 120 CONTINUE
+ WRITE( NOUT, FMT = 9996 )SNAME
+ CALL CPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB,
+ $ M, N, K, ALPHA, LDA, LDB, BETA, LDC)
+*
+ 130 CONTINUE
+ RETURN
+*
+10003 FORMAT( ' ', A13,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ',
+ $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ',
+ $ 'RATIO ', F8.2, ' - SUSPECT *******' )
+10002 FORMAT( ' ', A13,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ',
+ $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ',
+ $ 'RATIO ', F8.2, ' - SUSPECT *******' )
+10001 FORMAT( ' ', A13,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS',
+ $ ' (', I6, ' CALL', 'S)' )
+10000 FORMAT( ' ', A13,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS',
+ $ ' (', I6, ' CALL', 'S)' )
+ 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+ $ 'ANGED INCORRECTLY *******' )
+ 9996 FORMAT( ' ******* ', A13,' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A13,'(''', A1, ''',''', A1, ''',',
+ $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3,
+ $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' )
+ 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+ $ '******' )
+*
+* End of CCHK1.
+*
+ END
+*
+ SUBROUTINE CPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, M, N,
+ $ K, ALPHA, LDA, LDB, BETA, LDC)
+*
+* Print a two-line trace record for a cgemm3m call on unit NOUT:
+* the call number, routine name and CBLAS enum arguments on the
+* first line, the numeric arguments on the second.
+*
+ INTEGER NOUT, NC, IORDER, M, N, K, LDA, LDB, LDC
+ COMPLEX ALPHA, BETA
+ CHARACTER*1 TRANSA, TRANSB
+ CHARACTER*13 SNAME
+ CHARACTER*14 CRC, CTA,CTB
+
+* Translate each transpose flag to its CBLAS enum name, starting
+* from the conjugate-transpose default and overriding for 'N'/'T'.
+ CTA = 'CblasConjTrans'
+ IF (TRANSA.EQ.'N') CTA = ' CblasNoTrans'
+ IF (TRANSA.EQ.'T') CTA = ' CblasTrans'
+ CTB = 'CblasConjTrans'
+ IF (TRANSB.EQ.'N') CTB = ' CblasNoTrans'
+ IF (TRANSB.EQ.'T') CTB = ' CblasTrans'
+* IORDER = 1 prints as row-major; any other value as column-major.
+ CRC = ' CblasColMajor'
+ IF (IORDER.EQ.1) CRC = ' CblasRowMajor'
+ WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CTA,CTB
+ WRITE(NOUT, FMT = 9994)M, N, K, ALPHA, LDA, LDB, BETA, LDC
+
+ 9995 FORMAT( 1X, I6, ': ', A13,'(', A14, ',', A14, ',', A14, ',')
+ 9994 FORMAT( 10X, 3( I3, ',' ) ,' (', F4.1,',',F4.1,') , A,',
+ $ I3, ', B,', I3, ', (', F4.1,',',F4.1,') , C,', I3, ').' )
+ END
+*
+ SUBROUTINE CCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+ $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+ $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G,
+ $ IORDER )
+*
+* Tests CHEMM and CSYMM.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* IORDER selects the CBLAS storage order under test: 0 runs the
+* column-major tests, 1 the row-major tests (it is forwarded to the
+* C-interface wrappers CCHEMM/CCSYMM and selects the report format).
+* A/B/C hold the generator matrices, AA/BB/CC the packed operands
+* actually passed to the routine, and AS/BS/CS the saved copies
+* used to detect arguments changed by the call.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Parameters ..
+ COMPLEX ZERO
+ PARAMETER ( ZERO = ( 0.0, 0.0 ) )
+ REAL RZERO
+ PARAMETER ( RZERO = 0.0 )
+* .. Scalar Arguments ..
+ REAL EPS, THRESH
+ INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER
+ LOGICAL FATAL, REWI, TRACE
+ CHARACTER*13 SNAME
+* .. Array Arguments ..
+ COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+ $ AS( NMAX*NMAX ), B( NMAX, NMAX ),
+ $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ),
+ $ C( NMAX, NMAX ), CC( NMAX*NMAX ),
+ $ CS( NMAX*NMAX ), CT( NMAX )
+ REAL G( NMAX )
+ INTEGER IDIM( NIDIM )
+* .. Local Scalars ..
+ COMPLEX ALPHA, ALS, BETA, BLS
+ REAL ERR, ERRMAX
+ INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC,
+ $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA,
+ $ NARGS, NC, NS
+ LOGICAL CONJ, LEFT, NULL, RESET, SAME
+ CHARACTER*1 SIDE, SIDES, UPLO, UPLOS
+ CHARACTER*2 ICHS, ICHU
+* .. Local Arrays ..
+ LOGICAL ISAME( 13 )
+* .. External Functions ..
+ LOGICAL LCE, LCERES
+ EXTERNAL LCE, LCERES
+* .. External Subroutines ..
+ EXTERNAL CCHEMM, CMAKE, CMMCH, CCSYMM
+* .. Intrinsic Functions ..
+ INTRINSIC MAX
+* .. Scalars in Common ..
+ INTEGER INFOT, NOUTC
+ LOGICAL LERR, OK
+* .. Common blocks ..
+ COMMON /INFOC/INFOT, NOUTC, OK, LERR
+* .. Data statements ..
+ DATA ICHS/'LR'/, ICHU/'UL'/
+* .. Executable Statements ..
+* Characters 8:9 of the CBLAS routine name distinguish the
+* hermitian case ('he' -> CCHEMM) from the symmetric one (CCSYMM).
+ CONJ = SNAME( 8: 9 ).EQ.'he'
+*
+ NARGS = 12
+ NC = 0
+ RESET = .TRUE.
+ ERRMAX = RZERO
+*
+ DO 100 IM = 1, NIDIM
+ M = IDIM( IM )
+*
+ DO 90 IN = 1, NIDIM
+ N = IDIM( IN )
+* Set LDC to 1 more than minimum value if room.
+ LDC = M
+ IF( LDC.LT.NMAX )
+ $ LDC = LDC + 1
+* Skip tests if not enough room.
+ IF( LDC.GT.NMAX )
+ $ GO TO 90
+ LCC = LDC*N
+* An empty result matrix makes the call a no-op; in that case only
+* argument preservation is checked below.
+ NULL = N.LE.0.OR.M.LE.0
+* Set LDB to 1 more than minimum value if room.
+ LDB = M
+ IF( LDB.LT.NMAX )
+ $ LDB = LDB + 1
+* Skip tests if not enough room.
+ IF( LDB.GT.NMAX )
+ $ GO TO 90
+ LBB = LDB*N
+*
+* Generate the matrix B.
+*
+ CALL CMAKE( 'ge', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET,
+ $ ZERO )
+*
+ DO 80 ICS = 1, 2
+ SIDE = ICHS( ICS: ICS )
+ LEFT = SIDE.EQ.'L'
+*
+ IF( LEFT )THEN
+ NA = M
+ ELSE
+ NA = N
+ END IF
+* Set LDA to 1 more than minimum value if room.
+ LDA = NA
+ IF( LDA.LT.NMAX )
+ $ LDA = LDA + 1
+* Skip tests if not enough room.
+ IF( LDA.GT.NMAX )
+ $ GO TO 80
+ LAA = LDA*NA
+*
+ DO 70 ICU = 1, 2
+ UPLO = ICHU( ICU: ICU )
+*
+* Generate the hermitian or symmetric matrix A.
+*
+ CALL CMAKE(SNAME( 8: 9 ), UPLO, ' ', NA, NA, A, NMAX,
+ $ AA, LDA, RESET, ZERO )
+*
+ DO 60 IA = 1, NALF
+ ALPHA = ALF( IA )
+*
+ DO 50 IB = 1, NBET
+ BETA = BET( IB )
+*
+* Generate the matrix C.
+*
+ CALL CMAKE( 'ge', ' ', ' ', M, N, C, NMAX, CC,
+ $ LDC, RESET, ZERO )
+*
+ NC = NC + 1
+*
+* Save every datum before calling the
+* subroutine.
+*
+ SIDES = SIDE
+ UPLOS = UPLO
+ MS = M
+ NS = N
+ ALS = ALPHA
+ DO 10 I = 1, LAA
+ AS( I ) = AA( I )
+ 10 CONTINUE
+ LDAS = LDA
+ DO 20 I = 1, LBB
+ BS( I ) = BB( I )
+ 20 CONTINUE
+ LDBS = LDB
+ BLS = BETA
+ DO 30 I = 1, LCC
+ CS( I ) = CC( I )
+ 30 CONTINUE
+ LDCS = LDC
+*
+* Call the subroutine.
+*
+ IF( TRACE )
+ $ CALL CPRCN2(NTRA, NC, SNAME, IORDER,
+ $ SIDE, UPLO, M, N, ALPHA, LDA, LDB,
+ $ BETA, LDC)
+ IF( REWI )
+ $ REWIND NTRA
+ IF( CONJ )THEN
+ CALL CCHEMM( IORDER, SIDE, UPLO, M, N,
+ $ ALPHA, AA, LDA, BB, LDB, BETA,
+ $ CC, LDC )
+ ELSE
+ CALL CCSYMM( IORDER, SIDE, UPLO, M, N,
+ $ ALPHA, AA, LDA, BB, LDB, BETA,
+ $ CC, LDC )
+ END IF
+*
+* Check if error-exit was taken incorrectly.
+*
+ IF( .NOT.OK )THEN
+ WRITE( NOUT, FMT = 9994 )
+ FATAL = .TRUE.
+ GO TO 110
+ END IF
+*
+* See what data changed inside subroutines.
+*
+ ISAME( 1 ) = SIDES.EQ.SIDE
+ ISAME( 2 ) = UPLOS.EQ.UPLO
+ ISAME( 3 ) = MS.EQ.M
+ ISAME( 4 ) = NS.EQ.N
+ ISAME( 5 ) = ALS.EQ.ALPHA
+ ISAME( 6 ) = LCE( AS, AA, LAA )
+ ISAME( 7 ) = LDAS.EQ.LDA
+ ISAME( 8 ) = LCE( BS, BB, LBB )
+ ISAME( 9 ) = LDBS.EQ.LDB
+ ISAME( 10 ) = BLS.EQ.BETA
+ IF( NULL )THEN
+ ISAME( 11 ) = LCE( CS, CC, LCC )
+ ELSE
+ ISAME( 11 ) = LCERES( 'ge', ' ', M, N, CS,
+ $ CC, LDC )
+ END IF
+ ISAME( 12 ) = LDCS.EQ.LDC
+*
+* If data was incorrectly changed, report and
+* return.
+*
+ SAME = .TRUE.
+ DO 40 I = 1, NARGS
+ SAME = SAME.AND.ISAME( I )
+ IF( .NOT.ISAME( I ) )
+ $ WRITE( NOUT, FMT = 9998 )I
+ 40 CONTINUE
+ IF( .NOT.SAME )THEN
+ FATAL = .TRUE.
+ GO TO 110
+ END IF
+*
+ IF( .NOT.NULL )THEN
+*
+* Check the result.
+*
+ IF( LEFT )THEN
+ CALL CMMCH( 'N', 'N', M, N, M, ALPHA, A,
+ $ NMAX, B, NMAX, BETA, C, NMAX,
+ $ CT, G, CC, LDC, EPS, ERR,
+ $ FATAL, NOUT, .TRUE. )
+ ELSE
+ CALL CMMCH( 'N', 'N', M, N, N, ALPHA, B,
+ $ NMAX, A, NMAX, BETA, C, NMAX,
+ $ CT, G, CC, LDC, EPS, ERR,
+ $ FATAL, NOUT, .TRUE. )
+ END IF
+ ERRMAX = MAX( ERRMAX, ERR )
+* If got really bad answer, report and
+* return.
+ IF( FATAL )
+ $ GO TO 110
+ END IF
+*
+ 50 CONTINUE
+*
+ 60 CONTINUE
+*
+ 70 CONTINUE
+*
+ 80 CONTINUE
+*
+ 90 CONTINUE
+*
+ 100 CONTINUE
+*
+* Report result.
+*
+ IF( ERRMAX.LT.THRESH )THEN
+ IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC
+ IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC
+ ELSE
+ IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX
+ IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX
+ END IF
+ GO TO 120
+*
+ 110 CONTINUE
+ WRITE( NOUT, FMT = 9996 )SNAME
+ CALL CPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, ALPHA, LDA,
+ $ LDB, BETA, LDC)
+*
+ 120 CONTINUE
+ RETURN
+*
+10003 FORMAT( ' ', A13,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ',
+ $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ',
+ $ 'RATIO ', F8.2, ' - SUSPECT *******' )
+10002 FORMAT( ' ', A13,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ',
+ $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ',
+ $ 'RATIO ', F8.2, ' - SUSPECT *******' )
+10001 FORMAT( ' ', A13,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS',
+ $ ' (', I6, ' CALL', 'S)' )
+10000 FORMAT( ' ', A13,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS',
+ $ ' (', I6, ' CALL', 'S)' )
+ 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+ $ 'ANGED INCORRECTLY *******' )
+ 9996 FORMAT( ' ******* ', A13,' FAILED ON CALL NUMBER:' )
+* NOTE(review): format 9995 is retained from the Fortran tester but
+* is not referenced here; failing calls are printed via CPRCN2.
+ 9995 FORMAT(1X, I6, ': ', A13,'(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+ $ '(', F4.1, ',', F4.1, '), A,', I3, ',(', F4.1,
+ $ ',', F4.1, '), C,', I3, ') .' )
+ 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+ $ '******' )
+*
+* End of CCHK2.
+*
+ END
+*
+ SUBROUTINE CPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N,
+ $ ALPHA, LDA, LDB, BETA, LDC)
+*
+* Print a two-line trace record for a chemm/csymm call on unit
+* NOUT: call number, routine name and CBLAS enums first, then the
+* numeric arguments.
+*
+ INTEGER NOUT, NC, IORDER, M, N, LDA, LDB, LDC
+ COMPLEX ALPHA, BETA
+ CHARACTER*1 SIDE, UPLO
+ CHARACTER*13 SNAME
+ CHARACTER*14 CRC, CS,CU
+
+* Map side, uplo and order to CBLAS enum names via a default
+* assignment followed by an override on the distinguishing letter.
+ CS = ' CblasRight'
+ IF (SIDE.EQ.'L') CS = ' CblasLeft'
+ CU = ' CblasLower'
+ IF (UPLO.EQ.'U') CU = ' CblasUpper'
+ CRC = ' CblasColMajor'
+ IF (IORDER.EQ.1) CRC = ' CblasRowMajor'
+ WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU
+ WRITE(NOUT, FMT = 9994)M, N, ALPHA, LDA, LDB, BETA, LDC
+
+ 9995 FORMAT( 1X, I6, ': ', A13,'(', A14, ',', A14, ',', A14, ',')
+ 9994 FORMAT( 10X, 2( I3, ',' ),' (',F4.1,',',F4.1, '), A,', I3,
+ $ ', B,', I3, ', (',F4.1,',',F4.1, '), ', 'C,', I3, ').' )
+ END
+*
+ SUBROUTINE CCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+ $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS,
+ $ B, BB, BS, CT, G, C, IORDER )
+*
+* Tests CTRMM and CTRSM.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* IORDER selects the CBLAS storage order under test: 0 runs the
+* column-major tests, 1 the row-major tests. A/B hold the
+* generator matrices, AA/BB the packed operands passed to the
+* routine, AS/BS the saved copies used to detect changed
+* arguments, and C is scratch for CMMCH.
+*
+* Fix: on a fatal error the failing arguments are now reported on
+* the results unit NOUT (as in CCHK1/CCHK2/CCHK4); previously they
+* were written to the trace unit NTRA, which need not be open when
+* TRACE is false.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Parameters ..
+ COMPLEX ZERO, ONE
+ PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) )
+ REAL RZERO
+ PARAMETER ( RZERO = 0.0 )
+* .. Scalar Arguments ..
+ REAL EPS, THRESH
+ INTEGER NALF, NIDIM, NMAX, NOUT, NTRA, IORDER
+ LOGICAL FATAL, REWI, TRACE
+ CHARACTER*13 SNAME
+* .. Array Arguments ..
+ COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+ $ AS( NMAX*NMAX ), B( NMAX, NMAX ),
+ $ BB( NMAX*NMAX ), BS( NMAX*NMAX ),
+ $ C( NMAX, NMAX ), CT( NMAX )
+ REAL G( NMAX )
+ INTEGER IDIM( NIDIM )
+* .. Local Scalars ..
+ COMPLEX ALPHA, ALS
+ REAL ERR, ERRMAX
+ INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB,
+ $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC,
+ $ NS
+ LOGICAL LEFT, NULL, RESET, SAME
+ CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO,
+ $ UPLOS
+ CHARACTER*2 ICHD, ICHS, ICHU
+ CHARACTER*3 ICHT
+* .. Local Arrays ..
+ LOGICAL ISAME( 13 )
+* .. External Functions ..
+ LOGICAL LCE, LCERES
+ EXTERNAL LCE, LCERES
+* .. External Subroutines ..
+ EXTERNAL CMAKE, CMMCH, CCTRMM, CCTRSM
+* .. Intrinsic Functions ..
+ INTRINSIC MAX
+* .. Scalars in Common ..
+ INTEGER INFOT, NOUTC
+ LOGICAL LERR, OK
+* .. Common blocks ..
+ COMMON /INFOC/INFOT, NOUTC, OK, LERR
+* .. Data statements ..
+ DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/
+* .. Executable Statements ..
+*
+ NARGS = 11
+ NC = 0
+ RESET = .TRUE.
+ ERRMAX = RZERO
+* Set up zero matrix for CMMCH.
+ DO 20 J = 1, NMAX
+ DO 10 I = 1, NMAX
+ C( I, J ) = ZERO
+ 10 CONTINUE
+ 20 CONTINUE
+*
+ DO 140 IM = 1, NIDIM
+ M = IDIM( IM )
+*
+ DO 130 IN = 1, NIDIM
+ N = IDIM( IN )
+* Set LDB to 1 more than minimum value if room.
+ LDB = M
+ IF( LDB.LT.NMAX )
+ $ LDB = LDB + 1
+* Skip tests if not enough room.
+ IF( LDB.GT.NMAX )
+ $ GO TO 130
+ LBB = LDB*N
+ NULL = M.LE.0.OR.N.LE.0
+*
+ DO 120 ICS = 1, 2
+ SIDE = ICHS( ICS: ICS )
+ LEFT = SIDE.EQ.'L'
+ IF( LEFT )THEN
+ NA = M
+ ELSE
+ NA = N
+ END IF
+* Set LDA to 1 more than minimum value if room.
+ LDA = NA
+ IF( LDA.LT.NMAX )
+ $ LDA = LDA + 1
+* Skip tests if not enough room.
+ IF( LDA.GT.NMAX )
+ $ GO TO 130
+ LAA = LDA*NA
+*
+ DO 110 ICU = 1, 2
+ UPLO = ICHU( ICU: ICU )
+*
+ DO 100 ICT = 1, 3
+ TRANSA = ICHT( ICT: ICT )
+*
+ DO 90 ICD = 1, 2
+ DIAG = ICHD( ICD: ICD )
+*
+ DO 80 IA = 1, NALF
+ ALPHA = ALF( IA )
+*
+* Generate the matrix A.
+*
+ CALL CMAKE( 'tr', UPLO, DIAG, NA, NA, A,
+ $ NMAX, AA, LDA, RESET, ZERO )
+*
+* Generate the matrix B.
+*
+ CALL CMAKE( 'ge', ' ', ' ', M, N, B, NMAX,
+ $ BB, LDB, RESET, ZERO )
+*
+ NC = NC + 1
+*
+* Save every datum before calling the
+* subroutine.
+*
+ SIDES = SIDE
+ UPLOS = UPLO
+ TRANAS = TRANSA
+ DIAGS = DIAG
+ MS = M
+ NS = N
+ ALS = ALPHA
+ DO 30 I = 1, LAA
+ AS( I ) = AA( I )
+ 30 CONTINUE
+ LDAS = LDA
+ DO 40 I = 1, LBB
+ BS( I ) = BB( I )
+ 40 CONTINUE
+ LDBS = LDB
+*
+* Call the subroutine.
+*
+ IF( SNAME( 10: 11 ).EQ.'mm' )THEN
+ IF( TRACE )
+ $ CALL CPRCN3( NTRA, NC, SNAME, IORDER,
+ $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA,
+ $ LDA, LDB)
+ IF( REWI )
+ $ REWIND NTRA
+ CALL CCTRMM(IORDER, SIDE, UPLO, TRANSA,
+ $ DIAG, M, N, ALPHA, AA, LDA,
+ $ BB, LDB )
+ ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN
+ IF( TRACE )
+ $ CALL CPRCN3( NTRA, NC, SNAME, IORDER,
+ $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA,
+ $ LDA, LDB)
+ IF( REWI )
+ $ REWIND NTRA
+ CALL CCTRSM(IORDER, SIDE, UPLO, TRANSA,
+ $ DIAG, M, N, ALPHA, AA, LDA,
+ $ BB, LDB )
+ END IF
+*
+* Check if error-exit was taken incorrectly.
+*
+ IF( .NOT.OK )THEN
+ WRITE( NOUT, FMT = 9994 )
+ FATAL = .TRUE.
+ GO TO 150
+ END IF
+*
+* See what data changed inside subroutines.
+*
+ ISAME( 1 ) = SIDES.EQ.SIDE
+ ISAME( 2 ) = UPLOS.EQ.UPLO
+ ISAME( 3 ) = TRANAS.EQ.TRANSA
+ ISAME( 4 ) = DIAGS.EQ.DIAG
+ ISAME( 5 ) = MS.EQ.M
+ ISAME( 6 ) = NS.EQ.N
+ ISAME( 7 ) = ALS.EQ.ALPHA
+ ISAME( 8 ) = LCE( AS, AA, LAA )
+ ISAME( 9 ) = LDAS.EQ.LDA
+ IF( NULL )THEN
+ ISAME( 10 ) = LCE( BS, BB, LBB )
+ ELSE
+ ISAME( 10 ) = LCERES( 'ge', ' ', M, N, BS,
+ $ BB, LDB )
+ END IF
+ ISAME( 11 ) = LDBS.EQ.LDB
+*
+* If data was incorrectly changed, report and
+* return.
+*
+ SAME = .TRUE.
+ DO 50 I = 1, NARGS
+ SAME = SAME.AND.ISAME( I )
+ IF( .NOT.ISAME( I ) )
+ $ WRITE( NOUT, FMT = 9998 )I
+ 50 CONTINUE
+ IF( .NOT.SAME )THEN
+ FATAL = .TRUE.
+ GO TO 150
+ END IF
+*
+ IF( .NOT.NULL )THEN
+ IF( SNAME( 10: 11 ).EQ.'mm' )THEN
+*
+* Check the result.
+*
+ IF( LEFT )THEN
+ CALL CMMCH( TRANSA, 'N', M, N, M,
+ $ ALPHA, A, NMAX, B, NMAX,
+ $ ZERO, C, NMAX, CT, G,
+ $ BB, LDB, EPS, ERR,
+ $ FATAL, NOUT, .TRUE. )
+ ELSE
+ CALL CMMCH( 'N', TRANSA, M, N, N,
+ $ ALPHA, B, NMAX, A, NMAX,
+ $ ZERO, C, NMAX, CT, G,
+ $ BB, LDB, EPS, ERR,
+ $ FATAL, NOUT, .TRUE. )
+ END IF
+ ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN
+*
+* Compute approximation to original
+* matrix.
+*
+ DO 70 J = 1, N
+ DO 60 I = 1, M
+ C( I, J ) = BB( I + ( J - 1 )*
+ $ LDB )
+ BB( I + ( J - 1 )*LDB ) = ALPHA*
+ $ B( I, J )
+ 60 CONTINUE
+ 70 CONTINUE
+*
+ IF( LEFT )THEN
+ CALL CMMCH( TRANSA, 'N', M, N, M,
+ $ ONE, A, NMAX, C, NMAX,
+ $ ZERO, B, NMAX, CT, G,
+ $ BB, LDB, EPS, ERR,
+ $ FATAL, NOUT, .FALSE. )
+ ELSE
+ CALL CMMCH( 'N', TRANSA, M, N, N,
+ $ ONE, C, NMAX, A, NMAX,
+ $ ZERO, B, NMAX, CT, G,
+ $ BB, LDB, EPS, ERR,
+ $ FATAL, NOUT, .FALSE. )
+ END IF
+ END IF
+ ERRMAX = MAX( ERRMAX, ERR )
+* If got really bad answer, report and
+* return.
+ IF( FATAL )
+ $ GO TO 150
+ END IF
+*
+ 80 CONTINUE
+*
+ 90 CONTINUE
+*
+ 100 CONTINUE
+*
+ 110 CONTINUE
+*
+ 120 CONTINUE
+*
+ 130 CONTINUE
+*
+ 140 CONTINUE
+*
+* Report result.
+*
+ IF( ERRMAX.LT.THRESH )THEN
+ IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC
+ IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC
+ ELSE
+ IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX
+ IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX
+ END IF
+ GO TO 160
+*
+ 150 CONTINUE
+ WRITE( NOUT, FMT = 9996 )SNAME
+* Report the failing arguments on the results unit NOUT, not the
+* trace unit NTRA, which is only valid when TRACE is enabled.
+ CALL CPRCN3( NOUT, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
+ $ M, N, ALPHA, LDA, LDB)
+*
+ 160 CONTINUE
+ RETURN
+*
+10003 FORMAT( ' ', A13,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ',
+ $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ',
+ $ 'RATIO ', F8.2, ' - SUSPECT *******' )
+10002 FORMAT( ' ', A13,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ',
+ $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ',
+ $ 'RATIO ', F8.2, ' - SUSPECT *******' )
+10001 FORMAT( ' ', A13,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS',
+ $ ' (', I6, ' CALL', 'S)' )
+10000 FORMAT( ' ', A13,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS',
+ $ ' (', I6, ' CALL', 'S)' )
+ 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+ $ 'ANGED INCORRECTLY *******' )
+ 9996 FORMAT(' ******* ', A13,' FAILED ON CALL NUMBER:' )
+* NOTE(review): format 9995 is retained from the Fortran tester but
+* is not referenced here; failing calls are printed via CPRCN3.
+ 9995 FORMAT(1X, I6, ': ', A13,'(', 4( '''', A1, ''',' ), 2( I3, ',' ),
+ $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ',
+ $ ' .' )
+ 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+ $ '******' )
+*
+* End of CCHK3.
+*
+ END
+*
+ SUBROUTINE CPRCN3(NOUT, NC, SNAME, IORDER, SIDE, UPLO, TRANSA,
+ $ DIAG, M, N, ALPHA, LDA, LDB)
+*
+* Print a two-line trace record for a ctrmm/ctrsm call on unit
+* NOUT: call number, routine name and order/side/uplo enums first,
+* then the transpose/diag enums and numeric arguments.
+*
+ INTEGER NOUT, NC, IORDER, M, N, LDA, LDB
+ COMPLEX ALPHA
+ CHARACTER*1 SIDE, UPLO, TRANSA, DIAG
+ CHARACTER*13 SNAME
+ CHARACTER*14 CRC, CS, CU, CA, CD
+
+* Map each single-character flag to its CBLAS enum name using a
+* default assignment plus an override on the distinguishing letter.
+ CS = ' CblasRight'
+ IF (SIDE.EQ.'L') CS = ' CblasLeft'
+ CU = ' CblasLower'
+ IF (UPLO.EQ.'U') CU = ' CblasUpper'
+ CA = 'CblasConjTrans'
+ IF (TRANSA.EQ.'N') CA = ' CblasNoTrans'
+ IF (TRANSA.EQ.'T') CA = ' CblasTrans'
+ CD = ' CblasUnit'
+ IF (DIAG.EQ.'N') CD = ' CblasNonUnit'
+ CRC = ' CblasColMajor'
+ IF (IORDER.EQ.1) CRC = ' CblasRowMajor'
+ WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU
+ WRITE(NOUT, FMT = 9994)CA, CD, M, N, ALPHA, LDA, LDB
+
+ 9995 FORMAT( 1X, I6, ': ', A13,'(', A14, ',', A14, ',', A14, ',')
+ 9994 FORMAT( 10X, 2( A14, ',') , 2( I3, ',' ), ' (', F4.1, ',',
+ $ F4.1, '), A,', I3, ', B,', I3, ').' )
+ END
+*
+ SUBROUTINE CCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+ $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+ $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G,
+ $ IORDER )
+*
+* Tests CHERK and CSYRK.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* IORDER selects the CBLAS storage order under test: 0 runs the
+* column-major tests, 1 the row-major tests (it is forwarded to the
+* C-interface wrappers CCHERK/CCSYRK and selects the report
+* format). A/C hold the generator matrices, AA/CC the packed
+* operands passed to the routine, and AS/CS the saved copies used
+* to detect arguments changed by the call.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Parameters ..
+ COMPLEX ZERO
+ PARAMETER ( ZERO = ( 0.0, 0.0 ) )
+ REAL RONE, RZERO
+ PARAMETER ( RONE = 1.0, RZERO = 0.0 )
+* .. Scalar Arguments ..
+ REAL EPS, THRESH
+ INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER
+ LOGICAL FATAL, REWI, TRACE
+ CHARACTER*13 SNAME
+* .. Array Arguments ..
+ COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+ $ AS( NMAX*NMAX ), B( NMAX, NMAX ),
+ $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ),
+ $ C( NMAX, NMAX ), CC( NMAX*NMAX ),
+ $ CS( NMAX*NMAX ), CT( NMAX )
+ REAL G( NMAX )
+ INTEGER IDIM( NIDIM )
+* .. Local Scalars ..
+ COMPLEX ALPHA, ALS, BETA, BETS
+ REAL ERR, ERRMAX, RALPHA, RALS, RBETA, RBETS
+ INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS,
+ $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA,
+ $ NARGS, NC, NS
+ LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER
+ CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS
+ CHARACTER*2 ICHT, ICHU
+* .. Local Arrays ..
+ LOGICAL ISAME( 13 )
+* .. External Functions ..
+ LOGICAL LCE, LCERES
+ EXTERNAL LCE, LCERES
+* .. External Subroutines ..
+ EXTERNAL CCHERK, CMAKE, CMMCH, CCSYRK
+* .. Intrinsic Functions ..
+ INTRINSIC CMPLX, MAX, REAL
+* .. Scalars in Common ..
+ INTEGER INFOT, NOUTC
+ LOGICAL LERR, OK
+* .. Common blocks ..
+ COMMON /INFOC/INFOT, NOUTC, OK, LERR
+* .. Data statements ..
+ DATA ICHT/'NC'/, ICHU/'UL'/
+* .. Executable Statements ..
+* Characters 8:9 of the CBLAS routine name distinguish the
+* hermitian case ('he' -> CCHERK) from the symmetric one (CCSYRK).
+ CONJ = SNAME( 8: 9 ).EQ.'he'
+*
+ NARGS = 10
+ NC = 0
+ RESET = .TRUE.
+ ERRMAX = RZERO
+*
+ DO 100 IN = 1, NIDIM
+ N = IDIM( IN )
+* Set LDC to 1 more than minimum value if room.
+ LDC = N
+ IF( LDC.LT.NMAX )
+ $ LDC = LDC + 1
+* Skip tests if not enough room.
+ IF( LDC.GT.NMAX )
+ $ GO TO 100
+ LCC = LDC*N
+*
+ DO 90 IK = 1, NIDIM
+ K = IDIM( IK )
+*
+ DO 80 ICT = 1, 2
+ TRANS = ICHT( ICT: ICT )
+ TRAN = TRANS.EQ.'C'
+* CSYRK has no conjugate-transpose case; fall back to 'T'.
+ IF( TRAN.AND..NOT.CONJ )
+ $ TRANS = 'T'
+ IF( TRAN )THEN
+ MA = K
+ NA = N
+ ELSE
+ MA = N
+ NA = K
+ END IF
+* Set LDA to 1 more than minimum value if room.
+ LDA = MA
+ IF( LDA.LT.NMAX )
+ $ LDA = LDA + 1
+* Skip tests if not enough room.
+ IF( LDA.GT.NMAX )
+ $ GO TO 80
+ LAA = LDA*NA
+*
+* Generate the matrix A.
+*
+ CALL CMAKE( 'ge', ' ', ' ', MA, NA, A, NMAX, AA, LDA,
+ $ RESET, ZERO )
+*
+ DO 70 ICU = 1, 2
+ UPLO = ICHU( ICU: ICU )
+ UPPER = UPLO.EQ.'U'
+*
+ DO 60 IA = 1, NALF
+ ALPHA = ALF( IA )
+* CHERK takes a real ALPHA; keep only the real part.
+ IF( CONJ )THEN
+ RALPHA = REAL( ALPHA )
+ ALPHA = CMPLX( RALPHA, RZERO )
+ END IF
+*
+ DO 50 IB = 1, NBET
+ BETA = BET( IB )
+* CHERK likewise takes a real BETA.
+ IF( CONJ )THEN
+ RBETA = REAL( BETA )
+ BETA = CMPLX( RBETA, RZERO )
+ END IF
+* The call leaves C unchanged when N is zero, or (for
+* CHERK) when the update term vanishes and BETA is one.
+ NULL = N.LE.0
+ IF( CONJ )
+ $ NULL = NULL.OR.( ( K.LE.0.OR.RALPHA.EQ.
+ $ RZERO ).AND.RBETA.EQ.RONE )
+*
+* Generate the matrix C.
+*
+ CALL CMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, C,
+ $ NMAX, CC, LDC, RESET, ZERO )
+*
+ NC = NC + 1
+*
+* Save every datum before calling the subroutine.
+*
+ UPLOS = UPLO
+ TRANSS = TRANS
+ NS = N
+ KS = K
+ IF( CONJ )THEN
+ RALS = RALPHA
+ ELSE
+ ALS = ALPHA
+ END IF
+ DO 10 I = 1, LAA
+ AS( I ) = AA( I )
+ 10 CONTINUE
+ LDAS = LDA
+ IF( CONJ )THEN
+ RBETS = RBETA
+ ELSE
+ BETS = BETA
+ END IF
+ DO 20 I = 1, LCC
+ CS( I ) = CC( I )
+ 20 CONTINUE
+ LDCS = LDC
+*
+* Call the subroutine.
+*
+ IF( CONJ )THEN
+ IF( TRACE )
+ $ CALL CPRCN6( NTRA, NC, SNAME, IORDER,
+ $ UPLO, TRANS, N, K, RALPHA, LDA, RBETA,
+ $ LDC)
+ IF( REWI )
+ $ REWIND NTRA
+ CALL CCHERK( IORDER, UPLO, TRANS, N, K,
+ $ RALPHA, AA, LDA, RBETA, CC,
+ $ LDC )
+ ELSE
+ IF( TRACE )
+ $ CALL CPRCN4( NTRA, NC, SNAME, IORDER,
+ $ UPLO, TRANS, N, K, ALPHA, LDA, BETA, LDC)
+ IF( REWI )
+ $ REWIND NTRA
+ CALL CCSYRK( IORDER, UPLO, TRANS, N, K,
+ $ ALPHA, AA, LDA, BETA, CC, LDC )
+ END IF
+*
+* Check if error-exit was taken incorrectly.
+*
+ IF( .NOT.OK )THEN
+ WRITE( NOUT, FMT = 9992 )
+ FATAL = .TRUE.
+ GO TO 120
+ END IF
+*
+* See what data changed inside subroutines.
+*
+ ISAME( 1 ) = UPLOS.EQ.UPLO
+ ISAME( 2 ) = TRANSS.EQ.TRANS
+ ISAME( 3 ) = NS.EQ.N
+ ISAME( 4 ) = KS.EQ.K
+ IF( CONJ )THEN
+ ISAME( 5 ) = RALS.EQ.RALPHA
+ ELSE
+ ISAME( 5 ) = ALS.EQ.ALPHA
+ END IF
+ ISAME( 6 ) = LCE( AS, AA, LAA )
+ ISAME( 7 ) = LDAS.EQ.LDA
+ IF( CONJ )THEN
+ ISAME( 8 ) = RBETS.EQ.RBETA
+ ELSE
+ ISAME( 8 ) = BETS.EQ.BETA
+ END IF
+ IF( NULL )THEN
+ ISAME( 9 ) = LCE( CS, CC, LCC )
+ ELSE
+ ISAME( 9 ) = LCERES( SNAME( 8: 9 ), UPLO, N,
+ $ N, CS, CC, LDC )
+ END IF
+ ISAME( 10 ) = LDCS.EQ.LDC
+*
+* If data was incorrectly changed, report and
+* return.
+*
+ SAME = .TRUE.
+ DO 30 I = 1, NARGS
+ SAME = SAME.AND.ISAME( I )
+ IF( .NOT.ISAME( I ) )
+ $ WRITE( NOUT, FMT = 9998 )I
+ 30 CONTINUE
+ IF( .NOT.SAME )THEN
+ FATAL = .TRUE.
+ GO TO 120
+ END IF
+*
+ IF( .NOT.NULL )THEN
+*
+* Check the result column by column.
+*
+* TRANST is the transpose code CMMCH uses to form the
+* reference product from A.
+ IF( CONJ )THEN
+ TRANST = 'C'
+ ELSE
+ TRANST = 'T'
+ END IF
+* JC is the offset in the packed CC array of the first
+* stored element of column J.
+ JC = 1
+ DO 40 J = 1, N
+ IF( UPPER )THEN
+ JJ = 1
+ LJ = J
+ ELSE
+ JJ = J
+ LJ = N - J + 1
+ END IF
+ IF( TRAN )THEN
+ CALL CMMCH( TRANST, 'N', LJ, 1, K,
+ $ ALPHA, A( 1, JJ ), NMAX,
+ $ A( 1, J ), NMAX, BETA,
+ $ C( JJ, J ), NMAX, CT, G,
+ $ CC( JC ), LDC, EPS, ERR,
+ $ FATAL, NOUT, .TRUE. )
+ ELSE
+ CALL CMMCH( 'N', TRANST, LJ, 1, K,
+ $ ALPHA, A( JJ, 1 ), NMAX,
+ $ A( J, 1 ), NMAX, BETA,
+ $ C( JJ, J ), NMAX, CT, G,
+ $ CC( JC ), LDC, EPS, ERR,
+ $ FATAL, NOUT, .TRUE. )
+ END IF
+ IF( UPPER )THEN
+ JC = JC + LDC
+ ELSE
+ JC = JC + LDC + 1
+ END IF
+ ERRMAX = MAX( ERRMAX, ERR )
+* If got really bad answer, report and
+* return.
+ IF( FATAL )
+ $ GO TO 110
+ 40 CONTINUE
+ END IF
+*
+ 50 CONTINUE
+*
+ 60 CONTINUE
+*
+ 70 CONTINUE
+*
+ 80 CONTINUE
+*
+ 90 CONTINUE
+*
+ 100 CONTINUE
+*
+* Report result.
+*
+ IF( ERRMAX.LT.THRESH )THEN
+ IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC
+ IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC
+ ELSE
+ IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX
+ IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX
+ END IF
+ GO TO 130
+*
+ 110 CONTINUE
+ IF( N.GT.1 )
+ $ WRITE( NOUT, FMT = 9995 )J
+*
+ 120 CONTINUE
+ WRITE( NOUT, FMT = 9996 )SNAME
+ IF( CONJ )THEN
+ CALL CPRCN6( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, RALPHA,
+ $ LDA, rBETA, LDC)
+ ELSE
+ CALL CPRCN4( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, ALPHA,
+ $ LDA, BETA, LDC)
+ END IF
+*
+ 130 CONTINUE
+ RETURN
+*
+10003 FORMAT( ' ', A13,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ',
+ $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ',
+ $ 'RATIO ', F8.2, ' - SUSPECT *******' )
+10002 FORMAT( ' ', A13,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ',
+ $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ',
+ $ 'RATIO ', F8.2, ' - SUSPECT *******' )
+10001 FORMAT( ' ', A13,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS',
+ $ ' (', I6, ' CALL', 'S)' )
+10000 FORMAT( ' ', A13,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS',
+ $ ' (', I6, ' CALL', 'S)' )
+ 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+ $ 'ANGED INCORRECTLY *******' )
+ 9996 FORMAT( ' ******* ', A13,' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 )
+* NOTE(review): formats 9994 and 9993 are retained from the Fortran
+* tester but are not referenced here; failing calls are printed via
+* CPRCN6/CPRCN4.
+ 9994 FORMAT(1X, I6, ': ', A13,'(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+ $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ',
+ $ ' .' )
+ 9993 FORMAT(1X, I6, ': ', A13,'(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+ $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1,
+ $ '), C,', I3, ') .' )
+ 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+ $ '******' )
+*
+* End of CCHK4.
+*
+ END
+*
+ SUBROUTINE CPRCN4(NOUT, NC, SNAME, IORDER, UPLO, TRANSA,
+ $ N, K, ALPHA, LDA, BETA, LDC)
+*
+* Print a two-line trace record for a csyrk call on unit NOUT:
+* call number, routine name and CBLAS enums first, then the
+* numeric arguments with complex ALPHA and BETA.
+*
+ INTEGER NOUT, NC, IORDER, N, K, LDA, LDC
+ COMPLEX ALPHA, BETA
+ CHARACTER*1 UPLO, TRANSA
+ CHARACTER*13 SNAME
+ CHARACTER*14 CRC, CU, CA
+
+* Default-and-override mapping of the flags to CBLAS enum names.
+ CU = ' CblasLower'
+ IF (UPLO.EQ.'U') CU = ' CblasUpper'
+ CA = 'CblasConjTrans'
+ IF (TRANSA.EQ.'N') CA = ' CblasNoTrans'
+ IF (TRANSA.EQ.'T') CA = ' CblasTrans'
+ CRC = ' CblasColMajor'
+ IF (IORDER.EQ.1) CRC = ' CblasRowMajor'
+ WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA
+ WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC
+
+ 9995 FORMAT( 1X, I6, ': ', A13,'(', 3( A14, ',') )
+ 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1 ,'), A,',
+ $ I3, ', (', F4.1,',', F4.1, '), C,', I3, ').' )
+ END
+*
+*
+ SUBROUTINE CPRCN6(NOUT, NC, SNAME, IORDER, UPLO, TRANSA,
+ $ N, K, ALPHA, LDA, BETA, LDC)
+*
+* Print a two-line trace record for a cherk call on unit NOUT:
+* call number, routine name and CBLAS enums first, then the
+* numeric arguments with real ALPHA and BETA.
+*
+ INTEGER NOUT, NC, IORDER, N, K, LDA, LDC
+ REAL ALPHA, BETA
+ CHARACTER*1 UPLO, TRANSA
+ CHARACTER*13 SNAME
+ CHARACTER*14 CRC, CU, CA
+
+* Default-and-override mapping of the flags to CBLAS enum names.
+ CU = ' CblasLower'
+ IF (UPLO.EQ.'U') CU = ' CblasUpper'
+ CA = 'CblasConjTrans'
+ IF (TRANSA.EQ.'N') CA = ' CblasNoTrans'
+ IF (TRANSA.EQ.'T') CA = ' CblasTrans'
+ CRC = ' CblasColMajor'
+ IF (IORDER.EQ.1) CRC = ' CblasRowMajor'
+ WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA
+ WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC
+
+ 9995 FORMAT( 1X, I6, ': ', A13,'(', 3( A14, ',') )
+ 9994 FORMAT( 10X, 2( I3, ',' ),
+ $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ').' )
+ END
+*
+ SUBROUTINE CCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+ $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+ $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W,
+ $ IORDER )
+*
+* Tests CHER2K and CSYR2K.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Parameters ..
+ COMPLEX ZERO, ONE
+ PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) )
+ REAL RONE, RZERO
+ PARAMETER ( RONE = 1.0, RZERO = 0.0 )
+* .. Scalar Arguments ..
+ REAL EPS, THRESH
+ INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER
+ LOGICAL FATAL, REWI, TRACE
+ CHARACTER*13 SNAME
+* .. Array Arguments ..
+ COMPLEX AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ),
+ $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ),
+ $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ),
+ $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ),
+ $ W( 2*NMAX )
+ REAL G( NMAX )
+ INTEGER IDIM( NIDIM )
+* .. Local Scalars ..
+ COMPLEX ALPHA, ALS, BETA, BETS
+ REAL ERR, ERRMAX, RBETA, RBETS
+ INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB,
+ $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS,
+ $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS
+ LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER
+ CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS
+ CHARACTER*2 ICHT, ICHU
+* .. Local Arrays ..
+ LOGICAL ISAME( 13 )
+* .. External Functions ..
+ LOGICAL LCE, LCERES
+ EXTERNAL LCE, LCERES
+* .. External Subroutines ..
+ EXTERNAL CCHER2K, CMAKE, CMMCH, CCSYR2K
+* .. Intrinsic Functions ..
+ INTRINSIC CMPLX, CONJG, MAX, REAL
+* .. Scalars in Common ..
+ INTEGER INFOT, NOUTC
+ LOGICAL LERR, OK
+* .. Common blocks ..
+ COMMON /INFOC/INFOT, NOUTC, OK, LERR
+* .. Data statements ..
+ DATA ICHT/'NC'/, ICHU/'UL'/
+* .. Executable Statements ..
+ CONJ = SNAME( 8: 9 ).EQ.'he'
+*
+ NARGS = 12
+ NC = 0
+ RESET = .TRUE.
+ ERRMAX = RZERO
+*
+ DO 130 IN = 1, NIDIM
+ N = IDIM( IN )
+* Set LDC to 1 more than minimum value if room.
+ LDC = N
+ IF( LDC.LT.NMAX )
+ $ LDC = LDC + 1
+* Skip tests if not enough room.
+ IF( LDC.GT.NMAX )
+ $ GO TO 130
+ LCC = LDC*N
+*
+ DO 120 IK = 1, NIDIM
+ K = IDIM( IK )
+*
+ DO 110 ICT = 1, 2
+ TRANS = ICHT( ICT: ICT )
+ TRAN = TRANS.EQ.'C'
+ IF( TRAN.AND..NOT.CONJ )
+ $ TRANS = 'T'
+ IF( TRAN )THEN
+ MA = K
+ NA = N
+ ELSE
+ MA = N
+ NA = K
+ END IF
+* Set LDA to 1 more than minimum value if room.
+ LDA = MA
+ IF( LDA.LT.NMAX )
+ $ LDA = LDA + 1
+* Skip tests if not enough room.
+ IF( LDA.GT.NMAX )
+ $ GO TO 110
+ LAA = LDA*NA
+*
+* Generate the matrix A.
+*
+ IF( TRAN )THEN
+ CALL CMAKE( 'ge', ' ', ' ', MA, NA, AB, 2*NMAX, AA,
+ $ LDA, RESET, ZERO )
+ ELSE
+ CALL CMAKE( 'ge', ' ', ' ', MA, NA, AB, NMAX, AA, LDA,
+ $ RESET, ZERO )
+ END IF
+*
+* Generate the matrix B.
+*
+ LDB = LDA
+ LBB = LAA
+ IF( TRAN )THEN
+ CALL CMAKE( 'ge', ' ', ' ', MA, NA, AB( K + 1 ),
+ $ 2*NMAX, BB, LDB, RESET, ZERO )
+ ELSE
+ CALL CMAKE( 'ge', ' ', ' ', MA, NA, AB( K*NMAX + 1 ),
+ $ NMAX, BB, LDB, RESET, ZERO )
+ END IF
+*
+ DO 100 ICU = 1, 2
+ UPLO = ICHU( ICU: ICU )
+ UPPER = UPLO.EQ.'U'
+*
+ DO 90 IA = 1, NALF
+ ALPHA = ALF( IA )
+*
+ DO 80 IB = 1, NBET
+ BETA = BET( IB )
+ IF( CONJ )THEN
+ RBETA = REAL( BETA )
+ BETA = CMPLX( RBETA, RZERO )
+ END IF
+ NULL = N.LE.0
+ IF( CONJ )
+ $ NULL = NULL.OR.( ( K.LE.0.OR.ALPHA.EQ.
+ $ ZERO ).AND.RBETA.EQ.RONE )
+*
+* Generate the matrix C.
+*
+ CALL CMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, C,
+ $ NMAX, CC, LDC, RESET, ZERO )
+*
+ NC = NC + 1
+*
+* Save every datum before calling the subroutine.
+*
+ UPLOS = UPLO
+ TRANSS = TRANS
+ NS = N
+ KS = K
+ ALS = ALPHA
+ DO 10 I = 1, LAA
+ AS( I ) = AA( I )
+ 10 CONTINUE
+ LDAS = LDA
+ DO 20 I = 1, LBB
+ BS( I ) = BB( I )
+ 20 CONTINUE
+ LDBS = LDB
+ IF( CONJ )THEN
+ RBETS = RBETA
+ ELSE
+ BETS = BETA
+ END IF
+ DO 30 I = 1, LCC
+ CS( I ) = CC( I )
+ 30 CONTINUE
+ LDCS = LDC
+*
+* Call the subroutine.
+*
+ IF( CONJ )THEN
+ IF( TRACE )
+ $ CALL CPRCN7( NTRA, NC, SNAME, IORDER,
+ $ UPLO, TRANS, N, K, ALPHA, LDA, LDB,
+ $ RBETA, LDC)
+ IF( REWI )
+ $ REWIND NTRA
+ CALL CCHER2K( IORDER, UPLO, TRANS, N, K,
+ $ ALPHA, AA, LDA, BB, LDB, RBETA,
+ $ CC, LDC )
+ ELSE
+ IF( TRACE )
+ $ CALL CPRCN5( NTRA, NC, SNAME, IORDER,
+ $ UPLO, TRANS, N, K, ALPHA, LDA, LDB,
+ $ BETA, LDC)
+ IF( REWI )
+ $ REWIND NTRA
+ CALL CCSYR2K( IORDER, UPLO, TRANS, N, K,
+ $ ALPHA, AA, LDA, BB, LDB, BETA,
+ $ CC, LDC )
+ END IF
+*
+* Check if error-exit was taken incorrectly.
+*
+ IF( .NOT.OK )THEN
+ WRITE( NOUT, FMT = 9992 )
+ FATAL = .TRUE.
+ GO TO 150
+ END IF
+*
+* See what data changed inside subroutines.
+*
+ ISAME( 1 ) = UPLOS.EQ.UPLO
+ ISAME( 2 ) = TRANSS.EQ.TRANS
+ ISAME( 3 ) = NS.EQ.N
+ ISAME( 4 ) = KS.EQ.K
+ ISAME( 5 ) = ALS.EQ.ALPHA
+ ISAME( 6 ) = LCE( AS, AA, LAA )
+ ISAME( 7 ) = LDAS.EQ.LDA
+ ISAME( 8 ) = LCE( BS, BB, LBB )
+ ISAME( 9 ) = LDBS.EQ.LDB
+ IF( CONJ )THEN
+ ISAME( 10 ) = RBETS.EQ.RBETA
+ ELSE
+ ISAME( 10 ) = BETS.EQ.BETA
+ END IF
+ IF( NULL )THEN
+ ISAME( 11 ) = LCE( CS, CC, LCC )
+ ELSE
+ ISAME( 11 ) = LCERES( 'he', UPLO, N, N, CS,
+ $ CC, LDC )
+ END IF
+ ISAME( 12 ) = LDCS.EQ.LDC
+*
+* If data was incorrectly changed, report and
+* return.
+*
+ SAME = .TRUE.
+ DO 40 I = 1, NARGS
+ SAME = SAME.AND.ISAME( I )
+ IF( .NOT.ISAME( I ) )
+ $ WRITE( NOUT, FMT = 9998 )I
+ 40 CONTINUE
+ IF( .NOT.SAME )THEN
+ FATAL = .TRUE.
+ GO TO 150
+ END IF
+*
+ IF( .NOT.NULL )THEN
+*
+* Check the result column by column.
+*
+ IF( CONJ )THEN
+ TRANST = 'C'
+ ELSE
+ TRANST = 'T'
+ END IF
+ JJAB = 1
+ JC = 1
+ DO 70 J = 1, N
+ IF( UPPER )THEN
+ JJ = 1
+ LJ = J
+ ELSE
+ JJ = J
+ LJ = N - J + 1
+ END IF
+ IF( TRAN )THEN
+ DO 50 I = 1, K
+ W( I ) = ALPHA*AB( ( J - 1 )*2*
+ $ NMAX + K + I )
+ IF( CONJ )THEN
+ W( K + I ) = CONJG( ALPHA )*
+ $ AB( ( J - 1 )*2*
+ $ NMAX + I )
+ ELSE
+ W( K + I ) = ALPHA*
+ $ AB( ( J - 1 )*2*
+ $ NMAX + I )
+ END IF
+ 50 CONTINUE
+ CALL CMMCH( TRANST, 'N', LJ, 1, 2*K,
+ $ ONE, AB( JJAB ), 2*NMAX, W,
+ $ 2*NMAX, BETA, C( JJ, J ),
+ $ NMAX, CT, G, CC( JC ), LDC,
+ $ EPS, ERR, FATAL, NOUT,
+ $ .TRUE. )
+ ELSE
+ DO 60 I = 1, K
+ IF( CONJ )THEN
+ W( I ) = ALPHA*CONJG( AB( ( K +
+ $ I - 1 )*NMAX + J ) )
+ W( K + I ) = CONJG( ALPHA*
+ $ AB( ( I - 1 )*NMAX +
+ $ J ) )
+ ELSE
+ W( I ) = ALPHA*AB( ( K + I - 1 )*
+ $ NMAX + J )
+ W( K + I ) = ALPHA*
+ $ AB( ( I - 1 )*NMAX +
+ $ J )
+ END IF
+ 60 CONTINUE
+ CALL CMMCH( 'N', 'N', LJ, 1, 2*K, ONE,
+ $ AB( JJ ), NMAX, W, 2*NMAX,
+ $ BETA, C( JJ, J ), NMAX, CT,
+ $ G, CC( JC ), LDC, EPS, ERR,
+ $ FATAL, NOUT, .TRUE. )
+ END IF
+ IF( UPPER )THEN
+ JC = JC + LDC
+ ELSE
+ JC = JC + LDC + 1
+ IF( TRAN )
+ $ JJAB = JJAB + 2*NMAX
+ END IF
+ ERRMAX = MAX( ERRMAX, ERR )
+* If got really bad answer, report and
+* return.
+ IF( FATAL )
+ $ GO TO 140
+ 70 CONTINUE
+ END IF
+*
+ 80 CONTINUE
+*
+ 90 CONTINUE
+*
+ 100 CONTINUE
+*
+ 110 CONTINUE
+*
+ 120 CONTINUE
+*
+ 130 CONTINUE
+*
+* Report result.
+*
+ IF( ERRMAX.LT.THRESH )THEN
+ IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC
+ IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC
+ ELSE
+ IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX
+ IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX
+ END IF
+ GO TO 160
+*
+ 140 CONTINUE
+ IF( N.GT.1 )
+ $ WRITE( NOUT, FMT = 9995 )J
+*
+ 150 CONTINUE
+ WRITE( NOUT, FMT = 9996 )SNAME
+ IF( CONJ )THEN
+ CALL CPRCN7( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K,
+ $ ALPHA, LDA, LDB, RBETA, LDC)
+ ELSE
+ CALL CPRCN5( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K,
+ $ ALPHA, LDA, LDB, BETA, LDC)
+ END IF
+*
+ 160 CONTINUE
+ RETURN
+*
+10003 FORMAT( ' ', A13,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ',
+ $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ',
+ $ 'RATIO ', F8.2, ' - SUSPECT *******' )
+10002 FORMAT( ' ', A13,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ',
+ $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ',
+ $ 'RATIO ', F8.2, ' - SUSPECT *******' )
+10001 FORMAT( ' ', A13,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS',
+ $ ' (', I6, ' CALL', 'S)' )
+10000 FORMAT( ' ', A13,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS',
+ $ ' (', I6, ' CALL', 'S)' )
+ 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+ $ 'ANGED INCORRECTLY *******' )
+ 9996 FORMAT( ' ******* ', A13,' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 )
+ 9994 FORMAT(1X, I6, ': ', A13,'(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+ $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1,
+ $ ', C,', I3, ') .' )
+ 9993 FORMAT(1X, I6, ': ', A13,'(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+ $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1,
+ $ ',', F4.1, '), C,', I3, ') .' )
+ 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+ $ '******' )
+*
+* End of CCHK5.
+*
+ END
+*
+ SUBROUTINE CPRCN5(NOUT, NC, SNAME, IORDER, UPLO, TRANSA,
+ $ N, K, ALPHA, LDA, LDB, BETA, LDC)
+*
+* Writes a trace record to unit NOUT for a CSYR2K-style call (the
+* COMPLEX-BETA case): call number NC, routine name SNAME, and the
+* full argument list with IORDER/UPLO/TRANSA rendered as the CBLAS
+* enumeration names.
+*
+ INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC
+ COMPLEX ALPHA, BETA
+ CHARACTER*1 UPLO, TRANSA
+ CHARACTER*13 SNAME
+ CHARACTER*14 CRC, CU, CA
+
+* Map UPLO to its CBLAS enumeration name.
+ IF (UPLO.EQ.'U')THEN
+ CU = ' CblasUpper'
+ ELSE
+ CU = ' CblasLower'
+ END IF
+* Map TRANSA; any value other than 'N'/'T' is reported as conjugate
+* transpose. 'CblasConjTrans' has no leading blank because it is
+* exactly 14 characters wide (the full width of CA).
+ IF (TRANSA.EQ.'N')THEN
+ CA = ' CblasNoTrans'
+ ELSE IF (TRANSA.EQ.'T')THEN
+ CA = ' CblasTrans'
+ ELSE
+ CA = 'CblasConjTrans'
+ END IF
+* Map IORDER (1 = row-major) to the CBLAS order name.
+ IF (IORDER.EQ.1)THEN
+ CRC = ' CblasRowMajor'
+ ELSE
+ CRC = ' CblasColMajor'
+ END IF
+ WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA
+ WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC
+
+ 9995 FORMAT( 1X, I6, ': ', A13,'(', 3( A14, ',') )
+ 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1, '), A,',
+ $ I3, ', B', I3, ', (', F4.1, ',', F4.1, '), C,', I3, ').' )
+ END
+*
+*
+ SUBROUTINE CPRCN7(NOUT, NC, SNAME, IORDER, UPLO, TRANSA,
+ $ N, K, ALPHA, LDA, LDB, BETA, LDC)
+*
+* Writes a trace record to unit NOUT for a CHER2K-style call (the
+* REAL-BETA case): call number NC, routine name SNAME, and the full
+* argument list with IORDER/UPLO/TRANSA rendered as the CBLAS
+* enumeration names. Identical to CPRCN5 except that BETA is REAL.
+*
+ INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC
+ COMPLEX ALPHA
+ REAL BETA
+ CHARACTER*1 UPLO, TRANSA
+ CHARACTER*13 SNAME
+ CHARACTER*14 CRC, CU, CA
+
+* Map UPLO to its CBLAS enumeration name.
+ IF (UPLO.EQ.'U')THEN
+ CU = ' CblasUpper'
+ ELSE
+ CU = ' CblasLower'
+ END IF
+* Map TRANSA; any value other than 'N'/'T' is reported as conjugate
+* transpose. 'CblasConjTrans' has no leading blank because it is
+* exactly 14 characters wide (the full width of CA).
+ IF (TRANSA.EQ.'N')THEN
+ CA = ' CblasNoTrans'
+ ELSE IF (TRANSA.EQ.'T')THEN
+ CA = ' CblasTrans'
+ ELSE
+ CA = 'CblasConjTrans'
+ END IF
+* Map IORDER (1 = row-major) to the CBLAS order name.
+ IF (IORDER.EQ.1)THEN
+ CRC = ' CblasRowMajor'
+ ELSE
+ CRC = ' CblasColMajor'
+ END IF
+ WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA
+ WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC
+
+ 9995 FORMAT( 1X, I6, ': ', A13,'(', 3( A14, ',') )
+ 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1, '), A,',
+ $ I3, ', B', I3, ',', F4.1, ', C,', I3, ').' )
+ END
+*
+ SUBROUTINE CMAKE(TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET,
+ $ TRANSL )
+*
+* Generates values for an M by N matrix A.
+* Stores the values in the array AA in the data structure required
+* by the routine, with unwanted elements set to rogue value.
+*
+* TYPE is 'ge', 'he', 'sy' or 'tr'.
+*
+* On exit A holds the full generated matrix (made Hermitian,
+* symmetric or triangular as requested) and AA holds the packed
+* column-major copy with leading dimension LDA. Elements of AA that
+* the routine under test must never reference are set to ROGUE so a
+* later comparison detects any out-of-bounds access. RESET is
+* passed through to CBEG to (re)seed the value generator and is
+* cleared by it; TRANSL is added to every generated entry.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Parameters ..
+ COMPLEX ZERO, ONE
+ PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) )
+ COMPLEX ROGUE
+ PARAMETER ( ROGUE = ( -1.0E10, 1.0E10 ) )
+ REAL RZERO
+ PARAMETER ( RZERO = 0.0 )
+ REAL RROGUE
+ PARAMETER ( RROGUE = -1.0E10 )
+* .. Scalar Arguments ..
+ COMPLEX TRANSL
+ INTEGER LDA, M, N, NMAX
+ LOGICAL RESET
+ CHARACTER*1 DIAG, UPLO
+ CHARACTER*2 TYPE
+* .. Array Arguments ..
+ COMPLEX A( NMAX, * ), AA( * )
+* .. Local Scalars ..
+ INTEGER I, IBEG, IEND, J, JJ
+ LOGICAL GEN, HER, LOWER, SYM, TRI, UNIT, UPPER
+* .. External Functions ..
+ COMPLEX CBEG
+ EXTERNAL CBEG
+* .. Intrinsic Functions ..
+ INTRINSIC CMPLX, CONJG, REAL
+* .. Executable Statements ..
+ GEN = TYPE.EQ.'ge'
+ HER = TYPE.EQ.'he'
+ SYM = TYPE.EQ.'sy'
+ TRI = TYPE.EQ.'tr'
+ UPPER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'U'
+ LOWER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'L'
+ UNIT = TRI.AND.DIAG.EQ.'U'
+*
+* Generate data in array A.
+*
+* Only the stored triangle is generated directly; the opposite
+* triangle is derived (conjugated, mirrored or zeroed) from it.
+ DO 20 J = 1, N
+ DO 10 I = 1, M
+ IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) )
+ $ THEN
+ A( I, J ) = CBEG( RESET ) + TRANSL
+ IF( I.NE.J )THEN
+* Set some elements to zero
+ IF( N.GT.3.AND.J.EQ.N/2 )
+ $ A( I, J ) = ZERO
+ IF( HER )THEN
+ A( J, I ) = CONJG( A( I, J ) )
+ ELSE IF( SYM )THEN
+ A( J, I ) = A( I, J )
+ ELSE IF( TRI )THEN
+ A( J, I ) = ZERO
+ END IF
+ END IF
+ END IF
+ 10 CONTINUE
+* A Hermitian matrix needs a real diagonal.
+ IF( HER )
+ $ A( J, J ) = CMPLX( REAL( A( J, J ) ), RZERO )
+ IF( TRI )
+ $ A( J, J ) = A( J, J ) + ONE
+ IF( UNIT )
+ $ A( J, J ) = ONE
+ 20 CONTINUE
+*
+* Store elements in array AS in data structure required by routine.
+*
+ IF( TYPE.EQ.'ge' )THEN
+ DO 50 J = 1, N
+ DO 30 I = 1, M
+ AA( I + ( J - 1 )*LDA ) = A( I, J )
+ 30 CONTINUE
+* Poison the unused tail of each column (rows M+1..LDA).
+ DO 40 I = M + 1, LDA
+ AA( I + ( J - 1 )*LDA ) = ROGUE
+ 40 CONTINUE
+ 50 CONTINUE
+ ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'sy'.OR.TYPE.EQ.'tr' )THEN
+ DO 90 J = 1, N
+* IBEG..IEND is the stored (referenced) part of column J; a
+* unit-diagonal triangular matrix excludes the diagonal too.
+ IF( UPPER )THEN
+ IBEG = 1
+ IF( UNIT )THEN
+ IEND = J - 1
+ ELSE
+ IEND = J
+ END IF
+ ELSE
+ IF( UNIT )THEN
+ IBEG = J + 1
+ ELSE
+ IBEG = J
+ END IF
+ IEND = N
+ END IF
+ DO 60 I = 1, IBEG - 1
+ AA( I + ( J - 1 )*LDA ) = ROGUE
+ 60 CONTINUE
+ DO 70 I = IBEG, IEND
+ AA( I + ( J - 1 )*LDA ) = A( I, J )
+ 70 CONTINUE
+ DO 80 I = IEND + 1, LDA
+ AA( I + ( J - 1 )*LDA ) = ROGUE
+ 80 CONTINUE
+* For a Hermitian matrix the imaginary part of the diagonal
+* must never be referenced, so poison it with RROGUE.
+ IF( HER )THEN
+ JJ = J + ( J - 1 )*LDA
+ AA( JJ ) = CMPLX( REAL( AA( JJ ) ), RROGUE )
+ END IF
+ 90 CONTINUE
+ END IF
+ RETURN
+*
+* End of CMAKE.
+*
+ END
+ SUBROUTINE CMMCH(TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB,
+ $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL,
+ $ NOUT, MV )
+*
+* Checks the results of the computational tests.
+*
+* Recomputes ALPHA*op(A)*op(B) + BETA*C column by column in CT
+* (with gauges in G) and compares against the computed result CC.
+* On exit ERR holds the largest test ratio found; FATAL is set
+* .TRUE. and a report written to NOUT when a ratio reaches
+* 1/SQRT(EPS). MV selects the column order of the failure report.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Parameters ..
+ COMPLEX ZERO
+ PARAMETER ( ZERO = ( 0.0, 0.0 ) )
+ REAL RZERO, RONE
+ PARAMETER ( RZERO = 0.0, RONE = 1.0 )
+* .. Scalar Arguments ..
+ COMPLEX ALPHA, BETA
+ REAL EPS, ERR
+ INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT
+ LOGICAL FATAL, MV
+ CHARACTER*1 TRANSA, TRANSB
+* .. Array Arguments ..
+ COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ),
+ $ CC( LDCC, * ), CT( * )
+ REAL G( * )
+* .. Local Scalars ..
+ COMPLEX CL
+ REAL ERRI
+ INTEGER I, J, K
+ LOGICAL CTRANA, CTRANB, TRANA, TRANB
+* .. Intrinsic Functions ..
+ INTRINSIC ABS, AIMAG, CONJG, MAX, REAL, SQRT
+* .. Statement Functions ..
+ REAL ABS1
+* .. Statement Function definitions ..
+ ABS1( CL ) = ABS( REAL( CL ) ) + ABS( AIMAG( CL ) )
+* .. Executable Statements ..
+ TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C'
+ TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C'
+ CTRANA = TRANSA.EQ.'C'
+ CTRANB = TRANSB.EQ.'C'
+*
+* Compute expected result, one column at a time, in CT using data
+* in A, B and C.
+* Compute gauges in G.
+*
+* The four TRANA/TRANB combinations (with and without conjugation)
+* are expanded into separate loops so each inner loop stays simple.
+ DO 220 J = 1, N
+*
+ DO 10 I = 1, M
+ CT( I ) = ZERO
+ G( I ) = RZERO
+ 10 CONTINUE
+ IF( .NOT.TRANA.AND..NOT.TRANB )THEN
+ DO 30 K = 1, KK
+ DO 20 I = 1, M
+ CT( I ) = CT( I ) + A( I, K )*B( K, J )
+ G( I ) = G( I ) + ABS1( A( I, K ) )*ABS1( B( K, J ) )
+ 20 CONTINUE
+ 30 CONTINUE
+ ELSE IF( TRANA.AND..NOT.TRANB )THEN
+ IF( CTRANA )THEN
+ DO 50 K = 1, KK
+ DO 40 I = 1, M
+ CT( I ) = CT( I ) + CONJG( A( K, I ) )*B( K, J )
+ G( I ) = G( I ) + ABS1( A( K, I ) )*
+ $ ABS1( B( K, J ) )
+ 40 CONTINUE
+ 50 CONTINUE
+ ELSE
+ DO 70 K = 1, KK
+ DO 60 I = 1, M
+ CT( I ) = CT( I ) + A( K, I )*B( K, J )
+ G( I ) = G( I ) + ABS1( A( K, I ) )*
+ $ ABS1( B( K, J ) )
+ 60 CONTINUE
+ 70 CONTINUE
+ END IF
+ ELSE IF( .NOT.TRANA.AND.TRANB )THEN
+ IF( CTRANB )THEN
+ DO 90 K = 1, KK
+ DO 80 I = 1, M
+ CT( I ) = CT( I ) + A( I, K )*CONJG( B( J, K ) )
+ G( I ) = G( I ) + ABS1( A( I, K ) )*
+ $ ABS1( B( J, K ) )
+ 80 CONTINUE
+ 90 CONTINUE
+ ELSE
+ DO 110 K = 1, KK
+ DO 100 I = 1, M
+ CT( I ) = CT( I ) + A( I, K )*B( J, K )
+ G( I ) = G( I ) + ABS1( A( I, K ) )*
+ $ ABS1( B( J, K ) )
+ 100 CONTINUE
+ 110 CONTINUE
+ END IF
+ ELSE IF( TRANA.AND.TRANB )THEN
+ IF( CTRANA )THEN
+ IF( CTRANB )THEN
+ DO 130 K = 1, KK
+ DO 120 I = 1, M
+ CT( I ) = CT( I ) + CONJG( A( K, I ) )*
+ $ CONJG( B( J, K ) )
+ G( I ) = G( I ) + ABS1( A( K, I ) )*
+ $ ABS1( B( J, K ) )
+ 120 CONTINUE
+ 130 CONTINUE
+ ELSE
+ DO 150 K = 1, KK
+ DO 140 I = 1, M
+ CT( I ) = CT( I ) + CONJG( A( K, I ) )*B( J, K )
+ G( I ) = G( I ) + ABS1( A( K, I ) )*
+ $ ABS1( B( J, K ) )
+ 140 CONTINUE
+ 150 CONTINUE
+ END IF
+ ELSE
+ IF( CTRANB )THEN
+ DO 170 K = 1, KK
+ DO 160 I = 1, M
+ CT( I ) = CT( I ) + A( K, I )*CONJG( B( J, K ) )
+ G( I ) = G( I ) + ABS1( A( K, I ) )*
+ $ ABS1( B( J, K ) )
+ 160 CONTINUE
+ 170 CONTINUE
+ ELSE
+ DO 190 K = 1, KK
+ DO 180 I = 1, M
+ CT( I ) = CT( I ) + A( K, I )*B( J, K )
+ G( I ) = G( I ) + ABS1( A( K, I ) )*
+ $ ABS1( B( J, K ) )
+ 180 CONTINUE
+ 190 CONTINUE
+ END IF
+ END IF
+ END IF
+ DO 200 I = 1, M
+ CT( I ) = ALPHA*CT( I ) + BETA*C( I, J )
+ G( I ) = ABS1( ALPHA )*G( I ) +
+ $ ABS1( BETA )*ABS1( C( I, J ) )
+ 200 CONTINUE
+*
+* Compute the error ratio for this result.
+*
+* ERR is REAL, so initialise it with RZERO rather than the COMPLEX
+* constant ZERO (which relied on implicit complex-to-real
+* conversion); this matches the reference BLAS test suite.
+ ERR = RZERO
+ DO 210 I = 1, M
+ ERRI = ABS1( CT( I ) - CC( I, J ) )/EPS
+ IF( G( I ).NE.RZERO )
+ $ ERRI = ERRI/G( I )
+ ERR = MAX( ERR, ERRI )
+ IF( ERR*SQRT( EPS ).GE.RONE )
+ $ GO TO 230
+ 210 CONTINUE
+*
+ 220 CONTINUE
+*
+* If the loop completes, all results are at least half accurate.
+ GO TO 250
+*
+* Report fatal error.
+*
+ 230 FATAL = .TRUE.
+ WRITE( NOUT, FMT = 9999 )
+ DO 240 I = 1, M
+ IF( MV )THEN
+ WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J )
+ ELSE
+ WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I )
+ END IF
+ 240 CONTINUE
+ IF( N.GT.1 )
+ $ WRITE( NOUT, FMT = 9997 )J
+*
+ 250 CONTINUE
+ RETURN
+*
+ 9999 FORMAT(' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL',
+ $ 'F ACCURATE *******', /' EXPECTED RE',
+ $ 'SULT COMPUTED RESULT' )
+ 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) )
+ 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 )
+*
+* End of CMMCH.
+*
+ END
+ LOGICAL FUNCTION LCE( RI, RJ, LR )
+*
+* Tests if two arrays are identical.
+*
+* Returns .TRUE. if and only if RI(1:LR) equals RJ(1:LR) element by
+* element under exact complex equality (no tolerance); used by the
+* tests to verify that input arguments were not modified.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Scalar Arguments ..
+ INTEGER LR
+* .. Array Arguments ..
+ COMPLEX RI( * ), RJ( * )
+* .. Local Scalars ..
+ INTEGER I
+* .. Executable Statements ..
+ DO 10 I = 1, LR
+ IF( RI( I ).NE.RJ( I ) )
+ $ GO TO 20
+ 10 CONTINUE
+ LCE = .TRUE.
+ GO TO 30
+ 20 CONTINUE
+ LCE = .FALSE.
+ 30 RETURN
+*
+* End of LCE.
+*
+ END
+ LOGICAL FUNCTION LCERES( TYPE, UPLO, M, N, AA, AS, LDA )
+*
+* Tests if selected elements in two arrays are equal.
+*
+* TYPE is 'ge' or 'he' or 'sy'.
+*
+* Only the elements OUTSIDE the result region are compared: for 'ge'
+* the padding rows M+1..LDA of each column, for 'he'/'sy' the
+* elements outside the UPLO triangle. Returns .TRUE. when all such
+* elements of AA and AS agree, i.e. the routine under test did not
+* write outside the part of the matrix it owns.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Scalar Arguments ..
+ INTEGER LDA, M, N
+ CHARACTER*1 UPLO
+ CHARACTER*2 TYPE
+* .. Array Arguments ..
+ COMPLEX AA( LDA, * ), AS( LDA, * )
+* .. Local Scalars ..
+ INTEGER I, IBEG, IEND, J
+ LOGICAL UPPER
+* .. Executable Statements ..
+ UPPER = UPLO.EQ.'U'
+ IF( TYPE.EQ.'ge' )THEN
+ DO 20 J = 1, N
+ DO 10 I = M + 1, LDA
+ IF( AA( I, J ).NE.AS( I, J ) )
+ $ GO TO 70
+ 10 CONTINUE
+ 20 CONTINUE
+ ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'sy' )THEN
+ DO 50 J = 1, N
+* IBEG..IEND is the stored part of column J; check everything
+* before and after it (including the LDA padding).
+ IF( UPPER )THEN
+ IBEG = 1
+ IEND = J
+ ELSE
+ IBEG = J
+ IEND = N
+ END IF
+ DO 30 I = 1, IBEG - 1
+ IF( AA( I, J ).NE.AS( I, J ) )
+ $ GO TO 70
+ 30 CONTINUE
+ DO 40 I = IEND + 1, LDA
+ IF( AA( I, J ).NE.AS( I, J ) )
+ $ GO TO 70
+ 40 CONTINUE
+ 50 CONTINUE
+ END IF
+*
+ 60 CONTINUE
+ LCERES = .TRUE.
+ GO TO 80
+ 70 CONTINUE
+ LCERES = .FALSE.
+ 80 RETURN
+*
+* End of LCERES.
+*
+ END
+ COMPLEX FUNCTION CBEG( RESET )
+*
+* Generates complex numbers as pairs of random numbers uniformly
+* distributed between -0.5 and 0.5.
+*
+* A pair of multiplicative congruential generators (state I, J with
+* multipliers MI, MJ, modulus 1000) held in SAVEd locals. Passing
+* RESET = .TRUE. reseeds the state; RESET is cleared on return so
+* subsequent calls continue the same deterministic sequence.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Scalar Arguments ..
+ LOGICAL RESET
+* .. Local Scalars ..
+ INTEGER I, IC, J, MI, MJ
+* .. Save statement ..
+ SAVE I, IC, J, MI, MJ
+* .. Intrinsic Functions ..
+ INTRINSIC CMPLX
+* .. Executable Statements ..
+ IF( RESET )THEN
+* Initialize local variables.
+ MI = 891
+ MJ = 457
+ I = 7
+ J = 7
+ IC = 0
+ RESET = .FALSE.
+ END IF
+*
+* The sequence of values of I or J is bounded between 1 and 999.
+* If initial I or J = 1,2,3,6,7 or 9, the period will be 50.
+* If initial I or J = 4 or 8, the period will be 25.
+* If initial I or J = 5, the period will be 10.
+* IC is used to break up the period by skipping 1 value of I or J
+* in 6.
+*
+ IC = IC + 1
+* Advance both generators; every 5th call advances twice to break
+* up the short natural period (the GO TO 10 retry).
+ 10 I = I*MI
+ J = J*MJ
+ I = I - 1000*( I/1000 )
+ J = J - 1000*( J/1000 )
+ IF( IC.GE.5 )THEN
+ IC = 0
+ GO TO 10
+ END IF
+* Map state in [1,999] to a value in roughly (-0.5, 0.5).
+ CBEG = CMPLX( ( I - 500 )/1001.0, ( J - 500 )/1001.0 )
+ RETURN
+*
+* End of CBEG.
+*
+ END
+ REAL FUNCTION SDIFF( X, Y )
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* Returns X - Y. NOTE(review): presumably kept as a separate
+* function (as in the reference BLAS test suites) so the
+* subtraction is performed through a call rather than inlined;
+* confirm against the driver before removing.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Scalar Arguments ..
+ REAL X, Y
+* .. Executable Statements ..
+ SDIFF = X - Y
+ RETURN
+*
+* End of SDIFF.
+*
+ END
diff --git a/ctest/c_z3chke.c b/ctest/c_z3chke.c
index df2513514..4be4457b4 100644
--- a/ctest/c_z3chke.c
+++ b/ctest/c_z3chke.c
@@ -45,8 +45,242 @@ void F77_z3chke(char * rout) {
F77_xerbla(cblas_rout,&cblas_info);
}
- if (strncmp( sf,"cblas_zgemm" ,11)==0) {
- cblas_rout = "cblas_zgemm" ;
+
+
+
+
+ if (strncmp( sf,"cblas_zgemm3m" ,13)==0) {
+ cblas_rout = "cblas_zgemm3" ;
+
+ cblas_info = 1;
+ cblas_zgemm3m( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 1;
+ cblas_zgemm3m( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 1;
+ cblas_zgemm3m( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 1;
+ cblas_zgemm3m( INVALID, CblasTrans, CblasTrans, 0, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 2; RowMajorStrg = FALSE;
+ cblas_zgemm3m( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 2; RowMajorStrg = FALSE;
+ cblas_zgemm3m( CblasColMajor, INVALID, CblasTrans, 0, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 3; RowMajorStrg = FALSE;
+ cblas_zgemm3m( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 3; RowMajorStrg = FALSE;
+ cblas_zgemm3m( CblasColMajor, CblasTrans, INVALID, 0, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 4; RowMajorStrg = FALSE;
+ cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 4; RowMajorStrg = FALSE;
+ cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 4; RowMajorStrg = FALSE;
+ cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 4; RowMajorStrg = FALSE;
+ cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 5; RowMajorStrg = FALSE;
+ cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 5; RowMajorStrg = FALSE;
+ cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 5; RowMajorStrg = FALSE;
+ cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 5; RowMajorStrg = FALSE;
+ cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 6; RowMajorStrg = FALSE;
+ cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 6; RowMajorStrg = FALSE;
+ cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 6; RowMajorStrg = FALSE;
+ cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 6; RowMajorStrg = FALSE;
+ cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 9; RowMajorStrg = FALSE;
+ cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 2 );
+ chkxer();
+ cblas_info = 9; RowMajorStrg = FALSE;
+ cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 2 );
+ chkxer();
+ cblas_info = 9; RowMajorStrg = FALSE;
+ cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2,
+ ALPHA, A, 1, B, 2, BETA, C, 1 );
+ chkxer();
+ cblas_info = 9; RowMajorStrg = FALSE;
+ cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 11; RowMajorStrg = FALSE;
+ cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 11; RowMajorStrg = FALSE;
+ cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2,
+ ALPHA, A, 2, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 11; RowMajorStrg = FALSE;
+ cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 11; RowMajorStrg = FALSE;
+ cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 14; RowMajorStrg = FALSE;
+ cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0,
+ ALPHA, A, 2, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 14; RowMajorStrg = FALSE;
+ cblas_zgemm3m( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0,
+ ALPHA, A, 2, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 14; RowMajorStrg = FALSE;
+ cblas_zgemm3m( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 14; RowMajorStrg = FALSE;
+ cblas_zgemm3m( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 4; RowMajorStrg = TRUE;
+ cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 4; RowMajorStrg = TRUE;
+ cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 4; RowMajorStrg = TRUE;
+ cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 4; RowMajorStrg = TRUE;
+ cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 5; RowMajorStrg = TRUE;
+ cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 5; RowMajorStrg = TRUE;
+ cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 5; RowMajorStrg = TRUE;
+ cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 5; RowMajorStrg = TRUE;
+ cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 6; RowMajorStrg = TRUE;
+ cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 6; RowMajorStrg = TRUE;
+ cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 6; RowMajorStrg = TRUE;
+ cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 6; RowMajorStrg = TRUE;
+ cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 9; RowMajorStrg = TRUE;
+ cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2,
+ ALPHA, A, 1, B, 1, BETA, C, 2 );
+ chkxer();
+ cblas_info = 9; RowMajorStrg = TRUE;
+ cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2,
+ ALPHA, A, 1, B, 2, BETA, C, 2 );
+ chkxer();
+ cblas_info = 9; RowMajorStrg = TRUE;
+ cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0,
+ ALPHA, A, 1, B, 2, BETA, C, 1 );
+ chkxer();
+ cblas_info = 9; RowMajorStrg = TRUE;
+ cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 11; RowMajorStrg = TRUE;
+ cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 11; RowMajorStrg = TRUE;
+ cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0,
+ ALPHA, A, 2, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 11; RowMajorStrg = TRUE;
+ cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2,
+ ALPHA, A, 2, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 11; RowMajorStrg = TRUE;
+ cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 14; RowMajorStrg = TRUE;
+ cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0,
+ ALPHA, A, 1, B, 2, BETA, C, 1 );
+ chkxer();
+ cblas_info = 14; RowMajorStrg = TRUE;
+ cblas_zgemm3m( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+ cblas_info = 14; RowMajorStrg = TRUE;
+ cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0,
+ ALPHA, A, 1, B, 2, BETA, C, 1 );
+ chkxer();
+ cblas_info = 14; RowMajorStrg = TRUE;
+ cblas_zgemm3m( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0,
+ ALPHA, A, 1, B, 1, BETA, C, 1 );
+ chkxer();
+
+
+
+ } else if (strncmp( sf,"cblas_zgemm" ,11)==0) {
+ cblas_rout = "cblas_zgemm" ;
cblas_info = 1;
cblas_zgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0,
diff --git a/ctest/c_zblas3.c b/ctest/c_zblas3.c
index ad744110b..46ff467d0 100644
--- a/ctest/c_zblas3.c
+++ b/ctest/c_zblas3.c
@@ -562,3 +562,82 @@ void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
cblas_ztrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha,
a, *lda, b, *ldb);
}
+
+
+/* Fortran-callable wrapper around cblas_zgemm3m for the CBLAS test
+ * suite. Arguments arrive by reference (Fortran convention) with
+ * column-major matrices a, b, c. For the row-major test case the
+ * matrices are transposed into freshly allocated row-major copies
+ * (leading dimensions deliberately one larger than the minimum, to
+ * exercise non-tight strides), cblas_zgemm3m is called with
+ * CblasRowMajor, and the result is copied back into c. For the
+ * column-major case the arrays are passed through unchanged; any
+ * other order value is forwarded as UNDEFINED to exercise the
+ * error path. NOTE(review): malloc results are not checked, as in
+ * the sibling wrappers in this file. */
+void F77_zgemm3m(int *order, char *transpa, char *transpb, int *m, int *n,
+ int *k, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda,
+ CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta,
+ CBLAS_TEST_ZOMPLEX *c, int *ldc ) {
+
+ CBLAS_TEST_ZOMPLEX *A, *B, *C;
+ int i,j,LDA, LDB, LDC;
+ enum CBLAS_TRANSPOSE transa, transb;
+
+ get_transpose_type(transpa, &transa);
+ get_transpose_type(transpb, &transb);
+
+ if (*order == TEST_ROW_MJR) {
+ /* A is m x k when not transposed, k x m otherwise; build a
+ * row-major copy with one extra column of padding. */
+ if (transa == CblasNoTrans) {
+ LDA = *k+1;
+ A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
+ for( i=0; i<*m; i++ )
+ for( j=0; j<*k; j++ ) {
+ A[i*LDA+j].real=a[j*(*lda)+i].real;
+ A[i*LDA+j].imag=a[j*(*lda)+i].imag;
+ }
+ }
+ else {
+ LDA = *m+1;
+ A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
+ for( i=0; i<*k; i++ )
+ for( j=0; j<*m; j++ ) {
+ A[i*LDA+j].real=a[j*(*lda)+i].real;
+ A[i*LDA+j].imag=a[j*(*lda)+i].imag;
+ }
+ }
+
+ /* Same transposition for B (k x n or n x k). */
+ if (transb == CblasNoTrans) {
+ LDB = *n+1;
+ B=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_ZOMPLEX) );
+ for( i=0; i<*k; i++ )
+ for( j=0; j<*n; j++ ) {
+ B[i*LDB+j].real=b[j*(*ldb)+i].real;
+ B[i*LDB+j].imag=b[j*(*ldb)+i].imag;
+ }
+ }
+ else {
+ LDB = *k+1;
+ B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_ZOMPLEX));
+ for( i=0; i<*n; i++ )
+ for( j=0; j<*k; j++ ) {
+ B[i*LDB+j].real=b[j*(*ldb)+i].real;
+ B[i*LDB+j].imag=b[j*(*ldb)+i].imag;
+ }
+ }
+
+ /* C is m x n; copy in, compute row-major, copy back out. */
+ LDC = *n+1;
+ C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX));
+ for( j=0; j<*n; j++ )
+ for( i=0; i<*m; i++ ) {
+ C[i*LDC+j].real=c[j*(*ldc)+i].real;
+ C[i*LDC+j].imag=c[j*(*ldc)+i].imag;
+ }
+ cblas_zgemm3m( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA,
+ B, LDB, beta, C, LDC );
+ for( j=0; j<*n; j++ )
+ for( i=0; i<*m; i++ ) {
+ c[j*(*ldc)+i].real=C[i*LDC+j].real;
+ c[j*(*ldc)+i].imag=C[i*LDC+j].imag;
+ }
+ free(A);
+ free(B);
+ free(C);
+ }
+ else if (*order == TEST_COL_MJR)
+ cblas_zgemm3m( CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda,
+ b, *ldb, beta, c, *ldc );
+ else
+ /* Invalid order: forward UNDEFINED to exercise the error exit. */
+ cblas_zgemm3m( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda,
+ b, *ldb, beta, c, *ldc );
+}
+
+
diff --git a/ctest/c_zblat3_3m.f b/ctest/c_zblat3_3m.f
new file mode 100644
index 000000000..7390d8712
--- /dev/null
+++ b/ctest/c_zblat3_3m.f
@@ -0,0 +1,2791 @@
+ PROGRAM ZBLAT3
+*
+* Test program for the COMPLEX*16 Level 3 Blas (ZGEMM3M variant).
+*
+* The program must be driven by a short data file. The first 13 records
+* of the file are read using list-directed input, the last 9 records
+* are read using the format ( A13,L2 ). An annotated example of a data
+* file can be obtained by deleting the first 3 characters from the
+* following 22 lines:
+* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
+* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+* F LOGICAL FLAG, T TO STOP ON FAILURES.
+* T LOGICAL FLAG, T TO TEST ERROR EXITS.
+* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
+* 16.0 THRESHOLD VALUE OF TEST RATIO
+* 6 NUMBER OF VALUES OF N
+* 0 1 2 3 5 9 VALUES OF N
+* 3 NUMBER OF VALUES OF ALPHA
+* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA
+* 3 NUMBER OF VALUES OF BETA
+* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA
+* cblas_zgemm3m T PUT F FOR NO TEST. SAME COLUMNS.
+* cblas_zhemm T PUT F FOR NO TEST. SAME COLUMNS.
+* cblas_zsymm T PUT F FOR NO TEST. SAME COLUMNS.
+* cblas_ztrmm T PUT F FOR NO TEST. SAME COLUMNS.
+* cblas_ztrsm T PUT F FOR NO TEST. SAME COLUMNS.
+* cblas_zherk T PUT F FOR NO TEST. SAME COLUMNS.
+* cblas_zsyrk T PUT F FOR NO TEST. SAME COLUMNS.
+* cblas_zher2k T PUT F FOR NO TEST. SAME COLUMNS.
+* cblas_zsyr2k T PUT F FOR NO TEST. SAME COLUMNS.
+*
+* See:
+*
+* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S.
+* A Set of Level 3 Basic Linear Algebra Subprograms.
+*
+* Technical Memorandum No.88 (Revision 1), Mathematics and
+* Computer Science Division, Argonne National Laboratory, 9700
+* South Cass Avenue, Argonne, Illinois 60439, US.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Parameters ..
+ INTEGER NIN, NOUT
+ PARAMETER ( NIN = 5, NOUT = 6 )
+ INTEGER NSUBS
+ PARAMETER ( NSUBS = 9 )
+ COMPLEX*16 ZERO, ONE
+ PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ),
+ $ ONE = ( 1.0D0, 0.0D0 ) )
+ DOUBLE PRECISION RZERO, RHALF, RONE
+ PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 )
+ INTEGER NMAX
+ PARAMETER ( NMAX = 65 )
+ INTEGER NIDMAX, NALMAX, NBEMAX
+ PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 )
+* .. Local Scalars ..
+ DOUBLE PRECISION EPS, ERR, THRESH
+ INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NTRA,
+ $ LAYOUT
+ LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE,
+ $ TSTERR, CORDER, RORDER
+ CHARACTER*1 TRANSA, TRANSB
+ CHARACTER*13 SNAMET
+ CHARACTER*32 SNAPS
+* .. Local Arrays ..
+ COMPLEX*16 AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ),
+ $ ALF( NALMAX ), AS( NMAX*NMAX ),
+ $ BB( NMAX*NMAX ), BET( NBEMAX ),
+ $ BS( NMAX*NMAX ), C( NMAX, NMAX ),
+ $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ),
+ $ W( 2*NMAX )
+ DOUBLE PRECISION G( NMAX )
+ INTEGER IDIM( NIDMAX )
+ LOGICAL LTEST( NSUBS )
+ CHARACTER*13 SNAMES( NSUBS )
+* .. External Functions ..
+ DOUBLE PRECISION DDIFF
+ LOGICAL LZE
+ EXTERNAL DDIFF, LZE
+* .. External Subroutines ..
+ EXTERNAL ZCHK1, ZCHK2, ZCHK3, ZCHK4, ZCHK5,ZMMCH
+* .. Intrinsic Functions ..
+ INTRINSIC MAX, MIN
+* .. Scalars in Common ..
+ INTEGER INFOT, NOUTC
+ LOGICAL LERR, OK
+ CHARACTER*13 SRNAMT
+* .. Common blocks ..
+ COMMON /INFOC/INFOT, NOUTC, OK, LERR
+ COMMON /SRNAMC/SRNAMT
+* .. Data statements ..
+ DATA SNAMES/'cblas_zgemm3m ', 'cblas_zhemm ',
+ $ 'cblas_zsymm ', 'cblas_ztrmm ', 'cblas_ztrsm ',
+ $ 'cblas_zherk ', 'cblas_zsyrk ', 'cblas_zher2k',
+ $ 'cblas_zsyr2k'/
+* .. Executable Statements ..
+*
+ NOUTC = NOUT
+*
+* Read name and unit number for snapshot output file and open file.
+*
+ READ( NIN, FMT = * )SNAPS
+ READ( NIN, FMT = * )NTRA
+ TRACE = NTRA.GE.0
+ IF( TRACE )THEN
+ OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' )
+ END IF
+* Read the flag that directs rewinding of the snapshot file.
+ READ( NIN, FMT = * )REWI
+ REWI = REWI.AND.TRACE
+* Read the flag that directs stopping on any failure.
+ READ( NIN, FMT = * )SFATAL
+* Read the flag that indicates whether error exits are to be tested.
+ READ( NIN, FMT = * )TSTERR
+* Read the flag that indicates whether row-major data layout to be tested.
+ READ( NIN, FMT = * )LAYOUT
+* Read the threshold value of the test ratio
+ READ( NIN, FMT = * )THRESH
+*
+* Read and check the parameter values for the tests.
+*
+* Values of N
+ READ( NIN, FMT = * )NIDIM
+ IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN
+ WRITE( NOUT, FMT = 9997 )'N', NIDMAX
+ GO TO 220
+ END IF
+ READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM )
+ DO 10 I = 1, NIDIM
+ IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN
+ WRITE( NOUT, FMT = 9996 )NMAX
+ GO TO 220
+ END IF
+ 10 CONTINUE
+* Values of ALPHA
+ READ( NIN, FMT = * )NALF
+ IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN
+ WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX
+ GO TO 220
+ END IF
+ READ( NIN, FMT = * )( ALF( I ), I = 1, NALF )
+* Values of BETA
+ READ( NIN, FMT = * )NBET
+ IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN
+ WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX
+ GO TO 220
+ END IF
+ READ( NIN, FMT = * )( BET( I ), I = 1, NBET )
+*
+* Report values of parameters.
+*
+ WRITE( NOUT, FMT = 9995 )
+ WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM )
+ WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF )
+ WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET )
+ IF( .NOT.TSTERR )THEN
+ WRITE( NOUT, FMT = * )
+ WRITE( NOUT, FMT = 9984 )
+ END IF
+ WRITE( NOUT, FMT = * )
+ WRITE( NOUT, FMT = 9999 )THRESH
+ WRITE( NOUT, FMT = * )
+
+ RORDER = .FALSE.
+ CORDER = .FALSE.
+ IF (LAYOUT.EQ.2) THEN
+ RORDER = .TRUE.
+ CORDER = .TRUE.
+ WRITE( *, FMT = 10002 )
+ ELSE IF (LAYOUT.EQ.1) THEN
+ RORDER = .TRUE.
+ WRITE( *, FMT = 10001 )
+ ELSE IF (LAYOUT.EQ.0) THEN
+ CORDER = .TRUE.
+ WRITE( *, FMT = 10000 )
+ END IF
+ WRITE( *, FMT = * )
+
+*
+* Read names of subroutines and flags which indicate
+* whether they are to be tested.
+*
+ DO 20 I = 1, NSUBS
+ LTEST( I ) = .FALSE.
+ 20 CONTINUE
+ 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT
+ DO 40 I = 1, NSUBS
+ IF( SNAMET.EQ.SNAMES( I ) )
+ $ GO TO 50
+ 40 CONTINUE
+ WRITE( NOUT, FMT = 9990 )SNAMET
+ STOP
+ 50 LTEST( I ) = LTESTT
+ GO TO 30
+*
+ 60 CONTINUE
+ CLOSE ( NIN )
+*
+* Compute EPS (the machine precision).
+*
+ EPS = RONE
+ 70 CONTINUE
+ IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO )
+ $ GO TO 80
+ EPS = RHALF*EPS
+ GO TO 70
+ 80 CONTINUE
+ EPS = EPS + EPS
+ WRITE( NOUT, FMT = 9998 )EPS
+*
+* Check the reliability of ZMMCH using exact data.
+*
+ N = MIN( 32, NMAX )
+ DO 100 J = 1, N
+ DO 90 I = 1, N
+ AB( I, J ) = MAX( I - J + 1, 0 )
+ 90 CONTINUE
+ AB( J, NMAX + 1 ) = J
+ AB( 1, NMAX + J ) = J
+ C( J, 1 ) = ZERO
+ 100 CONTINUE
+ DO 110 J = 1, N
+ CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3
+ 110 CONTINUE
+* CC holds the exact result. On exit from ZMMCH CT holds
+* the result computed by ZMMCH.
+ TRANSA = 'N'
+ TRANSB = 'N'
+ CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+ $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+ $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+ SAME = LZE( CC, CT, N )
+ IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
+ WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+ STOP
+ END IF
+ TRANSB = 'C'
+ CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+ $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+ $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+ SAME = LZE( CC, CT, N )
+ IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
+ WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+ STOP
+ END IF
+ DO 120 J = 1, N
+ AB( J, NMAX + 1 ) = N - J + 1
+ AB( 1, NMAX + J ) = N - J + 1
+ 120 CONTINUE
+ DO 130 J = 1, N
+ CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 -
+ $ ( ( J + 1 )*J*( J - 1 ) )/3
+ 130 CONTINUE
+ TRANSA = 'C'
+ TRANSB = 'N'
+ CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+ $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+ $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+ SAME = LZE( CC, CT, N )
+ IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
+ WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+ STOP
+ END IF
+ TRANSB = 'C'
+ CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+ $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+ $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+ SAME = LZE( CC, CT, N )
+ IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
+ WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+ STOP
+ END IF
+*
+* Test each subroutine in turn.
+*
+ DO 200 ISNUM = 1, NSUBS
+ WRITE( NOUT, FMT = * )
+ IF( .NOT.LTEST( ISNUM ) )THEN
+* Subprogram is not to be tested.
+ WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM )
+ ELSE
+ SRNAMT = SNAMES( ISNUM )
+* Test error exits.
+ IF( TSTERR )THEN
+ CALL CZ3CHKE( SNAMES( ISNUM ) )
+ WRITE( NOUT, FMT = * )
+ END IF
+* Test computations.
+ INFOT = 0
+ OK = .TRUE.
+ FATAL = .FALSE.
+ GO TO ( 140, 150, 150, 160, 160, 170, 170,
+ $ 180, 180 )ISNUM
+* Test ZGEMM3M, 01.
+ 140 IF (CORDER) THEN
+ CALL ZCHK1(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+ $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+ $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C,
+ $ CC, CS, CT, G, 0 )
+ END IF
+ IF (RORDER) THEN
+ CALL ZCHK1(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+ $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+ $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C,
+ $ CC, CS, CT, G, 1 )
+ END IF
+ GO TO 190
+* Test ZHEMM, 02, ZSYMM, 03.
+ 150 IF (CORDER) THEN
+ CALL ZCHK2(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+ $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+ $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C,
+ $ CC, CS, CT, G, 0 )
+ END IF
+ IF (RORDER) THEN
+ CALL ZCHK2(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+ $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+ $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C,
+ $ CC, CS, CT, G, 1 )
+ END IF
+ GO TO 190
+* Test ZTRMM, 04, ZTRSM, 05.
+ 160 IF (CORDER) THEN
+ CALL ZCHK3(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+ $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB,
+ $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C,
+ $ 0 )
+ END IF
+ IF (RORDER) THEN
+ CALL ZCHK3(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+ $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB,
+ $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C,
+ $ 1 )
+ END IF
+ GO TO 190
+* Test ZHERK, 06, ZSYRK, 07.
+ 170 IF (CORDER) THEN
+ CALL ZCHK4(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+ $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+ $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C,
+ $ CC, CS, CT, G, 0 )
+ END IF
+ IF (RORDER) THEN
+ CALL ZCHK4(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+ $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+ $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C,
+ $ CC, CS, CT, G, 1 )
+ END IF
+ GO TO 190
+* Test ZHER2K, 08, ZSYR2K, 09.
+ 180 IF (CORDER) THEN
+ CALL ZCHK5(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+ $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+ $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W,
+ $ 0 )
+ END IF
+ IF (RORDER) THEN
+ CALL ZCHK5(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+ $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+ $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W,
+ $ 1 )
+ END IF
+ GO TO 190
+*
+ 190 IF( FATAL.AND.SFATAL )
+ $ GO TO 210
+ END IF
+ 200 CONTINUE
+ WRITE( NOUT, FMT = 9986 )
+ GO TO 230
+*
+ 210 CONTINUE
+ WRITE( NOUT, FMT = 9985 )
+ GO TO 230
+*
+ 220 CONTINUE
+ WRITE( NOUT, FMT = 9991 )
+*
+ 230 CONTINUE
+ IF( TRACE )
+ $ CLOSE ( NTRA )
+ CLOSE ( NOUT )
+ STOP
+*
+10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' )
+10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' )
+10000 FORMAT(' COLUMN-MAJOR DATA LAYOUT IS TESTED' )
+ 9999 FORMAT(' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES',
+ $ 'S THAN', F8.2 )
+ 9998 FORMAT(' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 )
+ 9997 FORMAT(' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ',
+ $ 'THAN ', I2 )
+ 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 )
+ 9995 FORMAT('TESTS OF THE COMPLEX*16 LEVEL 3 BLAS', //' THE F',
+ $ 'OLLOWING PARAMETER VALUES WILL BE USED:' )
+ 9994 FORMAT( ' FOR N ', 9I6 )
+ 9993 FORMAT( ' FOR ALPHA ',
+ $ 7( '(', F4.1, ',', F4.1, ') ', : ) )
+ 9992 FORMAT( ' FOR BETA ',
+ $ 7( '(', F4.1, ',', F4.1, ') ', : ) )
+ 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM',
+ $ /' ******* TESTS ABANDONED *******' )
+ 9990 FORMAT(' SUBPROGRAM NAME ', A13,' NOT RECOGNIZED', /' ******* T',
+ $ 'ESTS ABANDONED *******' )
+ 9989 FORMAT(' ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU',
+ $ 'ATED WRONGLY.', /' ZMMCH WAS CALLED WITH TRANSA = ', A1,
+ $ 'AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ',
+ $ ' ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ',
+ $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ',
+ $ '*******' )
+ 9988 FORMAT( A13,L2 )
+ 9987 FORMAT( 1X, A13,' WAS NOT TESTED' )
+ 9986 FORMAT( /' END OF TESTS' )
+ 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' )
+ 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' )
+*
+* End of ZBLAT3.
+*
+ END
+ SUBROUTINE ZCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+ $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+ $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G,
+ $ IORDER )
+*
+* Tests ZGEMM3M (via the CBLAS interface CZGEMM3M).
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Parameters ..
+ COMPLEX*16 ZERO
+ PARAMETER ( ZERO = ( 0.0, 0.0 ) )
+ DOUBLE PRECISION RZERO
+ PARAMETER ( RZERO = 0.0 )
+* .. Scalar Arguments ..
+ DOUBLE PRECISION EPS, THRESH
+ INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER
+ LOGICAL FATAL, REWI, TRACE
+ CHARACTER*13 SNAME
+* .. Array Arguments ..
+ COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+ $ AS( NMAX*NMAX ), B( NMAX, NMAX ),
+ $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ),
+ $ C( NMAX, NMAX ), CC( NMAX*NMAX ),
+ $ CS( NMAX*NMAX ), CT( NMAX )
+ DOUBLE PRECISION G( NMAX )
+ INTEGER IDIM( NIDIM )
+* .. Local Scalars ..
+ COMPLEX*16 ALPHA, ALS, BETA, BLS
+ DOUBLE PRECISION ERR, ERRMAX
+ INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA,
+ $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M,
+ $ MA, MB, MS, N, NA, NARGS, NB, NC, NS
+ LOGICAL NULL, RESET, SAME, TRANA, TRANB
+ CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB
+ CHARACTER*3 ICH
+* .. Local Arrays ..
+ LOGICAL ISAME( 13 )
+* .. External Functions ..
+ LOGICAL LZE, LZERES
+ EXTERNAL LZE, LZERES
+* .. External Subroutines ..
+ EXTERNAL CZGEMM3M, ZMAKE, ZMMCH
+* .. Intrinsic Functions ..
+ INTRINSIC MAX
+* .. Scalars in Common ..
+ INTEGER INFOT, NOUTC
+ LOGICAL LERR, OK
+* .. Common blocks ..
+ COMMON /INFOC/INFOT, NOUTC, OK, LERR
+* .. Data statements ..
+ DATA ICH/'NTC'/
+* .. Executable Statements ..
+*
+ NARGS = 13
+ NC = 0
+ RESET = .TRUE.
+ ERRMAX = RZERO
+*
+ DO 110 IM = 1, NIDIM
+ M = IDIM( IM )
+*
+ DO 100 IN = 1, NIDIM
+ N = IDIM( IN )
+* Set LDC to 1 more than minimum value if room.
+ LDC = M
+ IF( LDC.LT.NMAX )
+ $ LDC = LDC + 1
+* Skip tests if not enough room.
+ IF( LDC.GT.NMAX )
+ $ GO TO 100
+ LCC = LDC*N
+ NULL = N.LE.0.OR.M.LE.0
+*
+ DO 90 IK = 1, NIDIM
+ K = IDIM( IK )
+*
+ DO 80 ICA = 1, 3
+ TRANSA = ICH( ICA: ICA )
+ TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C'
+*
+ IF( TRANA )THEN
+ MA = K
+ NA = M
+ ELSE
+ MA = M
+ NA = K
+ END IF
+* Set LDA to 1 more than minimum value if room.
+ LDA = MA
+ IF( LDA.LT.NMAX )
+ $ LDA = LDA + 1
+* Skip tests if not enough room.
+ IF( LDA.GT.NMAX )
+ $ GO TO 80
+ LAA = LDA*NA
+*
+* Generate the matrix A.
+*
+ CALL ZMAKE( 'ge', ' ', ' ', MA, NA, A, NMAX, AA, LDA,
+ $ RESET, ZERO )
+*
+ DO 70 ICB = 1, 3
+ TRANSB = ICH( ICB: ICB )
+ TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C'
+*
+ IF( TRANB )THEN
+ MB = N
+ NB = K
+ ELSE
+ MB = K
+ NB = N
+ END IF
+* Set LDB to 1 more than minimum value if room.
+ LDB = MB
+ IF( LDB.LT.NMAX )
+ $ LDB = LDB + 1
+* Skip tests if not enough room.
+ IF( LDB.GT.NMAX )
+ $ GO TO 70
+ LBB = LDB*NB
+*
+* Generate the matrix B.
+*
+ CALL ZMAKE( 'ge', ' ', ' ', MB, NB, B, NMAX, BB,
+ $ LDB, RESET, ZERO )
+*
+ DO 60 IA = 1, NALF
+ ALPHA = ALF( IA )
+*
+ DO 50 IB = 1, NBET
+ BETA = BET( IB )
+*
+* Generate the matrix C.
+*
+ CALL ZMAKE( 'ge', ' ', ' ', M, N, C, NMAX,
+ $ CC, LDC, RESET, ZERO )
+*
+ NC = NC + 1
+*
+* Save every datum before calling the
+* subroutine.
+*
+ TRANAS = TRANSA
+ TRANBS = TRANSB
+ MS = M
+ NS = N
+ KS = K
+ ALS = ALPHA
+ DO 10 I = 1, LAA
+ AS( I ) = AA( I )
+ 10 CONTINUE
+ LDAS = LDA
+ DO 20 I = 1, LBB
+ BS( I ) = BB( I )
+ 20 CONTINUE
+ LDBS = LDB
+ BLS = BETA
+ DO 30 I = 1, LCC
+ CS( I ) = CC( I )
+ 30 CONTINUE
+ LDCS = LDC
+*
+* Call the subroutine.
+*
+ IF( TRACE )
+ $ CALL ZPRCN1(NTRA, NC, SNAME, IORDER,
+ $ TRANSA, TRANSB, M, N, K, ALPHA, LDA,
+ $ LDB, BETA, LDC)
+ IF( REWI )
+ $ REWIND NTRA
+ CALL CZGEMM3M( IORDER, TRANSA, TRANSB, M, N,
+ $ K, ALPHA, AA, LDA, BB, LDB,
+ $ BETA, CC, LDC )
+*
+* Check if error-exit was taken incorrectly.
+*
+ IF( .NOT.OK )THEN
+ WRITE( NOUT, FMT = 9994 )
+ FATAL = .TRUE.
+ GO TO 120
+ END IF
+*
+* See what data changed inside subroutines.
+*
+ ISAME( 1 ) = TRANSA.EQ.TRANAS
+ ISAME( 2 ) = TRANSB.EQ.TRANBS
+ ISAME( 3 ) = MS.EQ.M
+ ISAME( 4 ) = NS.EQ.N
+ ISAME( 5 ) = KS.EQ.K
+ ISAME( 6 ) = ALS.EQ.ALPHA
+ ISAME( 7 ) = LZE( AS, AA, LAA )
+ ISAME( 8 ) = LDAS.EQ.LDA
+ ISAME( 9 ) = LZE( BS, BB, LBB )
+ ISAME( 10 ) = LDBS.EQ.LDB
+ ISAME( 11 ) = BLS.EQ.BETA
+ IF( NULL )THEN
+ ISAME( 12 ) = LZE( CS, CC, LCC )
+ ELSE
+ ISAME( 12 ) = LZERES( 'ge', ' ', M, N, CS,
+ $ CC, LDC )
+ END IF
+ ISAME( 13 ) = LDCS.EQ.LDC
+*
+* If data was incorrectly changed, report
+* and return.
+*
+ SAME = .TRUE.
+ DO 40 I = 1, NARGS
+ SAME = SAME.AND.ISAME( I )
+ IF( .NOT.ISAME( I ) )
+ $ WRITE( NOUT, FMT = 9998 )I
+ 40 CONTINUE
+ IF( .NOT.SAME )THEN
+ FATAL = .TRUE.
+ GO TO 120
+ END IF
+*
+ IF( .NOT.NULL )THEN
+*
+* Check the result.
+*
+ CALL ZMMCH( TRANSA, TRANSB, M, N, K,
+ $ ALPHA, A, NMAX, B, NMAX, BETA,
+ $ C, NMAX, CT, G, CC, LDC, EPS,
+ $ ERR, FATAL, NOUT, .TRUE. )
+ ERRMAX = MAX( ERRMAX, ERR )
+* If got really bad answer, report and
+* return.
+ IF( FATAL )
+ $ GO TO 120
+ END IF
+*
+ 50 CONTINUE
+*
+ 60 CONTINUE
+*
+ 70 CONTINUE
+*
+ 80 CONTINUE
+*
+ 90 CONTINUE
+*
+ 100 CONTINUE
+*
+ 110 CONTINUE
+*
+* Report result.
+*
+ IF( ERRMAX.LT.THRESH )THEN
+ IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC
+ IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC
+ ELSE
+ IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX
+ IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX
+ END IF
+ GO TO 130
+*
+ 120 CONTINUE
+ WRITE( NOUT, FMT = 9996 )SNAME
+ CALL ZPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB,
+ $ M, N, K, ALPHA, LDA, LDB, BETA, LDC)
+*
+ 130 CONTINUE
+ RETURN
+*
+10003 FORMAT( ' ', A13,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ',
+ $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ',
+ $ 'RATIO ', F8.2, ' - SUSPECT *******' )
+10002 FORMAT( ' ', A13,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ',
+ $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ',
+ $ 'RATIO ', F8.2, ' - SUSPECT *******' )
+10001 FORMAT( ' ', A13,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS',
+ $ ' (', I6, ' CALL', 'S)' )
+10000 FORMAT( ' ', A13,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS',
+ $ ' (', I6, ' CALL', 'S)' )
+ 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+ $ 'ANGED INCORRECTLY *******' )
+ 9996 FORMAT( ' ******* ', A13,' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A13,'(''', A1, ''',''', A1, ''',',
+ $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3,
+ $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' )
+ 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+ $ '******' )
+*
+* End of ZCHK1.
+*
+ END
+*
+ SUBROUTINE ZPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, M, N,
+ $ K, ALPHA, LDA, LDB, BETA, LDC)
+ INTEGER NOUT, NC, IORDER, M, N, K, LDA, LDB, LDC
+ DOUBLE COMPLEX ALPHA, BETA
+ CHARACTER*1 TRANSA, TRANSB
+ CHARACTER*13 SNAME
+ CHARACTER*14 CRC, CTA,CTB
+
+ IF (TRANSA.EQ.'N')THEN
+ CTA = ' CblasNoTrans'
+ ELSE IF (TRANSA.EQ.'T')THEN
+ CTA = ' CblasTrans'
+ ELSE
+ CTA = 'CblasConjTrans'
+ END IF
+ IF (TRANSB.EQ.'N')THEN
+ CTB = ' CblasNoTrans'
+ ELSE IF (TRANSB.EQ.'T')THEN
+ CTB = ' CblasTrans'
+ ELSE
+ CTB = 'CblasConjTrans'
+ END IF
+ IF (IORDER.EQ.1)THEN
+ CRC = ' CblasRowMajor'
+ ELSE
+ CRC = ' CblasColMajor'
+ END IF
+ WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CTA,CTB
+ WRITE(NOUT, FMT = 9994)M, N, K, ALPHA, LDA, LDB, BETA, LDC
+
+ 9995 FORMAT( 1X, I6, ': ', A13,'(', A15, ',', A14, ',', A14, ',')
+ 9994 FORMAT( 10X, 3( I3, ',' ) ,' (', F4.1,',',F4.1,') , A,',
+ $ I3, ', B,', I3, ', (', F4.1,',',F4.1,') , C,', I3, ').' )
+ END
+*
+ SUBROUTINE ZCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+ $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+ $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G,
+ $ IORDER )
+*
+* Tests ZHEMM and ZSYMM.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Parameters ..
+ COMPLEX*16 ZERO
+ PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) )
+ DOUBLE PRECISION RZERO
+ PARAMETER ( RZERO = 0.0D0 )
+* .. Scalar Arguments ..
+ DOUBLE PRECISION EPS, THRESH
+ INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER
+ LOGICAL FATAL, REWI, TRACE
+ CHARACTER*13 SNAME
+* .. Array Arguments ..
+ COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+ $ AS( NMAX*NMAX ), B( NMAX, NMAX ),
+ $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ),
+ $ C( NMAX, NMAX ), CC( NMAX*NMAX ),
+ $ CS( NMAX*NMAX ), CT( NMAX )
+ DOUBLE PRECISION G( NMAX )
+ INTEGER IDIM( NIDIM )
+* .. Local Scalars ..
+ COMPLEX*16 ALPHA, ALS, BETA, BLS
+ DOUBLE PRECISION ERR, ERRMAX
+ INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC,
+ $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA,
+ $ NARGS, NC, NS
+ LOGICAL CONJ, LEFT, NULL, RESET, SAME
+ CHARACTER*1 SIDE, SIDES, UPLO, UPLOS
+ CHARACTER*2 ICHS, ICHU
+* .. Local Arrays ..
+ LOGICAL ISAME( 13 )
+* .. External Functions ..
+ LOGICAL LZE, LZERES
+ EXTERNAL LZE, LZERES
+* .. External Subroutines ..
+ EXTERNAL CZHEMM, ZMAKE, ZMMCH, CZSYMM
+* .. Intrinsic Functions ..
+ INTRINSIC MAX
+* .. Scalars in Common ..
+ INTEGER INFOT, NOUTC
+ LOGICAL LERR, OK
+* .. Common blocks ..
+ COMMON /INFOC/INFOT, NOUTC, OK, LERR
+* .. Data statements ..
+ DATA ICHS/'LR'/, ICHU/'UL'/
+* .. Executable Statements ..
+ CONJ = SNAME( 8: 9 ).EQ.'he'
+*
+ NARGS = 12
+ NC = 0
+ RESET = .TRUE.
+ ERRMAX = RZERO
+*
+ DO 100 IM = 1, NIDIM
+ M = IDIM( IM )
+*
+ DO 90 IN = 1, NIDIM
+ N = IDIM( IN )
+* Set LDC to 1 more than minimum value if room.
+ LDC = M
+ IF( LDC.LT.NMAX )
+ $ LDC = LDC + 1
+* Skip tests if not enough room.
+ IF( LDC.GT.NMAX )
+ $ GO TO 90
+ LCC = LDC*N
+ NULL = N.LE.0.OR.M.LE.0
+* Set LDB to 1 more than minimum value if room.
+ LDB = M
+ IF( LDB.LT.NMAX )
+ $ LDB = LDB + 1
+* Skip tests if not enough room.
+ IF( LDB.GT.NMAX )
+ $ GO TO 90
+ LBB = LDB*N
+*
+* Generate the matrix B.
+*
+ CALL ZMAKE( 'ge', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET,
+ $ ZERO )
+*
+ DO 80 ICS = 1, 2
+ SIDE = ICHS( ICS: ICS )
+ LEFT = SIDE.EQ.'L'
+*
+ IF( LEFT )THEN
+ NA = M
+ ELSE
+ NA = N
+ END IF
+* Set LDA to 1 more than minimum value if room.
+ LDA = NA
+ IF( LDA.LT.NMAX )
+ $ LDA = LDA + 1
+* Skip tests if not enough room.
+ IF( LDA.GT.NMAX )
+ $ GO TO 80
+ LAA = LDA*NA
+*
+ DO 70 ICU = 1, 2
+ UPLO = ICHU( ICU: ICU )
+*
+* Generate the hermitian or symmetric matrix A.
+*
+ CALL ZMAKE(SNAME( 8: 9 ), UPLO, ' ', NA, NA, A, NMAX,
+ $ AA, LDA, RESET, ZERO )
+*
+ DO 60 IA = 1, NALF
+ ALPHA = ALF( IA )
+*
+ DO 50 IB = 1, NBET
+ BETA = BET( IB )
+*
+* Generate the matrix C.
+*
+ CALL ZMAKE( 'ge', ' ', ' ', M, N, C, NMAX, CC,
+ $ LDC, RESET, ZERO )
+*
+ NC = NC + 1
+*
+* Save every datum before calling the
+* subroutine.
+*
+ SIDES = SIDE
+ UPLOS = UPLO
+ MS = M
+ NS = N
+ ALS = ALPHA
+ DO 10 I = 1, LAA
+ AS( I ) = AA( I )
+ 10 CONTINUE
+ LDAS = LDA
+ DO 20 I = 1, LBB
+ BS( I ) = BB( I )
+ 20 CONTINUE
+ LDBS = LDB
+ BLS = BETA
+ DO 30 I = 1, LCC
+ CS( I ) = CC( I )
+ 30 CONTINUE
+ LDCS = LDC
+*
+* Call the subroutine.
+*
+ IF( TRACE )
+ $ CALL ZPRCN2(NTRA, NC, SNAME, IORDER,
+ $ SIDE, UPLO, M, N, ALPHA, LDA, LDB,
+ $ BETA, LDC)
+ IF( REWI )
+ $ REWIND NTRA
+ IF( CONJ )THEN
+ CALL CZHEMM( IORDER, SIDE, UPLO, M, N,
+ $ ALPHA, AA, LDA, BB, LDB, BETA,
+ $ CC, LDC )
+ ELSE
+ CALL CZSYMM( IORDER, SIDE, UPLO, M, N,
+ $ ALPHA, AA, LDA, BB, LDB, BETA,
+ $ CC, LDC )
+ END IF
+*
+* Check if error-exit was taken incorrectly.
+*
+ IF( .NOT.OK )THEN
+ WRITE( NOUT, FMT = 9994 )
+ FATAL = .TRUE.
+ GO TO 110
+ END IF
+*
+* See what data changed inside subroutines.
+*
+ ISAME( 1 ) = SIDES.EQ.SIDE
+ ISAME( 2 ) = UPLOS.EQ.UPLO
+ ISAME( 3 ) = MS.EQ.M
+ ISAME( 4 ) = NS.EQ.N
+ ISAME( 5 ) = ALS.EQ.ALPHA
+ ISAME( 6 ) = LZE( AS, AA, LAA )
+ ISAME( 7 ) = LDAS.EQ.LDA
+ ISAME( 8 ) = LZE( BS, BB, LBB )
+ ISAME( 9 ) = LDBS.EQ.LDB
+ ISAME( 10 ) = BLS.EQ.BETA
+ IF( NULL )THEN
+ ISAME( 11 ) = LZE( CS, CC, LCC )
+ ELSE
+ ISAME( 11 ) = LZERES( 'ge', ' ', M, N, CS,
+ $ CC, LDC )
+ END IF
+ ISAME( 12 ) = LDCS.EQ.LDC
+*
+* If data was incorrectly changed, report and
+* return.
+*
+ SAME = .TRUE.
+ DO 40 I = 1, NARGS
+ SAME = SAME.AND.ISAME( I )
+ IF( .NOT.ISAME( I ) )
+ $ WRITE( NOUT, FMT = 9998 )I
+ 40 CONTINUE
+ IF( .NOT.SAME )THEN
+ FATAL = .TRUE.
+ GO TO 110
+ END IF
+*
+ IF( .NOT.NULL )THEN
+*
+* Check the result.
+*
+ IF( LEFT )THEN
+ CALL ZMMCH( 'N', 'N', M, N, M, ALPHA, A,
+ $ NMAX, B, NMAX, BETA, C, NMAX,
+ $ CT, G, CC, LDC, EPS, ERR,
+ $ FATAL, NOUT, .TRUE. )
+ ELSE
+ CALL ZMMCH( 'N', 'N', M, N, N, ALPHA, B,
+ $ NMAX, A, NMAX, BETA, C, NMAX,
+ $ CT, G, CC, LDC, EPS, ERR,
+ $ FATAL, NOUT, .TRUE. )
+ END IF
+ ERRMAX = MAX( ERRMAX, ERR )
+* If got really bad answer, report and
+* return.
+ IF( FATAL )
+ $ GO TO 110
+ END IF
+*
+ 50 CONTINUE
+*
+ 60 CONTINUE
+*
+ 70 CONTINUE
+*
+ 80 CONTINUE
+*
+ 90 CONTINUE
+*
+ 100 CONTINUE
+*
+* Report result.
+*
+ IF( ERRMAX.LT.THRESH )THEN
+ IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC
+ IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC
+ ELSE
+ IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX
+ IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX
+ END IF
+ GO TO 120
+*
+ 110 CONTINUE
+ WRITE( NOUT, FMT = 9996 )SNAME
+ CALL ZPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, ALPHA, LDA,
+ $ LDB, BETA, LDC)
+*
+ 120 CONTINUE
+ RETURN
+*
+10003 FORMAT( ' ', A13,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ',
+ $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ',
+ $ 'RATIO ', F8.2, ' - SUSPECT *******' )
+10002 FORMAT( ' ', A13,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ',
+ $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ',
+ $ 'RATIO ', F8.2, ' - SUSPECT *******' )
+10001 FORMAT( ' ', A13,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS',
+ $ ' (', I6, ' CALL', 'S)' )
+10000 FORMAT( ' ', A13,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS',
+ $ ' (', I6, ' CALL', 'S)' )
+ 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+ $ 'ANGED INCORRECTLY *******' )
+ 9996 FORMAT( ' ******* ', A13,' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT(1X, I6, ': ', A13,'(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+ $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1,
+ $ ',', F4.1, '), C,', I3, ') .' )
+ 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+ $ '******' )
+*
+* End of ZCHK2.
+*
+ END
+*
+ SUBROUTINE ZPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N,
+ $ ALPHA, LDA, LDB, BETA, LDC)
+ INTEGER NOUT, NC, IORDER, M, N, LDA, LDB, LDC
+ DOUBLE COMPLEX ALPHA, BETA
+ CHARACTER*1 SIDE, UPLO
+ CHARACTER*13 SNAME
+ CHARACTER*14 CRC, CS,CU
+
+ IF (SIDE.EQ.'L')THEN
+ CS = ' CblasLeft'
+ ELSE
+ CS = ' CblasRight'
+ END IF
+ IF (UPLO.EQ.'U')THEN
+ CU = ' CblasUpper'
+ ELSE
+ CU = ' CblasLower'
+ END IF
+ IF (IORDER.EQ.1)THEN
+ CRC = ' CblasRowMajor'
+ ELSE
+ CRC = ' CblasColMajor'
+ END IF
+ WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU
+ WRITE(NOUT, FMT = 9994)M, N, ALPHA, LDA, LDB, BETA, LDC
+
+ 9995 FORMAT( 1X, I6, ': ', A13,'(', A15, ',', A14, ',', A14, ',')
+ 9994 FORMAT( 10X, 2( I3, ',' ),' (',F4.1,',',F4.1, '), A,', I3,
+ $ ', B,', I3, ', (',F4.1,',',F4.1, '), ', 'C,', I3, ').' )
+ END
+*
+ SUBROUTINE ZCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+ $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS,
+ $ B, BB, BS, CT, G, C, IORDER )
+*
+* Tests ZTRMM and ZTRSM.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Parameters ..
+ COMPLEX*16 ZERO, ONE
+ PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), ONE = ( 1.0D0, 0.0D0 ) )
+ DOUBLE PRECISION RZERO
+ PARAMETER ( RZERO = 0.0D0 )
+* .. Scalar Arguments ..
+ DOUBLE PRECISION EPS, THRESH
+ INTEGER NALF, NIDIM, NMAX, NOUT, NTRA, IORDER
+ LOGICAL FATAL, REWI, TRACE
+ CHARACTER*13 SNAME
+* .. Array Arguments ..
+ COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+ $ AS( NMAX*NMAX ), B( NMAX, NMAX ),
+ $ BB( NMAX*NMAX ), BS( NMAX*NMAX ),
+ $ C( NMAX, NMAX ), CT( NMAX )
+ DOUBLE PRECISION G( NMAX )
+ INTEGER IDIM( NIDIM )
+* .. Local Scalars ..
+ COMPLEX*16 ALPHA, ALS
+ DOUBLE PRECISION ERR, ERRMAX
+ INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB,
+ $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC,
+ $ NS
+ LOGICAL LEFT, NULL, RESET, SAME
+ CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO,
+ $ UPLOS
+ CHARACTER*2 ICHD, ICHS, ICHU
+ CHARACTER*3 ICHT
+* .. Local Arrays ..
+ LOGICAL ISAME( 13 )
+* .. External Functions ..
+ LOGICAL LZE, LZERES
+ EXTERNAL LZE, LZERES
+* .. External Subroutines ..
+ EXTERNAL ZMAKE, ZMMCH, CZTRMM, CZTRSM
+* .. Intrinsic Functions ..
+ INTRINSIC MAX
+* .. Scalars in Common ..
+ INTEGER INFOT, NOUTC
+ LOGICAL LERR, OK
+* .. Common blocks ..
+ COMMON /INFOC/INFOT, NOUTC, OK, LERR
+* .. Data statements ..
+ DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/
+* .. Executable Statements ..
+*
+ NARGS = 11
+ NC = 0
+ RESET = .TRUE.
+ ERRMAX = RZERO
+* Set up zero matrix for ZMMCH.
+ DO 20 J = 1, NMAX
+ DO 10 I = 1, NMAX
+ C( I, J ) = ZERO
+ 10 CONTINUE
+ 20 CONTINUE
+*
+ DO 140 IM = 1, NIDIM
+ M = IDIM( IM )
+*
+ DO 130 IN = 1, NIDIM
+ N = IDIM( IN )
+* Set LDB to 1 more than minimum value if room.
+ LDB = M
+ IF( LDB.LT.NMAX )
+ $ LDB = LDB + 1
+* Skip tests if not enough room.
+ IF( LDB.GT.NMAX )
+ $ GO TO 130
+ LBB = LDB*N
+ NULL = M.LE.0.OR.N.LE.0
+*
+ DO 120 ICS = 1, 2
+ SIDE = ICHS( ICS: ICS )
+ LEFT = SIDE.EQ.'L'
+ IF( LEFT )THEN
+ NA = M
+ ELSE
+ NA = N
+ END IF
+* Set LDA to 1 more than minimum value if room.
+ LDA = NA
+ IF( LDA.LT.NMAX )
+ $ LDA = LDA + 1
+* Skip tests if not enough room.
+ IF( LDA.GT.NMAX )
+ $ GO TO 130
+ LAA = LDA*NA
+*
+ DO 110 ICU = 1, 2
+ UPLO = ICHU( ICU: ICU )
+*
+ DO 100 ICT = 1, 3
+ TRANSA = ICHT( ICT: ICT )
+*
+ DO 90 ICD = 1, 2
+ DIAG = ICHD( ICD: ICD )
+*
+ DO 80 IA = 1, NALF
+ ALPHA = ALF( IA )
+*
+* Generate the matrix A.
+*
+ CALL ZMAKE( 'tr', UPLO, DIAG, NA, NA, A,
+ $ NMAX, AA, LDA, RESET, ZERO )
+*
+* Generate the matrix B.
+*
+ CALL ZMAKE( 'ge', ' ', ' ', M, N, B, NMAX,
+ $ BB, LDB, RESET, ZERO )
+*
+ NC = NC + 1
+*
+* Save every datum before calling the
+* subroutine.
+*
+ SIDES = SIDE
+ UPLOS = UPLO
+ TRANAS = TRANSA
+ DIAGS = DIAG
+ MS = M
+ NS = N
+ ALS = ALPHA
+ DO 30 I = 1, LAA
+ AS( I ) = AA( I )
+ 30 CONTINUE
+ LDAS = LDA
+ DO 40 I = 1, LBB
+ BS( I ) = BB( I )
+ 40 CONTINUE
+ LDBS = LDB
+*
+* Call the subroutine.
+*
+ IF( SNAME( 10: 11 ).EQ.'mm' )THEN
+ IF( TRACE )
+ $ CALL ZPRCN3( NTRA, NC, SNAME, IORDER,
+ $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA,
+ $ LDA, LDB)
+ IF( REWI )
+ $ REWIND NTRA
+ CALL CZTRMM(IORDER, SIDE, UPLO, TRANSA,
+ $ DIAG, M, N, ALPHA, AA, LDA,
+ $ BB, LDB )
+ ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN
+ IF( TRACE )
+ $ CALL ZPRCN3( NTRA, NC, SNAME, IORDER,
+ $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA,
+ $ LDA, LDB)
+ IF( REWI )
+ $ REWIND NTRA
+ CALL CZTRSM(IORDER, SIDE, UPLO, TRANSA,
+ $ DIAG, M, N, ALPHA, AA, LDA,
+ $ BB, LDB )
+ END IF
+*
+* Check if error-exit was taken incorrectly.
+*
+ IF( .NOT.OK )THEN
+ WRITE( NOUT, FMT = 9994 )
+ FATAL = .TRUE.
+ GO TO 150
+ END IF
+*
+* See what data changed inside subroutines.
+*
+ ISAME( 1 ) = SIDES.EQ.SIDE
+ ISAME( 2 ) = UPLOS.EQ.UPLO
+ ISAME( 3 ) = TRANAS.EQ.TRANSA
+ ISAME( 4 ) = DIAGS.EQ.DIAG
+ ISAME( 5 ) = MS.EQ.M
+ ISAME( 6 ) = NS.EQ.N
+ ISAME( 7 ) = ALS.EQ.ALPHA
+ ISAME( 8 ) = LZE( AS, AA, LAA )
+ ISAME( 9 ) = LDAS.EQ.LDA
+ IF( NULL )THEN
+ ISAME( 10 ) = LZE( BS, BB, LBB )
+ ELSE
+ ISAME( 10 ) = LZERES( 'ge', ' ', M, N, BS,
+ $ BB, LDB )
+ END IF
+ ISAME( 11 ) = LDBS.EQ.LDB
+*
+* If data was incorrectly changed, report and
+* return.
+*
+ SAME = .TRUE.
+ DO 50 I = 1, NARGS
+ SAME = SAME.AND.ISAME( I )
+ IF( .NOT.ISAME( I ) )
+ $ WRITE( NOUT, FMT = 9998 )I
+ 50 CONTINUE
+ IF( .NOT.SAME )THEN
+ FATAL = .TRUE.
+ GO TO 150
+ END IF
+*
+ IF( .NOT.NULL )THEN
+ IF( SNAME( 10: 11 ).EQ.'mm' )THEN
+*
+* Check the result.
+*
+ IF( LEFT )THEN
+ CALL ZMMCH( TRANSA, 'N', M, N, M,
+ $ ALPHA, A, NMAX, B, NMAX,
+ $ ZERO, C, NMAX, CT, G,
+ $ BB, LDB, EPS, ERR,
+ $ FATAL, NOUT, .TRUE. )
+ ELSE
+ CALL ZMMCH( 'N', TRANSA, M, N, N,
+ $ ALPHA, B, NMAX, A, NMAX,
+ $ ZERO, C, NMAX, CT, G,
+ $ BB, LDB, EPS, ERR,
+ $ FATAL, NOUT, .TRUE. )
+ END IF
+ ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN
+*
+* Compute approximation to original
+* matrix.
+*
+ DO 70 J = 1, N
+ DO 60 I = 1, M
+ C( I, J ) = BB( I + ( J - 1 )*
+ $ LDB )
+ BB( I + ( J - 1 )*LDB ) = ALPHA*
+ $ B( I, J )
+ 60 CONTINUE
+ 70 CONTINUE
+*
+ IF( LEFT )THEN
+ CALL ZMMCH( TRANSA, 'N', M, N, M,
+ $ ONE, A, NMAX, C, NMAX,
+ $ ZERO, B, NMAX, CT, G,
+ $ BB, LDB, EPS, ERR,
+ $ FATAL, NOUT, .FALSE. )
+ ELSE
+ CALL ZMMCH( 'N', TRANSA, M, N, N,
+ $ ONE, C, NMAX, A, NMAX,
+ $ ZERO, B, NMAX, CT, G,
+ $ BB, LDB, EPS, ERR,
+ $ FATAL, NOUT, .FALSE. )
+ END IF
+ END IF
+ ERRMAX = MAX( ERRMAX, ERR )
+* If got really bad answer, report and
+* return.
+ IF( FATAL )
+ $ GO TO 150
+ END IF
+*
+ 80 CONTINUE
+*
+ 90 CONTINUE
+*
+ 100 CONTINUE
+*
+ 110 CONTINUE
+*
+ 120 CONTINUE
+*
+ 130 CONTINUE
+*
+ 140 CONTINUE
+*
+* Report result.
+*
+ IF( ERRMAX.LT.THRESH )THEN
+ IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC
+ IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC
+ ELSE
+ IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX
+ IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX
+ END IF
+ GO TO 160
+*
+ 150 CONTINUE
+ WRITE( NOUT, FMT = 9996 )SNAME
+ CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG,
+ $ M, N, ALPHA, LDA, LDB)
+*
+ 160 CONTINUE
+ RETURN
+*
+10003 FORMAT( ' ', A13,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ',
+ $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ',
+ $ 'RATIO ', F8.2, ' - SUSPECT *******' )
+10002 FORMAT( ' ', A13,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ',
+ $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ',
+ $ 'RATIO ', F8.2, ' - SUSPECT *******' )
+10001 FORMAT( ' ', A13,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS',
+ $ ' (', I6, ' CALL', 'S)' )
+10000 FORMAT( ' ', A13,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS',
+ $ ' (', I6, ' CALL', 'S)' )
+ 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+ $ 'ANGED INCORRECTLY *******' )
+ 9996 FORMAT(' ******* ', A13,' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT(1X, I6, ': ', A13,'(', 4( '''', A1, ''',' ), 2( I3, ',' ),
+ $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ',
+ $ ' .' )
+ 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+ $ '******' )
+*
+* End of ZCHK3.
+*
+ END
+*
+      SUBROUTINE ZPRCN3(NOUT, NC, SNAME, IORDER, SIDE, UPLO, TRANSA,
+     $                 DIAG, M, N, ALPHA, LDA, LDB)
+*
+*     Print the arguments of CBLAS trmm/trsm call number NC of routine
+*     SNAME on unit NOUT, rendering each character flag as the name of
+*     the corresponding CBLAS enumeration value.
+*
+      INTEGER          NOUT, NC, IORDER, M, N, LDA, LDB
+      DOUBLE COMPLEX   ALPHA
+      CHARACTER*1      SIDE, UPLO, TRANSA, DIAG
+      CHARACTER*13     SNAME
+      CHARACTER*14     CRC, CS, CU, CA, CD
+*
+*     Translate the flags; the translations are independent of one
+*     another, so the order here is immaterial.
+*
+      IF (IORDER.EQ.1)THEN
+         CRC = ' CblasRowMajor'
+      ELSE
+         CRC = ' CblasColMajor'
+      END IF
+      IF (SIDE.NE.'L')THEN
+         CS = ' CblasRight'
+      ELSE
+         CS = ' CblasLeft'
+      END IF
+      IF (UPLO.NE.'U')THEN
+         CU = ' CblasLower'
+      ELSE
+         CU = ' CblasUpper'
+      END IF
+      IF (TRANSA.EQ.'T')THEN
+         CA = ' CblasTrans'
+      ELSE IF (TRANSA.EQ.'N')THEN
+         CA = ' CblasNoTrans'
+      ELSE
+         CA = 'CblasConjTrans'
+      END IF
+      IF (DIAG.NE.'N')THEN
+         CD = ' CblasUnit'
+      ELSE
+         CD = ' CblasNonUnit'
+      END IF
+      WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU
+      WRITE(NOUT, FMT = 9994)CA, CD, M, N, ALPHA, LDA, LDB
+
+ 9995 FORMAT( 1X, I6, ': ', A13,'(', A15, ',', A14, ',', A14, ',')
+ 9994 FORMAT( 10X, 2( A15, ',') , 2( I3, ',' ), ' (', F4.1, ',',
+     $        F4.1, '), A,', I3, ', B,', I3, ').' )
+      END
+*
+      SUBROUTINE ZCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+     $                  A, AA, AS, B, BB, BS, C, CC, CS, CT, G,
+     $                  IORDER )
+*
+*  Tests ZHERK and ZSYRK.
+*
+*  SNAME( 8: 9 ) = 'he' selects the hermitian kernel (CZHERK); any
+*  other value selects the symmetric kernel (CZSYRK).  IORDER = 0
+*  exercises the column-major CBLAS interface, IORDER = 1 the
+*  row-major one.  Every dimension/option/scalar combination from
+*  IDIM/ALF/BET is tried; ERRMAX accumulates the worst test ratio and
+*  is compared against THRESH at the end.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO
+      PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ) )
+      DOUBLE PRECISION   RONE, RZERO
+      PARAMETER          ( RONE = 1.0D0, RZERO = 0.0D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   EPS, THRESH
+      INTEGER            NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*13       SNAME
+*     .. Array Arguments ..
+      COMPLEX*16         A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), B( NMAX, NMAX ),
+     $                   BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ),
+     $                   C( NMAX, NMAX ), CC( NMAX*NMAX ),
+     $                   CS( NMAX*NMAX ), CT( NMAX )
+      DOUBLE PRECISION   G( NMAX )
+      INTEGER            IDIM( NIDIM )
+*     .. Local Scalars ..
+      COMPLEX*16         ALPHA, ALS, BETA, BETS
+      DOUBLE PRECISION   ERR, ERRMAX, RALPHA, RALS, RBETA, RBETS
+      INTEGER            I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS,
+     $                   LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA,
+     $                   NARGS, NC, NS
+      LOGICAL            CONJ, NULL, RESET, SAME, TRAN, UPPER
+      CHARACTER*1        TRANS, TRANSS, TRANST, UPLO, UPLOS
+      CHARACTER*2        ICHT, ICHU
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LZE, LZERES
+      EXTERNAL           LZE, LZERES
+*     .. External Subroutines ..
+      EXTERNAL           CZHERK, ZMAKE, ZMMCH, CZSYRK
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCMPLX, MAX, DBLE
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICHT/'NC'/, ICHU/'UL'/
+*     .. Executable Statements ..
+      CONJ = SNAME( 8: 9 ).EQ.'he'
+*
+      NARGS = 10
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = RZERO
+*
+      DO 100 IN = 1, NIDIM
+         N = IDIM( IN )
+*        Set LDC to 1 more than minimum value if room.
+         LDC = N
+         IF( LDC.LT.NMAX )
+     $      LDC = LDC + 1
+*        Skip tests if not enough room.
+         IF( LDC.GT.NMAX )
+     $      GO TO 100
+         LCC = LDC*N
+*
+         DO 90 IK = 1, NIDIM
+            K = IDIM( IK )
+*
+            DO 80 ICT = 1, 2
+               TRANS = ICHT( ICT: ICT )
+               TRAN = TRANS.EQ.'C'
+*              The symmetric routine has no conjugate-transpose case:
+*              fold 'C' to 'T' when not testing the hermitian kernel.
+               IF( TRAN.AND..NOT.CONJ )
+     $            TRANS = 'T'
+               IF( TRAN )THEN
+                  MA = K
+                  NA = N
+               ELSE
+                  MA = N
+                  NA = K
+               END IF
+*              Set LDA to 1 more than minimum value if room.
+               LDA = MA
+               IF( LDA.LT.NMAX )
+     $            LDA = LDA + 1
+*              Skip tests if not enough room.
+               IF( LDA.GT.NMAX )
+     $            GO TO 80
+               LAA = LDA*NA
+*
+*              Generate the matrix A.
+*
+               CALL ZMAKE( 'ge', ' ', ' ', MA, NA, A, NMAX, AA, LDA,
+     $                     RESET, ZERO )
+*
+               DO 70 ICU = 1, 2
+                  UPLO = ICHU( ICU: ICU )
+                  UPPER = UPLO.EQ.'U'
+*
+                  DO 60 IA = 1, NALF
+                     ALPHA = ALF( IA )
+                     IF( CONJ )THEN
+*                       The hermitian kernel takes a real alpha.
+                        RALPHA = DBLE( ALPHA )
+                        ALPHA = DCMPLX( RALPHA, RZERO )
+                     END IF
+*
+                     DO 50 IB = 1, NBET
+                        BETA = BET( IB )
+                        IF( CONJ )THEN
+                           RBETA = DBLE( BETA )
+                           BETA = DCMPLX( RBETA, RZERO )
+                        END IF
+*                       NULL is .TRUE. when the call must leave C
+*                       unchanged.
+                        NULL = N.LE.0
+                        IF( CONJ )
+     $                     NULL = NULL.OR.( ( K.LE.0.OR.RALPHA.EQ.
+     $                            RZERO ).AND.RBETA.EQ.RONE )
+*
+*                       Generate the matrix C.
+*
+                        CALL ZMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, C,
+     $                              NMAX, CC, LDC, RESET, ZERO )
+*
+                        NC = NC + 1
+*
+*                       Save every datum before calling the subroutine.
+*
+                        UPLOS = UPLO
+                        TRANSS = TRANS
+                        NS = N
+                        KS = K
+                        IF( CONJ )THEN
+                           RALS = RALPHA
+                        ELSE
+                           ALS = ALPHA
+                        END IF
+                        DO 10 I = 1, LAA
+                           AS( I ) = AA( I )
+   10                   CONTINUE
+                        LDAS = LDA
+                        IF( CONJ )THEN
+                           RBETS = RBETA
+                        ELSE
+                           BETS = BETA
+                        END IF
+                        DO 20 I = 1, LCC
+                           CS( I ) = CC( I )
+   20                   CONTINUE
+                        LDCS = LDC
+*
+*                       Call the subroutine.
+*
+                        IF( CONJ )THEN
+                           IF( TRACE )
+     $                        CALL ZPRCN6( NTRA, NC, SNAME, IORDER,
+     $                        UPLO, TRANS, N, K, RALPHA, LDA, RBETA,
+     $                        LDC)
+                           IF( REWI )
+     $                        REWIND NTRA
+                           CALL CZHERK( IORDER, UPLO, TRANS, N, K,
+     $                                  RALPHA, AA, LDA, RBETA, CC,
+     $                                  LDC )
+                        ELSE
+                           IF( TRACE )
+     $                        CALL ZPRCN4( NTRA, NC, SNAME, IORDER,
+     $                        UPLO, TRANS, N, K, ALPHA, LDA, BETA, LDC)
+                           IF( REWI )
+     $                        REWIND NTRA
+                           CALL CZSYRK( IORDER, UPLO, TRANS, N, K,
+     $                                  ALPHA, AA, LDA, BETA, CC, LDC )
+                        END IF
+*
+*                       Check if error-exit was taken incorrectly.
+*
+                        IF( .NOT.OK )THEN
+                           WRITE( NOUT, FMT = 9992 )
+                           FATAL = .TRUE.
+                           GO TO 120
+                        END IF
+*
+*                       See what data changed inside subroutines.
+*
+                        ISAME( 1 ) = UPLOS.EQ.UPLO
+                        ISAME( 2 ) = TRANSS.EQ.TRANS
+                        ISAME( 3 ) = NS.EQ.N
+                        ISAME( 4 ) = KS.EQ.K
+                        IF( CONJ )THEN
+                           ISAME( 5 ) = RALS.EQ.RALPHA
+                        ELSE
+                           ISAME( 5 ) = ALS.EQ.ALPHA
+                        END IF
+                        ISAME( 6 ) = LZE( AS, AA, LAA )
+                        ISAME( 7 ) = LDAS.EQ.LDA
+                        IF( CONJ )THEN
+                           ISAME( 8 ) = RBETS.EQ.RBETA
+                        ELSE
+                           ISAME( 8 ) = BETS.EQ.BETA
+                        END IF
+                        IF( NULL )THEN
+                           ISAME( 9 ) = LZE( CS, CC, LCC )
+                        ELSE
+                           ISAME( 9 ) = LZERES( SNAME( 8: 9 ), UPLO, N,
+     $                                  N, CS, CC, LDC )
+                        END IF
+                        ISAME( 10 ) = LDCS.EQ.LDC
+*
+*                       If data was incorrectly changed, report and
+*                       return.
+*
+                        SAME = .TRUE.
+                        DO 30 I = 1, NARGS
+                           SAME = SAME.AND.ISAME( I )
+                           IF( .NOT.ISAME( I ) )
+     $                        WRITE( NOUT, FMT = 9998 )I
+   30                   CONTINUE
+                        IF( .NOT.SAME )THEN
+                           FATAL = .TRUE.
+                           GO TO 120
+                        END IF
+*
+                        IF( .NOT.NULL )THEN
+*
+*                          Check the result column by column.
+*
+                           IF( CONJ )THEN
+                              TRANST = 'C'
+                           ELSE
+                              TRANST = 'T'
+                           END IF
+*                          JC indexes the start of the stored part of
+*                          column J of C within CC.
+                           JC = 1
+                           DO 40 J = 1, N
+                              IF( UPPER )THEN
+                                 JJ = 1
+                                 LJ = J
+                              ELSE
+                                 JJ = J
+                                 LJ = N - J + 1
+                              END IF
+                              IF( TRAN )THEN
+                                 CALL ZMMCH( TRANST, 'N', LJ, 1, K,
+     $                                       ALPHA, A( 1, JJ ), NMAX,
+     $                                       A( 1, J ), NMAX, BETA,
+     $                                       C( JJ, J ), NMAX, CT, G,
+     $                                       CC( JC ), LDC, EPS, ERR,
+     $                                       FATAL, NOUT, .TRUE. )
+                              ELSE
+                                 CALL ZMMCH( 'N', TRANST, LJ, 1, K,
+     $                                       ALPHA, A( JJ, 1 ), NMAX,
+     $                                       A( J, 1 ), NMAX, BETA,
+     $                                       C( JJ, J ), NMAX, CT, G,
+     $                                       CC( JC ), LDC, EPS, ERR,
+     $                                       FATAL, NOUT, .TRUE. )
+                              END IF
+                              IF( UPPER )THEN
+                                 JC = JC + LDC
+                              ELSE
+                                 JC = JC + LDC + 1
+                              END IF
+                              ERRMAX = MAX( ERRMAX, ERR )
+*                             If got really bad answer, report and
+*                             return.
+                              IF( FATAL )
+     $                           GO TO 110
+   40                      CONTINUE
+                        END IF
+*
+   50                CONTINUE
+*
+   60             CONTINUE
+*
+   70          CONTINUE
+*
+   80       CONTINUE
+*
+   90    CONTINUE
+*
+  100 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC
+         IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC
+      ELSE
+         IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX
+         IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 130
+*
+  110 CONTINUE
+      IF( N.GT.1 )
+     $   WRITE( NOUT, FMT = 9995 )J
+*
+  120 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      IF( CONJ )THEN
+         CALL ZPRCN6( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, RALPHA,
+     $      LDA, rBETA, LDC)
+      ELSE
+         CALL ZPRCN4( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, ALPHA,
+     $      LDA, BETA, LDC)
+      END IF
+*
+  130 CONTINUE
+      RETURN
+*
+10003 FORMAT( ' ', A13,' COMPLETED THE ROW-MAJOR    COMPUTATIONAL ',
+     $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ',
+     $ 'RATIO ', F8.2, ' - SUSPECT *******' )
+10002 FORMAT( ' ', A13,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ',
+     $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ',
+     $ 'RATIO ', F8.2, ' - SUSPECT *******' )
+10001 FORMAT( ' ', A13,' PASSED THE ROW-MAJOR    COMPUTATIONAL TESTS',
+     $ ' (', I6, ' CALL', 'S)' )
+10000 FORMAT( ' ', A13,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS',
+     $ ' (', I6, ' CALL', 'S)' )
+ 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $ 'ANGED INCORRECTLY *******' )
+ 9996 FORMAT( ' ******* ', A13,' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( '      THESE ARE THE RESULTS FOR COLUMN ', I3 )
+ 9994 FORMAT(1X, I6, ': ', A13,'(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+     $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ')               ',
+     $ '          .' )
+ 9993 FORMAT(1X, I6, ': ', A13,'(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+     $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1,
+     $ '), C,', I3, ')          .' )
+ 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $ '******' )
+*
+*  End of ZCHK4.
+*
+      END
+*
+      SUBROUTINE ZPRCN4(NOUT, NC, SNAME, IORDER, UPLO, TRANSA,
+     $                 N, K, ALPHA, LDA, BETA, LDC)
+*
+*     Print the arguments of CBLAS zsyrk call number NC of routine
+*     SNAME on unit NOUT, rendering each character flag as the name of
+*     the corresponding CBLAS enumeration value.
+*
+      INTEGER          NOUT, NC, IORDER, N, K, LDA, LDC
+      DOUBLE COMPLEX   ALPHA, BETA
+      CHARACTER*1      UPLO, TRANSA
+      CHARACTER*13     SNAME
+      CHARACTER*14     CRC, CU, CA
+*
+*     The three flag translations are independent of one another.
+*
+      IF (IORDER.EQ.1)THEN
+         CRC = ' CblasRowMajor'
+      ELSE
+         CRC = ' CblasColMajor'
+      END IF
+      IF (UPLO.NE.'U')THEN
+         CU = ' CblasLower'
+      ELSE
+         CU = ' CblasUpper'
+      END IF
+      IF (TRANSA.EQ.'T')THEN
+         CA = ' CblasTrans'
+      ELSE IF (TRANSA.EQ.'N')THEN
+         CA = ' CblasNoTrans'
+      ELSE
+         CA = 'CblasConjTrans'
+      END IF
+      WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA
+      WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC
+
+ 9995 FORMAT( 1X, I6, ': ', A13,'(', 3( A15, ',') )
+ 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1 ,'), A,',
+     $        I3, ', (', F4.1,',', F4.1, '), C,', I3, ').' )
+      END
+*
+*
+      SUBROUTINE ZPRCN6(NOUT, NC, SNAME, IORDER, UPLO, TRANSA,
+     $                 N, K, ALPHA, LDA, BETA, LDC)
+*
+*     Print the arguments of CBLAS zherk call number NC of routine
+*     SNAME on unit NOUT.  ALPHA and BETA are real here (the hermitian
+*     rank-k update takes real scalars); the character flags are
+*     rendered as CBLAS enumeration names.
+*
+      INTEGER          NOUT, NC, IORDER, N, K, LDA, LDC
+      DOUBLE PRECISION ALPHA, BETA
+      CHARACTER*1      UPLO, TRANSA
+      CHARACTER*13     SNAME
+      CHARACTER*14     CRC, CU, CA
+*
+*     The three flag translations are independent of one another.
+*
+      IF (IORDER.EQ.1)THEN
+         CRC = ' CblasRowMajor'
+      ELSE
+         CRC = ' CblasColMajor'
+      END IF
+      IF (UPLO.NE.'U')THEN
+         CU = ' CblasLower'
+      ELSE
+         CU = ' CblasUpper'
+      END IF
+      IF (TRANSA.EQ.'T')THEN
+         CA = ' CblasTrans'
+      ELSE IF (TRANSA.EQ.'N')THEN
+         CA = ' CblasNoTrans'
+      ELSE
+         CA = 'CblasConjTrans'
+      END IF
+      WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA
+      WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC
+
+ 9995 FORMAT( 1X, I6, ': ', A13,'(', 3( A15, ',') )
+ 9994 FORMAT( 10X, 2( I3, ',' ),
+     $        F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ').' )
+      END
+*
+      SUBROUTINE ZCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+     $                  AB, AA, AS, BB, BS, C, CC, CS, CT, G, W,
+     $                  IORDER )
+*
+*  Tests ZHER2K and ZSYR2K.
+*
+*  SNAME( 8: 9 ) = 'he' selects the hermitian kernel (CZHER2K); any
+*  other value selects the symmetric kernel (CZSYR2K).  AB holds both
+*  operands: B is generated at an offset of K rows (when TRANS = 'C')
+*  or K columns (otherwise) from A within AB.  IORDER = 0 exercises
+*  the column-major CBLAS interface, IORDER = 1 the row-major one.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO, ONE
+      PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ), ONE = ( 1.0D0, 0.0D0 ) )
+      DOUBLE PRECISION   RONE, RZERO
+      PARAMETER          ( RONE = 1.0D0, RZERO = 0.0D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   EPS, THRESH
+      INTEGER            NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*13       SNAME
+*     .. Array Arguments ..
+      COMPLEX*16         AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ),
+     $                   ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ),
+     $                   BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ),
+     $                   CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ),
+     $                   W( 2*NMAX )
+      DOUBLE PRECISION   G( NMAX )
+      INTEGER            IDIM( NIDIM )
+*     .. Local Scalars ..
+      COMPLEX*16         ALPHA, ALS, BETA, BETS
+      DOUBLE PRECISION   ERR, ERRMAX, RBETA, RBETS
+      INTEGER            I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB,
+     $                   K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS,
+     $                   LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS
+      LOGICAL            CONJ, NULL, RESET, SAME, TRAN, UPPER
+      CHARACTER*1        TRANS, TRANSS, TRANST, UPLO, UPLOS
+      CHARACTER*2        ICHT, ICHU
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LZE, LZERES
+      EXTERNAL           LZE, LZERES
+*     .. External Subroutines ..
+      EXTERNAL           CZHER2K, ZMAKE, ZMMCH, CZSYR2K
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCMPLX, DCONJG, MAX, DBLE
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICHT/'NC'/, ICHU/'UL'/
+*     .. Executable Statements ..
+      CONJ = SNAME( 8: 9 ).EQ.'he'
+*
+      NARGS = 12
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = RZERO
+*
+      DO 130 IN = 1, NIDIM
+         N = IDIM( IN )
+*        Set LDC to 1 more than minimum value if room.
+         LDC = N
+         IF( LDC.LT.NMAX )
+     $      LDC = LDC + 1
+*        Skip tests if not enough room.
+         IF( LDC.GT.NMAX )
+     $      GO TO 130
+         LCC = LDC*N
+*
+         DO 120 IK = 1, NIDIM
+            K = IDIM( IK )
+*
+            DO 110 ICT = 1, 2
+               TRANS = ICHT( ICT: ICT )
+               TRAN = TRANS.EQ.'C'
+*              The symmetric routine has no conjugate-transpose case:
+*              fold 'C' to 'T' when not testing the hermitian kernel.
+               IF( TRAN.AND..NOT.CONJ )
+     $            TRANS = 'T'
+               IF( TRAN )THEN
+                  MA = K
+                  NA = N
+               ELSE
+                  MA = N
+                  NA = K
+               END IF
+*              Set LDA to 1 more than minimum value if room.
+               LDA = MA
+               IF( LDA.LT.NMAX )
+     $            LDA = LDA + 1
+*              Skip tests if not enough room.
+               IF( LDA.GT.NMAX )
+     $            GO TO 110
+               LAA = LDA*NA
+*
+*              Generate the matrix A.
+*
+               IF( TRAN )THEN
+                  CALL ZMAKE( 'ge', ' ', ' ', MA, NA, AB, 2*NMAX, AA,
+     $                        LDA, RESET, ZERO )
+               ELSE
+                  CALL ZMAKE( 'ge', ' ', ' ', MA, NA, AB, NMAX, AA, LDA,
+     $                        RESET, ZERO )
+               END IF
+*
+*              Generate the matrix B.
+*
+               LDB = LDA
+               LBB = LAA
+               IF( TRAN )THEN
+*                 B is generated K rows below A inside AB.
+                  CALL ZMAKE( 'ge', ' ', ' ', MA, NA, AB( K + 1 ),
+     $                        2*NMAX, BB, LDB, RESET, ZERO )
+               ELSE
+*                 B is generated K columns to the right of A inside AB.
+                  CALL ZMAKE( 'ge', ' ', ' ', MA, NA, AB( K*NMAX + 1 ),
+     $                        NMAX, BB, LDB, RESET, ZERO )
+               END IF
+*
+               DO 100 ICU = 1, 2
+                  UPLO = ICHU( ICU: ICU )
+                  UPPER = UPLO.EQ.'U'
+*
+                  DO 90 IA = 1, NALF
+                     ALPHA = ALF( IA )
+*
+                     DO 80 IB = 1, NBET
+                        BETA = BET( IB )
+                        IF( CONJ )THEN
+*                          The hermitian kernel takes a real beta.
+                           RBETA = DBLE( BETA )
+                           BETA = DCMPLX( RBETA, RZERO )
+                        END IF
+*                       NULL is .TRUE. when the call must leave C
+*                       unchanged.
+                        NULL = N.LE.0
+                        IF( CONJ )
+     $                     NULL = NULL.OR.( ( K.LE.0.OR.ALPHA.EQ.
+     $                            ZERO ).AND.RBETA.EQ.RONE )
+*
+*                       Generate the matrix C.
+*
+                        CALL ZMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, C,
+     $                              NMAX, CC, LDC, RESET, ZERO )
+*
+                        NC = NC + 1
+*
+*                       Save every datum before calling the subroutine.
+*
+                        UPLOS = UPLO
+                        TRANSS = TRANS
+                        NS = N
+                        KS = K
+                        ALS = ALPHA
+                        DO 10 I = 1, LAA
+                           AS( I ) = AA( I )
+   10                   CONTINUE
+                        LDAS = LDA
+                        DO 20 I = 1, LBB
+                           BS( I ) = BB( I )
+   20                   CONTINUE
+                        LDBS = LDB
+                        IF( CONJ )THEN
+                           RBETS = RBETA
+                        ELSE
+                           BETS = BETA
+                        END IF
+                        DO 30 I = 1, LCC
+                           CS( I ) = CC( I )
+   30                   CONTINUE
+                        LDCS = LDC
+*
+*                       Call the subroutine.
+*
+                        IF( CONJ )THEN
+                           IF( TRACE )
+     $                        CALL ZPRCN7( NTRA, NC, SNAME, IORDER,
+     $                        UPLO, TRANS, N, K, ALPHA, LDA, LDB,
+     $                        RBETA, LDC)
+                           IF( REWI )
+     $                        REWIND NTRA
+                           CALL CZHER2K( IORDER, UPLO, TRANS, N, K,
+     $                                   ALPHA, AA, LDA, BB, LDB, RBETA,
+     $                                   CC, LDC )
+                        ELSE
+                           IF( TRACE )
+     $                        CALL ZPRCN5( NTRA, NC, SNAME, IORDER,
+     $                        UPLO, TRANS, N, K, ALPHA, LDA, LDB,
+     $                        BETA, LDC)
+                           IF( REWI )
+     $                        REWIND NTRA
+                           CALL CZSYR2K( IORDER, UPLO, TRANS, N, K,
+     $                                   ALPHA, AA, LDA, BB, LDB, BETA,
+     $                                   CC, LDC )
+                        END IF
+*
+*                       Check if error-exit was taken incorrectly.
+*
+                        IF( .NOT.OK )THEN
+                           WRITE( NOUT, FMT = 9992 )
+                           FATAL = .TRUE.
+                           GO TO 150
+                        END IF
+*
+*                       See what data changed inside subroutines.
+*
+                        ISAME( 1 ) = UPLOS.EQ.UPLO
+                        ISAME( 2 ) = TRANSS.EQ.TRANS
+                        ISAME( 3 ) = NS.EQ.N
+                        ISAME( 4 ) = KS.EQ.K
+                        ISAME( 5 ) = ALS.EQ.ALPHA
+                        ISAME( 6 ) = LZE( AS, AA, LAA )
+                        ISAME( 7 ) = LDAS.EQ.LDA
+                        ISAME( 8 ) = LZE( BS, BB, LBB )
+                        ISAME( 9 ) = LDBS.EQ.LDB
+                        IF( CONJ )THEN
+                           ISAME( 10 ) = RBETS.EQ.RBETA
+                        ELSE
+                           ISAME( 10 ) = BETS.EQ.BETA
+                        END IF
+                        IF( NULL )THEN
+                           ISAME( 11 ) = LZE( CS, CC, LCC )
+                        ELSE
+                           ISAME( 11 ) = LZERES( 'he', UPLO, N, N, CS,
+     $                                   CC, LDC )
+                        END IF
+                        ISAME( 12 ) = LDCS.EQ.LDC
+*
+*                       If data was incorrectly changed, report and
+*                       return.
+*
+                        SAME = .TRUE.
+                        DO 40 I = 1, NARGS
+                           SAME = SAME.AND.ISAME( I )
+                           IF( .NOT.ISAME( I ) )
+     $                        WRITE( NOUT, FMT = 9998 )I
+   40                   CONTINUE
+                        IF( .NOT.SAME )THEN
+                           FATAL = .TRUE.
+                           GO TO 150
+                        END IF
+*
+                        IF( .NOT.NULL )THEN
+*
+*                          Check the result column by column.
+*
+                           IF( CONJ )THEN
+                              TRANST = 'C'
+                           ELSE
+                              TRANST = 'T'
+                           END IF
+                           JJAB = 1
+                           JC = 1
+                           DO 70 J = 1, N
+                              IF( UPPER )THEN
+                                 JJ = 1
+                                 LJ = J
+                              ELSE
+                                 JJ = J
+                                 LJ = N - J + 1
+                              END IF
+                              IF( TRAN )THEN
+*                                W holds alpha*b(j) followed by
+*                                conjg(alpha)*a(j) (alpha*a(j) in the
+*                                symmetric case).
+                                 DO 50 I = 1, K
+                                    W( I ) = ALPHA*AB( ( J - 1 )*2*
+     $                                       NMAX + K + I )
+                                    IF( CONJ )THEN
+                                       W( K + I ) = DCONJG( ALPHA )*
+     $                                              AB( ( J - 1 )*2*
+     $                                              NMAX + I )
+                                    ELSE
+                                       W( K + I ) = ALPHA*
+     $                                              AB( ( J - 1 )*2*
+     $                                              NMAX + I )
+                                    END IF
+   50                            CONTINUE
+                                 CALL ZMMCH( TRANST, 'N', LJ, 1, 2*K,
+     $                                       ONE, AB( JJAB ), 2*NMAX, W,
+     $                                       2*NMAX, BETA, C( JJ, J ),
+     $                                       NMAX, CT, G, CC( JC ), LDC,
+     $                                       EPS, ERR, FATAL, NOUT,
+     $                                       .TRUE. )
+                              ELSE
+                                 DO 60 I = 1, K
+                                    IF( CONJ )THEN
+                                       W( I ) = ALPHA*DCONJG( AB( ( K +
+     $                                          I - 1 )*NMAX + J ) )
+                                       W( K + I ) = DCONJG( ALPHA*
+     $                                              AB( ( I - 1 )*NMAX +
+     $                                              J ) )
+                                    ELSE
+                                       W( I ) = ALPHA*AB( ( K + I - 1 )*
+     $                                          NMAX + J )
+                                       W( K + I ) = ALPHA*
+     $                                              AB( ( I - 1 )*NMAX +
+     $                                              J )
+                                    END IF
+   60                            CONTINUE
+                                 CALL ZMMCH( 'N', 'N', LJ, 1, 2*K, ONE,
+     $                                       AB( JJ ), NMAX, W, 2*NMAX,
+     $                                       BETA, C( JJ, J ), NMAX, CT,
+     $                                       G, CC( JC ), LDC, EPS, ERR,
+     $                                       FATAL, NOUT, .TRUE. )
+                              END IF
+                              IF( UPPER )THEN
+                                 JC = JC + LDC
+                              ELSE
+                                 JC = JC + LDC + 1
+                                 IF( TRAN )
+     $                              JJAB = JJAB + 2*NMAX
+                              END IF
+                              ERRMAX = MAX( ERRMAX, ERR )
+*                             If got really bad answer, report and
+*                             return.
+                              IF( FATAL )
+     $                           GO TO 140
+   70                      CONTINUE
+                        END IF
+*
+   80                CONTINUE
+*
+   90             CONTINUE
+*
+  100          CONTINUE
+*
+  110       CONTINUE
+*
+  120    CONTINUE
+*
+  130 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC
+         IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC
+      ELSE
+         IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX
+         IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 160
+*
+  140 CONTINUE
+      IF( N.GT.1 )
+     $   WRITE( NOUT, FMT = 9995 )J
+*
+  150 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      IF( CONJ )THEN
+         CALL ZPRCN7( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K,
+     $      ALPHA, LDA, LDB, RBETA, LDC)
+      ELSE
+         CALL ZPRCN5( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K,
+     $      ALPHA, LDA, LDB, BETA, LDC)
+      END IF
+*
+  160 CONTINUE
+      RETURN
+*
+10003 FORMAT( ' ', A13,' COMPLETED THE ROW-MAJOR    COMPUTATIONAL ',
+     $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ',
+     $ 'RATIO ', F8.2, ' - SUSPECT *******' )
+10002 FORMAT( ' ', A13,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ',
+     $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ',
+     $ 'RATIO ', F8.2, ' - SUSPECT *******' )
+10001 FORMAT( ' ', A13,' PASSED THE ROW-MAJOR    COMPUTATIONAL TESTS',
+     $ ' (', I6, ' CALL', 'S)' )
+10000 FORMAT( ' ', A13,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS',
+     $ ' (', I6, ' CALL', 'S)' )
+ 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $ 'ANGED INCORRECTLY *******' )
+ 9996 FORMAT( ' ******* ', A13,' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( '      THESE ARE THE RESULTS FOR COLUMN ', I3 )
+ 9994 FORMAT(1X, I6, ': ', A13,'(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+     $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1,
+     $ ', C,', I3, ')           .' )
+ 9993 FORMAT(1X, I6, ': ', A13,'(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+     $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1,
+     $ ',', F4.1, '), C,', I3, ')    .' )
+ 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $ '******' )
+*
+*  End of ZCHK5.
+*
+      END
+*
+      SUBROUTINE ZPRCN5(NOUT, NC, SNAME, IORDER, UPLO, TRANSA,
+     $                 N, K, ALPHA, LDA, LDB, BETA, LDC)
+*
+*     Print the arguments of CBLAS zsyr2k call number NC of routine
+*     SNAME on unit NOUT, rendering each character flag as the name of
+*     the corresponding CBLAS enumeration value.
+*
+      INTEGER          NOUT, NC, IORDER, N, K, LDA, LDB, LDC
+      DOUBLE COMPLEX   ALPHA, BETA
+      CHARACTER*1      UPLO, TRANSA
+      CHARACTER*13     SNAME
+      CHARACTER*14     CRC, CU, CA
+
+      IF (UPLO.EQ.'U')THEN
+         CU = ' CblasUpper'
+      ELSE
+         CU = ' CblasLower'
+      END IF
+      IF (TRANSA.EQ.'N')THEN
+         CA = ' CblasNoTrans'
+      ELSE IF (TRANSA.EQ.'T')THEN
+         CA = ' CblasTrans'
+      ELSE
+         CA = 'CblasConjTrans'
+      END IF
+      IF (IORDER.EQ.1)THEN
+         CRC = ' CblasRowMajor'
+      ELSE
+         CRC = ' CblasColMajor'
+      END IF
+      WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA
+      WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC
+
+ 9995 FORMAT( 1X, I6, ': ', A13,'(', 3( A15, ',') )
+*     ', B,' (with comma) matches the trace layout used by ZPRCN3.
+ 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1, '), A,',
+     $        I3, ', B,', I3, ', (', F4.1, ',', F4.1, '), C,', I3,
+     $        ').' )
+      END
+*
+*
+      SUBROUTINE ZPRCN7(NOUT, NC, SNAME, IORDER, UPLO, TRANSA,
+     $                 N, K, ALPHA, LDA, LDB, BETA, LDC)
+*
+*     Print the arguments of CBLAS zher2k call number NC of routine
+*     SNAME on unit NOUT.  ALPHA is complex but BETA is real (the
+*     hermitian rank-2k update takes a real beta); the character flags
+*     are rendered as CBLAS enumeration names.
+*
+      INTEGER          NOUT, NC, IORDER, N, K, LDA, LDB, LDC
+      DOUBLE COMPLEX   ALPHA
+      DOUBLE PRECISION BETA
+      CHARACTER*1      UPLO, TRANSA
+      CHARACTER*13     SNAME
+      CHARACTER*14     CRC, CU, CA
+
+      IF (UPLO.EQ.'U')THEN
+         CU = ' CblasUpper'
+      ELSE
+         CU = ' CblasLower'
+      END IF
+      IF (TRANSA.EQ.'N')THEN
+         CA = ' CblasNoTrans'
+      ELSE IF (TRANSA.EQ.'T')THEN
+         CA = ' CblasTrans'
+      ELSE
+         CA = 'CblasConjTrans'
+      END IF
+      IF (IORDER.EQ.1)THEN
+         CRC = ' CblasRowMajor'
+      ELSE
+         CRC = ' CblasColMajor'
+      END IF
+      WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA
+      WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC
+
+ 9995 FORMAT( 1X, I6, ': ', A13,'(', 3( A15, ',') )
+*     ', B,' (with comma) matches the trace layout used by ZPRCN3.
+ 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1, '), A,',
+     $        I3, ', B,', I3, ',', F4.1, ', C,', I3, ').' )
+      END
+*
+      SUBROUTINE ZMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET,
+     $                  TRANSL )
+*
+*  Generates values for an M by N matrix A.
+*  Stores the values in the array AA in the data structure required
+*  by the routine, with unwanted elements set to rogue value.
+*
+*  TYPE is 'ge', 'he', 'sy' or 'tr'.
+*
+*  The rogue fill lets the callers detect any write outside the
+*  referenced part of AA (via their LZE/LZERES comparisons).
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO, ONE
+      PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ),
+     $                   ONE = ( 1.0D0, 0.0D0 ) )
+      COMPLEX*16         ROGUE
+      PARAMETER          ( ROGUE = ( -1.0D10, 1.0D10 ) )
+      DOUBLE PRECISION   RZERO
+      PARAMETER          ( RZERO = 0.0D0 )
+      DOUBLE PRECISION   RROGUE
+      PARAMETER          ( RROGUE = -1.0D10 )
+*     .. Scalar Arguments ..
+      COMPLEX*16         TRANSL
+      INTEGER            LDA, M, N, NMAX
+      LOGICAL            RESET
+      CHARACTER*1        DIAG, UPLO
+      CHARACTER*2        TYPE
+*     .. Array Arguments ..
+      COMPLEX*16         A( NMAX, * ), AA( * )
+*     .. Local Scalars ..
+      INTEGER            I, IBEG, IEND, J, JJ
+      LOGICAL            GEN, HER, LOWER, SYM, TRI, UNIT, UPPER
+*     .. External Functions ..
+      COMPLEX*16         ZBEG
+      EXTERNAL           ZBEG
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCMPLX, DCONJG, DBLE
+*     .. Executable Statements ..
+      GEN = TYPE.EQ.'ge'
+      HER = TYPE.EQ.'he'
+      SYM = TYPE.EQ.'sy'
+      TRI = TYPE.EQ.'tr'
+      UPPER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'U'
+      LOWER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'L'
+      UNIT = TRI.AND.DIAG.EQ.'U'
+*
+*     Generate data in array A.
+*
+*     ZBEG is an external value generator controlled by RESET;
+*     presumably it produces a deterministic sequence -- see its
+*     definition elsewhere in this file (TODO confirm).
+      DO 20 J = 1, N
+         DO 10 I = 1, M
+            IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) )
+     $          THEN
+               A( I, J ) = ZBEG( RESET ) + TRANSL
+               IF( I.NE.J )THEN
+*                 Set some elements to zero
+                  IF( N.GT.3.AND.J.EQ.N/2 )
+     $               A( I, J ) = ZERO
+*                 Mirror the generated triangle so A is a full
+*                 hermitian/symmetric matrix (or zero the other
+*                 triangle for the triangular case).
+                  IF( HER )THEN
+                     A( J, I ) = DCONJG( A( I, J ) )
+                  ELSE IF( SYM )THEN
+                     A( J, I ) = A( I, J )
+                  ELSE IF( TRI )THEN
+                     A( J, I ) = ZERO
+                  END IF
+               END IF
+            END IF
+   10    CONTINUE
+         IF( HER )
+     $      A( J, J ) = DCMPLX( DBLE( A( J, J ) ), RZERO )
+         IF( TRI )
+     $      A( J, J ) = A( J, J ) + ONE
+         IF( UNIT )
+     $      A( J, J ) = ONE
+   20 CONTINUE
+*
+*     Store elements in array AS in data structure required by routine.
+*
+      IF( TYPE.EQ.'ge' )THEN
+         DO 50 J = 1, N
+            DO 30 I = 1, M
+               AA( I + ( J - 1 )*LDA ) = A( I, J )
+   30       CONTINUE
+            DO 40 I = M + 1, LDA
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+   40       CONTINUE
+   50    CONTINUE
+      ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'sy'.OR.TYPE.EQ.'tr' )THEN
+         DO 90 J = 1, N
+*           [IBEG, IEND] is the stored part of column J; everything
+*           outside it is filled with the rogue value.
+            IF( UPPER )THEN
+               IBEG = 1
+               IF( UNIT )THEN
+                  IEND = J - 1
+               ELSE
+                  IEND = J
+               END IF
+            ELSE
+               IF( UNIT )THEN
+                  IBEG = J + 1
+               ELSE
+                  IBEG = J
+               END IF
+               IEND = N
+            END IF
+            DO 60 I = 1, IBEG - 1
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+   60       CONTINUE
+            DO 70 I = IBEG, IEND
+               AA( I + ( J - 1 )*LDA ) = A( I, J )
+   70       CONTINUE
+            DO 80 I = IEND + 1, LDA
+               AA( I + ( J - 1 )*LDA ) = ROGUE
+   80       CONTINUE
+            IF( HER )THEN
+*              NOTE(review): the imaginary part of the stored
+*              hermitian diagonal is set to a rogue value --
+*              presumably to check that routines never read it;
+*              confirm against the checking logic in the callers.
+               JJ = J + ( J - 1 )*LDA
+               AA( JJ ) = DCMPLX( DBLE( AA( JJ ) ), RROGUE )
+            END IF
+   90    CONTINUE
+      END IF
+      RETURN
+*
+*  End of ZMAKE.
+*
+      END
+      SUBROUTINE ZMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB,
+     $                  BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL,
+     $                  NOUT, MV )
+*
+*  Checks the results of the computational tests.
+*
+*  Forms the reference product alpha*op(A)*op(B) + beta*C one column
+*  at a time in CT, with a gauge (scale of the terms summed) in G,
+*  and compares it against the computed result CC.  ERR returns the
+*  largest error ratio in units of EPS; FATAL is set when a result is
+*  less than half accurate.  MV only affects the order in which the
+*  expected and computed values are printed on failure.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO
+      PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ) )
+      DOUBLE PRECISION   RZERO, RONE
+      PARAMETER          ( RZERO = 0.0D0, RONE = 1.0D0 )
+*     .. Scalar Arguments ..
+      COMPLEX*16         ALPHA, BETA
+      DOUBLE PRECISION   EPS, ERR
+      INTEGER            KK, LDA, LDB, LDC, LDCC, M, N, NOUT
+      LOGICAL            FATAL, MV
+      CHARACTER*1        TRANSA, TRANSB
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * ), B( LDB, * ), C( LDC, * ),
+     $                   CC( LDCC, * ), CT( * )
+      DOUBLE PRECISION   G( * )
+*     .. Local Scalars ..
+      COMPLEX*16         CL
+      DOUBLE PRECISION   ERRI
+      INTEGER            I, J, K
+      LOGICAL            CTRANA, CTRANB, TRANA, TRANB
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, DIMAG, DCONJG, MAX, DBLE, SQRT
+*     .. Statement Functions ..
+      DOUBLE PRECISION   ABS1
+*     .. Statement Function definitions ..
+*     ABS1 is the 1-norm-style modulus |Re| + |Im|.
+      ABS1( CL ) = ABS( DBLE( CL ) ) + ABS( DIMAG( CL ) )
+*     .. Executable Statements ..
+      TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C'
+      TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C'
+      CTRANA = TRANSA.EQ.'C'
+      CTRANB = TRANSB.EQ.'C'
+*
+*     Compute expected result, one column at a time, in CT using data
+*     in A, B and C.
+*     Compute gauges in G.
+*
+      DO 220 J = 1, N
+*
+         DO 10 I = 1, M
+            CT( I ) = ZERO
+            G( I ) = RZERO
+   10    CONTINUE
+*        One branch per transpose/conjugate combination of A and B.
+         IF( .NOT.TRANA.AND..NOT.TRANB )THEN
+            DO 30 K = 1, KK
+               DO 20 I = 1, M
+                  CT( I ) = CT( I ) + A( I, K )*B( K, J )
+                  G( I ) = G( I ) + ABS1( A( I, K ) )*ABS1( B( K, J ) )
+   20          CONTINUE
+   30       CONTINUE
+         ELSE IF( TRANA.AND..NOT.TRANB )THEN
+            IF( CTRANA )THEN
+               DO 50 K = 1, KK
+                  DO 40 I = 1, M
+                     CT( I ) = CT( I ) + DCONJG( A( K, I ) )*B( K, J )
+                     G( I ) = G( I ) + ABS1( A( K, I ) )*
+     $                        ABS1( B( K, J ) )
+   40             CONTINUE
+   50          CONTINUE
+            ELSE
+               DO 70 K = 1, KK
+                  DO 60 I = 1, M
+                     CT( I ) = CT( I ) + A( K, I )*B( K, J )
+                     G( I ) = G( I ) + ABS1( A( K, I ) )*
+     $                        ABS1( B( K, J ) )
+   60             CONTINUE
+   70          CONTINUE
+            END IF
+         ELSE IF( .NOT.TRANA.AND.TRANB )THEN
+            IF( CTRANB )THEN
+               DO 90 K = 1, KK
+                  DO 80 I = 1, M
+                     CT( I ) = CT( I ) + A( I, K )*DCONJG( B( J, K ) )
+                     G( I ) = G( I ) + ABS1( A( I, K ) )*
+     $                        ABS1( B( J, K ) )
+   80             CONTINUE
+   90          CONTINUE
+            ELSE
+               DO 110 K = 1, KK
+                  DO 100 I = 1, M
+                     CT( I ) = CT( I ) + A( I, K )*B( J, K )
+                     G( I ) = G( I ) + ABS1( A( I, K ) )*
+     $                        ABS1( B( J, K ) )
+  100             CONTINUE
+  110          CONTINUE
+            END IF
+         ELSE IF( TRANA.AND.TRANB )THEN
+            IF( CTRANA )THEN
+               IF( CTRANB )THEN
+                  DO 130 K = 1, KK
+                     DO 120 I = 1, M
+                        CT( I ) = CT( I ) + DCONJG( A( K, I ) )*
+     $                            DCONJG( B( J, K ) )
+                        G( I ) = G( I ) + ABS1( A( K, I ) )*
+     $                           ABS1( B( J, K ) )
+  120                CONTINUE
+  130             CONTINUE
+               ELSE
+                  DO 150 K = 1, KK
+                     DO 140 I = 1, M
+                        CT( I ) = CT( I ) + DCONJG( A( K, I ) )*
+     $                            B( J, K )
+                        G( I ) = G( I ) + ABS1( A( K, I ) )*
+     $                           ABS1( B( J, K ) )
+  140                CONTINUE
+  150             CONTINUE
+               END IF
+            ELSE
+               IF( CTRANB )THEN
+                  DO 170 K = 1, KK
+                     DO 160 I = 1, M
+                        CT( I ) = CT( I ) + A( K, I )*
+     $                            DCONJG( B( J, K ) )
+                        G( I ) = G( I ) + ABS1( A( K, I ) )*
+     $                           ABS1( B( J, K ) )
+  160                CONTINUE
+  170             CONTINUE
+               ELSE
+                  DO 190 K = 1, KK
+                     DO 180 I = 1, M
+                        CT( I ) = CT( I ) + A( K, I )*B( J, K )
+                        G( I ) = G( I ) + ABS1( A( K, I ) )*
+     $                           ABS1( B( J, K ) )
+  180                CONTINUE
+  190             CONTINUE
+               END IF
+            END IF
+         END IF
+         DO 200 I = 1, M
+            CT( I ) = ALPHA*CT( I ) + BETA*C( I, J )
+            G( I ) = ABS1( ALPHA )*G( I ) +
+     $               ABS1( BETA )*ABS1( C( I, J ) )
+  200    CONTINUE
+*
+*        Compute the error ratio for this result.
+*
+*        NOTE(review): ZERO is COMPLEX*16 here; the assignment to the
+*        DOUBLE PRECISION ERR relies on implicit conversion (value is
+*        still 0) -- RZERO would be the cleaner choice.
+         ERR = ZERO
+         DO 210 I = 1, M
+*           ERRI is the error in units of EPS, scaled by the gauge.
+            ERRI = ABS1( CT( I ) - CC( I, J ) )/EPS
+            IF( G( I ).NE.RZERO )
+     $         ERRI = ERRI/G( I )
+            ERR = MAX( ERR, ERRI )
+*           Fail as soon as a result is less than half accurate.
+            IF( ERR*SQRT( EPS ).GE.RONE )
+     $         GO TO 230
+  210    CONTINUE
+*
+  220 CONTINUE
+*
+*     If the loop completes, all results are at least half accurate.
+      GO TO 250
+*
+*     Report fatal error.
+*
+  230 FATAL = .TRUE.
+      WRITE( NOUT, FMT = 9999 )
+      DO 240 I = 1, M
+         IF( MV )THEN
+            WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J )
+         ELSE
+            WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I )
+         END IF
+  240 CONTINUE
+      IF( N.GT.1 )
+     $   WRITE( NOUT, FMT = 9997 )J
+*
+  250 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL',
+     $      'F ACCURATE *******', /'                       EXPECTED RE',
+     $      'SULT                    COMPUTED RESULT' )
+ 9998 FORMAT( 1X, I7, 2( '  (', G15.6, ',', G15.6, ')' ) )
+ 9997 FORMAT( '      THESE ARE THE RESULTS FOR COLUMN ', I3 )
+*
+*  End of ZMMCH.
+*
+      END
+ LOGICAL FUNCTION LZE( RI, RJ, LR )
+*
+* Tests if two arrays are identical.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Scalar Arguments ..
+ INTEGER LR
+* .. Array Arguments ..
+ COMPLEX*16 RI( * ), RJ( * )
+* .. Local Scalars ..
+ INTEGER I
+* .. Executable Statements ..
+ DO 10 I = 1, LR
+ IF( RI( I ).NE.RJ( I ) )
+ $ GO TO 20
+ 10 CONTINUE
+ LZE = .TRUE.
+ GO TO 30
+ 20 CONTINUE
+ LZE = .FALSE.
+ 30 RETURN
+*
+* End of LZE.
+*
+ END
+ LOGICAL FUNCTION LZERES( TYPE, UPLO, M, N, AA, AS, LDA )
+*
+* Tests if selected elements in two arrays are equal.
+*
+* TYPE is 'ge' or 'he' or 'sy'.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Scalar Arguments ..
+ INTEGER LDA, M, N
+ CHARACTER*1 UPLO
+ CHARACTER*2 TYPE
+* .. Array Arguments ..
+ COMPLEX*16 AA( LDA, * ), AS( LDA, * )
+* .. Local Scalars ..
+ INTEGER I, IBEG, IEND, J
+ LOGICAL UPPER
+* .. Executable Statements ..
+ UPPER = UPLO.EQ.'U'
+ IF( TYPE.EQ.'ge' )THEN
+ DO 20 J = 1, N
+ DO 10 I = M + 1, LDA
+ IF( AA( I, J ).NE.AS( I, J ) )
+ $ GO TO 70
+ 10 CONTINUE
+ 20 CONTINUE
+ ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'sy' )THEN
+ DO 50 J = 1, N
+ IF( UPPER )THEN
+ IBEG = 1
+ IEND = J
+ ELSE
+ IBEG = J
+ IEND = N
+ END IF
+ DO 30 I = 1, IBEG - 1
+ IF( AA( I, J ).NE.AS( I, J ) )
+ $ GO TO 70
+ 30 CONTINUE
+ DO 40 I = IEND + 1, LDA
+ IF( AA( I, J ).NE.AS( I, J ) )
+ $ GO TO 70
+ 40 CONTINUE
+ 50 CONTINUE
+ END IF
+*
+ 60 CONTINUE
+ LZERES = .TRUE.
+ GO TO 80
+ 70 CONTINUE
+ LZERES = .FALSE.
+ 80 RETURN
+*
+* End of LZERES.
+*
+ END
+ COMPLEX*16 FUNCTION ZBEG( RESET )
+*
+* Generates complex numbers as pairs of random numbers uniformly
+* distributed between -0.5 and 0.5.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Scalar Arguments ..
+ LOGICAL RESET
+* .. Local Scalars ..
+ INTEGER I, IC, J, MI, MJ
+* .. Save statement ..
+ SAVE I, IC, J, MI, MJ
+* .. Intrinsic Functions ..
+ INTRINSIC DCMPLX
+* .. Executable Statements ..
+ IF( RESET )THEN
+* Initialize local variables.
+ MI = 891
+ MJ = 457
+ I = 7
+ J = 7
+ IC = 0
+ RESET = .FALSE.
+ END IF
+*
+* The sequence of values of I or J is bounded between 1 and 999.
+* If initial I or J = 1,2,3,6,7 or 9, the period will be 50.
+* If initial I or J = 4 or 8, the period will be 25.
+* If initial I or J = 5, the period will be 10.
+* IC is used to break up the period by skipping 1 value of I or J
+* in 6.
+*
+ IC = IC + 1
+ 10 I = I*MI
+ J = J*MJ
+ I = I - 1000*( I/1000 )
+ J = J - 1000*( J/1000 )
+ IF( IC.GE.5 )THEN
+ IC = 0
+ GO TO 10
+ END IF
+ ZBEG = DCMPLX( ( I - 500 )/1001.0D0, ( J - 500 )/1001.0D0 )
+ RETURN
+*
+* End of ZBEG.
+*
+ END
+ DOUBLE PRECISION FUNCTION DDIFF( X, Y )
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Scalar Arguments ..
+ DOUBLE PRECISION X, Y
+* .. Executable Statements ..
+ DDIFF = X - Y
+ RETURN
+*
+* End of DDIFF.
+*
+ END
+
diff --git a/ctest/cblas_test.h b/ctest/cblas_test.h
index 53cb99f9e..3eeb46ac2 100644
--- a/ctest/cblas_test.h
+++ b/ctest/cblas_test.h
@@ -173,12 +173,14 @@ typedef struct { double real; double imag; } CBLAS_TEST_ZOMPLEX;
#define F77_dtrmm cdtrmm_
#define F77_dtrsm cdtrsm_
#define F77_cgemm ccgemm_
+ #define F77_cgemm3m ccgemm3m_
#define F77_csymm ccsymm_
#define F77_csyrk ccsyrk_
#define F77_csyr2k ccsyr2k_
#define F77_ctrmm cctrmm_
#define F77_ctrsm cctrsm_
#define F77_zgemm czgemm_
+ #define F77_zgemm3m czgemm3m_
#define F77_zsymm czsymm_
#define F77_zsyrk czsyrk_
#define F77_zsyr2k czsyr2k_
@@ -333,12 +335,14 @@ typedef struct { double real; double imag; } CBLAS_TEST_ZOMPLEX;
#define F77_dtrmm CDTRMM
#define F77_dtrsm CDTRSM
#define F77_cgemm CCGEMM
+ #define F77_cgemm3m CCGEMM3M
#define F77_csymm CCSYMM
#define F77_csyrk CCSYRK
#define F77_csyr2k CCSYR2K
#define F77_ctrmm CCTRMM
#define F77_ctrsm CCTRSM
#define F77_zgemm CZGEMM
+ #define F77_zgemm3m CZGEMM3M
#define F77_zsymm CZSYMM
#define F77_zsyrk CZSYRK
#define F77_zsyr2k CZSYR2K
@@ -493,12 +497,14 @@ typedef struct { double real; double imag; } CBLAS_TEST_ZOMPLEX;
#define F77_dtrmm cdtrmm
#define F77_dtrsm cdtrsm
#define F77_cgemm ccgemm
+ #define F77_cgemm3m ccgemm3m
#define F77_csymm ccsymm
#define F77_csyrk ccsyrk
#define F77_csyr2k ccsyr2k
#define F77_ctrmm cctrmm
#define F77_ctrsm cctrsm
#define F77_zgemm czgemm
+ #define F77_zgemm3m czgemm3m
#define F77_zsymm czsymm
#define F77_zsyrk czsyrk
#define F77_zsyr2k czsyr2k
diff --git a/ctest/cin3_3m b/ctest/cin3_3m
new file mode 100644
index 000000000..34014143e
--- /dev/null
+++ b/ctest/cin3_3m
@@ -0,0 +1,22 @@
+'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
+-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+F LOGICAL FLAG, T TO STOP ON FAILURES.
+T LOGICAL FLAG, T TO TEST ERROR EXITS.
+2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
+16.0 THRESHOLD VALUE OF TEST RATIO
+7 NUMBER OF VALUES OF N
+0 1 2 3 5 9 35 VALUES OF N
+3 NUMBER OF VALUES OF ALPHA
+(0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA
+3 NUMBER OF VALUES OF BETA
+(0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA
+cblas_cgemm3m T PUT F FOR NO TEST. SAME COLUMNS.
+cblas_chemm F PUT F FOR NO TEST. SAME COLUMNS.
+cblas_csymm F PUT F FOR NO TEST. SAME COLUMNS.
+cblas_ctrmm F PUT F FOR NO TEST. SAME COLUMNS.
+cblas_ctrsm F PUT F FOR NO TEST. SAME COLUMNS.
+cblas_cherk F PUT F FOR NO TEST. SAME COLUMNS.
+cblas_csyrk F PUT F FOR NO TEST. SAME COLUMNS.
+cblas_cher2k F PUT F FOR NO TEST. SAME COLUMNS.
+cblas_csyr2k F PUT F FOR NO TEST. SAME COLUMNS.
diff --git a/ctest/zin3_3m b/ctest/zin3_3m
new file mode 100644
index 000000000..33bf08353
--- /dev/null
+++ b/ctest/zin3_3m
@@ -0,0 +1,22 @@
+'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
+-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+F LOGICAL FLAG, T TO STOP ON FAILURES.
+T LOGICAL FLAG, T TO TEST ERROR EXITS.
+2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
+16.0 THRESHOLD VALUE OF TEST RATIO
+7 NUMBER OF VALUES OF N
+0 1 2 3 5 9 35 VALUES OF N
+3 NUMBER OF VALUES OF ALPHA
+(0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA
+3 NUMBER OF VALUES OF BETA
+(0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA
+cblas_zgemm3m T PUT F FOR NO TEST. SAME COLUMNS.
+cblas_zhemm F PUT F FOR NO TEST. SAME COLUMNS.
+cblas_zsymm F PUT F FOR NO TEST. SAME COLUMNS.
+cblas_ztrmm F PUT F FOR NO TEST. SAME COLUMNS.
+cblas_ztrsm F PUT F FOR NO TEST. SAME COLUMNS.
+cblas_zherk F PUT F FOR NO TEST. SAME COLUMNS.
+cblas_zsyrk F PUT F FOR NO TEST. SAME COLUMNS.
+cblas_zher2k F PUT F FOR NO TEST. SAME COLUMNS.
+cblas_zsyr2k F PUT F FOR NO TEST. SAME COLUMNS.
diff --git a/driver/level3/Makefile b/driver/level3/Makefile
index d62921e84..352225206 100644
--- a/driver/level3/Makefile
+++ b/driver/level3/Makefile
@@ -4,11 +4,11 @@ include ../../Makefile.system
USE_GEMM3M = 0
ifeq ($(ARCH), x86)
-USE_GEMM3M = 0
+USE_GEMM3M = 1
endif
ifeq ($(ARCH), x86_64)
-USE_GEMM3M = 0
+USE_GEMM3M = 1
endif
ifeq ($(ARCH), ia64)
diff --git a/driver/others/parameter.c b/driver/others/parameter.c
index a0a8b5188..f0f889a15 100644
--- a/driver/others/parameter.c
+++ b/driver/others/parameter.c
@@ -251,7 +251,11 @@ void blas_set_parameter(void){
env_var_t p;
int factor;
+#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL)
+ int size = 16;
+#else
int size = get_L2_size();
+#endif
#if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS)
size >>= 7;
diff --git a/exports/gensymbol b/exports/gensymbol
index 0769ae0f3..bcea83667 100644
--- a/exports/gensymbol
+++ b/exports/gensymbol
@@ -52,7 +52,9 @@
cblas_zhpr, cblas_zscal, cblas_zswap, cblas_zsymm, cblas_zsyr2k, cblas_zsyrk,
cblas_ztbmv, cblas_ztbsv, cblas_ztpmv, cblas_ztpsv, cblas_ztrmm, cblas_ztrmv, cblas_ztrsm,
cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub,
- cblas_saxpby,cblas_daxpby,cblas_caxpby,cblas_zaxpby
+ cblas_saxpby,cblas_daxpby,cblas_caxpby,cblas_zaxpby,
+ cblas_somatcopy, cblas_domatcopy, cblas_comatcopy, cblas_zomatcopy,
+ cblas_simatcopy, cblas_dimatcopy, cblas_cimatcopy, cblas_zimatcopy,
);
@exblasobjs = (
@@ -73,7 +75,7 @@
);
@gemm3mobjs = (
-
+ cgemm3m,zgemm3m
);
diff --git a/interface/Makefile b/interface/Makefile
index 6992248ba..54699b7e3 100644
--- a/interface/Makefile
+++ b/interface/Makefile
@@ -4,11 +4,11 @@ include $(TOPDIR)/Makefile.system
SUPPORT_GEMM3M = 0
ifeq ($(ARCH), x86)
-SUPPORT_GEMM3M = 0
+SUPPORT_GEMM3M = 1
endif
ifeq ($(ARCH), x86_64)
-SUPPORT_GEMM3M = 0
+SUPPORT_GEMM3M = 1
endif
ifeq ($(ARCH), ia64)
@@ -128,9 +128,11 @@ ZBLAS3OBJS = \
ifeq ($(SUPPORT_GEMM3M), 1)
-CBLAS3OBJS += cgemm3m.$(SUFFIX) csymm3m.$(SUFFIX) chemm3m.$(SUFFIX)
+# CBLAS3OBJS += cgemm3m.$(SUFFIX) csymm3m.$(SUFFIX) chemm3m.$(SUFFIX)
+CBLAS3OBJS += cgemm3m.$(SUFFIX)
-ZBLAS3OBJS += zgemm3m.$(SUFFIX) zsymm3m.$(SUFFIX) zhemm3m.$(SUFFIX)
+# ZBLAS3OBJS += zgemm3m.$(SUFFIX) zsymm3m.$(SUFFIX) zhemm3m.$(SUFFIX)
+ZBLAS3OBJS += zgemm3m.$(SUFFIX)
endif
@@ -267,7 +269,7 @@ CSBLAS2OBJS = \
CSBLAS3OBJS = \
cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \
- cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX)
+ cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)
CDBLAS1OBJS = \
cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
@@ -283,7 +285,7 @@ CDBLAS2OBJS = \
CDBLAS3OBJS += \
cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \
- cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX)
+ cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX)
CCBLAS1OBJS = \
cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \
@@ -305,7 +307,9 @@ CCBLAS2OBJS = \
CCBLAS3OBJS = \
cblas_cgemm.$(SUFFIX) cblas_csymm.$(SUFFIX) cblas_ctrmm.$(SUFFIX) cblas_ctrsm.$(SUFFIX) \
cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \
- cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX)
+ cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \
+ cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)
+
CZBLAS1OBJS = \
cblas_izamax.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \
@@ -327,7 +331,19 @@ CZBLAS2OBJS = \
CZBLAS3OBJS = \
cblas_zgemm.$(SUFFIX) cblas_zsymm.$(SUFFIX) cblas_ztrmm.$(SUFFIX) cblas_ztrsm.$(SUFFIX) \
cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \
- cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX)
+ cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX)\
+ cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX)
+
+ifeq ($(SUPPORT_GEMM3M), 1)
+
+# CBLAS3OBJS += cgemm3m.$(SUFFIX) csymm3m.$(SUFFIX) chemm3m.$(SUFFIX)
+CCBLAS3OBJS += cblas_cgemm3m.$(SUFFIX)
+
+# ZBLAS3OBJS += zgemm3m.$(SUFFIX) zsymm3m.$(SUFFIX) zhemm3m.$(SUFFIX)
+CZBLAS3OBJS += cblas_zgemm3m.$(SUFFIX)
+
+endif
+
ifndef NO_CBLAS
@@ -1771,6 +1787,13 @@ cblas_cher2k.$(SUFFIX) cblas_cher2k.$(PSUFFIX) : syr2k.c
cblas_zher2k.$(SUFFIX) cblas_zher2k.$(PSUFFIX) : syr2k.c
$(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F)
+cblas_cgemm3m.$(SUFFIX) cblas_cgemm3m.$(PSUFFIX) : gemm.c
+ $(CC) -DCBLAS -c $(CFLAGS) -DGEMM3M $< -o $(@F)
+
+cblas_zgemm3m.$(SUFFIX) cblas_zgemm3m.$(PSUFFIX) : gemm.c
+ $(CC) -DCBLAS -c $(CFLAGS) -DGEMM3M $< -o $(@F)
+
+
sgetf2.$(SUFFIX) sgetf2.$(PSUFFIX) : lapack/getf2.c
$(CC) -c $(CFLAGS) $< -o $(@F)
@@ -2035,25 +2058,49 @@ cblas_caxpby.$(SUFFIX) cblas_caxpby.$(PSUFFIX) : zaxpby.c
domatcopy.$(SUFFIX) domatcopy.$(PSUFFIX) : omatcopy.c
$(CC) -c $(CFLAGS) $< -o $(@F)
+cblas_domatcopy.$(SUFFIX) cblas_domatcopy.$(PSUFFIX) : omatcopy.c
+ $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
+
somatcopy.$(SUFFIX) somatcopy.$(PSUFFIX) : omatcopy.c
$(CC) -c $(CFLAGS) $< -o $(@F)
+cblas_somatcopy.$(SUFFIX) cblas_somatcopy.$(PSUFFIX) : omatcopy.c
+ $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
+
comatcopy.$(SUFFIX) comatcopy.$(PSUFFIX) : zomatcopy.c
$(CC) -c $(CFLAGS) $< -o $(@F)
+cblas_comatcopy.$(SUFFIX) cblas_comatcopy.$(PSUFFIX) : zomatcopy.c
+ $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
+
zomatcopy.$(SUFFIX) zomatcopy.$(PSUFFIX) : zomatcopy.c
$(CC) -c $(CFLAGS) $< -o $(@F)
+cblas_zomatcopy.$(SUFFIX) cblas_zomatcopy.$(PSUFFIX) : zomatcopy.c
+ $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
+
dimatcopy.$(SUFFIX) dimatcopy.$(PSUFFIX) : imatcopy.c
$(CC) -c $(CFLAGS) $< -o $(@F)
+cblas_dimatcopy.$(SUFFIX) cblas_dimatcopy.$(PSUFFIX) : imatcopy.c
+ $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
+
simatcopy.$(SUFFIX) simatcopy.$(PSUFFIX) : imatcopy.c
$(CC) -c $(CFLAGS) $< -o $(@F)
+cblas_simatcopy.$(SUFFIX) cblas_simatcopy.$(PSUFFIX) : imatcopy.c
+ $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
+
cimatcopy.$(SUFFIX) cimatcopy.$(PSUFFIX) : zimatcopy.c
$(CC) -c $(CFLAGS) $< -o $(@F)
+cblas_cimatcopy.$(SUFFIX) cblas_cimatcopy.$(PSUFFIX) : zimatcopy.c
+ $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
+
zimatcopy.$(SUFFIX) zimatcopy.$(PSUFFIX) : zimatcopy.c
$(CC) -c $(CFLAGS) $< -o $(@F)
+cblas_zimatcopy.$(SUFFIX) cblas_zimatcopy.$(PSUFFIX) : zimatcopy.c
+ $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
+
diff --git a/interface/gemm.c b/interface/gemm.c
index 74908e842..a5a2b4724 100644
--- a/interface/gemm.c
+++ b/interface/gemm.c
@@ -405,49 +405,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
#ifndef COMPLEX
double MNK = (double) args.m * (double) args.n * (double) args.k;
- if ( MNK <= (16.0 * 1024.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
+ if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
nthreads_max = 1;
- else
- {
- if ( MNK <= (2.0 * 65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
- {
- nthreads_max = 4;
- if ( args.m < 16 * GEMM_MULTITHREAD_THRESHOLD )
- {
- nthreads_max = 2;
- if ( args.m < 3 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
- if ( args.n < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
- if ( args.k < 3 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
- }
- else
- {
- if ( args.n <= 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 2;
- }
- }
- }
#else
double MNK = (double) args.m * (double) args.n * (double) args.k;
- if ( MNK <= (256.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
+ if ( MNK <= (8192.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
nthreads_max = 1;
- else
- {
- if ( MNK <= (16384.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
- {
- nthreads_max = 4;
- if ( args.m < 3 * GEMM_MULTITHREAD_THRESHOLD )
- {
- nthreads_max = 2;
- if ( args.m <= 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
- if ( args.n < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
- if ( args.k < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
- }
- else
- {
- if ( args.n < 2 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 2;
- }
- }
- }
-
#endif
args.common = NULL;
diff --git a/interface/gemv.c b/interface/gemv.c
index 08553ad21..2dd82dce5 100644
--- a/interface/gemv.c
+++ b/interface/gemv.c
@@ -216,7 +216,7 @@ void CNAME(enum CBLAS_ORDER order,
int nthreads_avail = nthreads_max;
double MNK = (double) m * (double) n;
- if ( MNK <= (500.0 * 100.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
+ if ( MNK <= (24.0 * 24.0 * (double) (GEMM_MULTITHREAD_THRESHOLD*GEMM_MULTITHREAD_THRESHOLD) ) )
nthreads_max = 1;
if ( nthreads_max > nthreads_avail )
diff --git a/interface/imatcopy.c b/interface/imatcopy.c
index 3bc886f4f..89f0ec823 100644
--- a/interface/imatcopy.c
+++ b/interface/imatcopy.c
@@ -50,6 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#undef malloc
#undef free
+#ifndef CBLAS
void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, blasint *ldb)
{
@@ -71,6 +72,28 @@ void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha,
if ( Trans == 'R' ) trans = BlasNoTrans;
if ( Trans == 'T' ) trans = BlasTrans;
if ( Trans == 'C' ) trans = BlasTrans;
+#else
+void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, FLOAT calpha, FLOAT *a, blasint clda, blasint cldb)
+{
+ char Order, Trans;
+ int order=-1,trans=-1;
+ blasint info = -1;
+ FLOAT *b;
+ size_t msize;
+ blasint *lda, *ldb, *rows, *cols;
+ FLOAT *alpha;
+
+ if ( CORDER == CblasColMajor) order = BlasColMajor;
+ if ( CORDER == CblasRowMajor) order = BlasRowMajor;
+ if ( CTRANS == CblasNoTrans || CTRANS == CblasConjNoTrans) trans = BlasNoTrans;
+ if ( CTRANS == CblasTrans || CTRANS == CblasConjTrans ) trans = BlasTrans;
+
+ rows = &crows;
+ cols = &ccols;
+ alpha = &calpha;
+ lda = &clda;
+ ldb = &cldb;
+#endif
if ( order == BlasColMajor)
{
diff --git a/interface/omatcopy.c b/interface/omatcopy.c
index 0c418b3c9..59650cfa0 100644
--- a/interface/omatcopy.c
+++ b/interface/omatcopy.c
@@ -47,6 +47,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define BlasNoTrans 0
#define BlasTrans 1
+#ifndef CBLAS
void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, FLOAT *b, blasint *ldb)
{
@@ -66,7 +67,27 @@ void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha,
if ( Trans == 'R' ) trans = BlasNoTrans;
if ( Trans == 'T' ) trans = BlasTrans;
if ( Trans == 'C' ) trans = BlasTrans;
+#else
+void CNAME(enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, FLOAT calpha, FLOAT *a, blasint clda, FLOAT *b, blasint cldb)
+{
+ blasint *rows, *cols, *lda, *ldb;
+ FLOAT *alpha;
+ int order=-1,trans=-1;
+ blasint info = -1;
+
+ if ( CORDER == CblasColMajor ) order = BlasColMajor;
+ if ( CORDER == CblasRowMajor ) order = BlasRowMajor;
+ if ( CTRANS == CblasNoTrans || CTRANS == CblasConjNoTrans ) trans = BlasNoTrans;
+ if ( CTRANS == CblasTrans || CTRANS == CblasConjTrans ) trans = BlasTrans;
+
+ rows = &crows;
+ cols = &ccols;
+ lda = &clda;
+ ldb = &cldb;
+ alpha = &calpha;
+
+#endif
if ( order == BlasColMajor)
{
if ( trans == BlasNoTrans && *ldb < *rows ) info = 9;
diff --git a/interface/zgemv.c b/interface/zgemv.c
index 50513a8e4..704034aaf 100644
--- a/interface/zgemv.c
+++ b/interface/zgemv.c
@@ -238,7 +238,7 @@ void CNAME(enum CBLAS_ORDER order,
int nthreads_avail = nthreads_max;
double MNK = (double) m * (double) n;
- if ( MNK <= (80.0 * 20.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
+ if ( MNK <= ( 256.0 * (double) (GEMM_MULTITHREAD_THRESHOLD * GEMM_MULTITHREAD_THRESHOLD) ))
nthreads_max = 1;
if ( nthreads_max > nthreads_avail )
diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c
index 79af6d760..3f273cf13 100644
--- a/interface/zimatcopy.c
+++ b/interface/zimatcopy.c
@@ -49,6 +49,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define BlasTransConj 2
#define BlasConj 3
+
+#ifndef CBLAS
void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, blasint *ldb)
{
@@ -71,6 +73,30 @@ void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha,
if ( Trans == 'C' ) trans = BlasTransConj;
if ( Trans == 'R' ) trans = BlasConj;
+#else
+void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, FLOAT *alpha, FLOAT *a, blasint clda, blasint cldb)
+{
+
+ blasint *rows, *cols, *lda, *ldb;
+ int order=-1,trans=-1;
+ blasint info = -1;
+ FLOAT *b;
+ size_t msize;
+
+ if ( CORDER == CblasColMajor ) order = BlasColMajor;
+ if ( CORDER == CblasRowMajor ) order = BlasRowMajor;
+
+ if ( CTRANS == CblasNoTrans) trans = BlasNoTrans;
+ if ( CTRANS == CblasConjNoTrans ) trans = BlasConj;
+ if ( CTRANS == CblasTrans) trans = BlasTrans;
+ if ( CTRANS == CblasConjTrans) trans = BlasTransConj;
+
+ rows = &crows;
+ cols = &ccols;
+ lda = &clda;
+ ldb = &cldb;
+#endif
+
if ( order == BlasColMajor)
{
if ( trans == BlasNoTrans && *ldb < *rows ) info = 9;
diff --git a/interface/zomatcopy.c b/interface/zomatcopy.c
index eec4d3c1c..7345633a2 100644
--- a/interface/zomatcopy.c
+++ b/interface/zomatcopy.c
@@ -49,6 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define BlasTransConj 2
#define BlasConj 3
+#ifndef CBLAS
void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, FLOAT *b, blasint *ldb)
{
@@ -69,6 +70,26 @@ void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha,
if ( Trans == 'C' ) trans = BlasTransConj;
if ( Trans == 'R' ) trans = BlasConj;
+#else
+void CNAME(enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, FLOAT *alpha, FLOAT *a, blasint clda, FLOAT*b, blasint cldb)
+{
+ blasint *rows, *cols, *lda, *ldb;
+ int order=-1,trans=-1;
+ blasint info = -1;
+
+ if ( CORDER == CblasColMajor ) order = BlasColMajor;
+ if ( CORDER == CblasRowMajor ) order = BlasRowMajor;
+
+ if ( CTRANS == CblasNoTrans) trans = BlasNoTrans;
+ if ( CTRANS == CblasConjNoTrans ) trans = BlasConj;
+ if ( CTRANS == CblasTrans) trans = BlasTrans;
+ if ( CTRANS == CblasConjTrans) trans = BlasTransConj;
+
+ rows = &crows;
+ cols = &ccols;
+ lda = &clda;
+ ldb = &cldb;
+#endif
if ( order == BlasColMajor)
{
if ( trans == BlasNoTrans && *ldb < *rows ) info = 9;
diff --git a/kernel/arm/symv_L.c b/kernel/arm/symv_L.c
new file mode 100644
index 000000000..8f48d03f5
--- /dev/null
+++ b/kernel/arm/symv_L.c
@@ -0,0 +1,70 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+ BLASLONG i;
+ BLASLONG ix,iy;
+ BLASLONG jx,jy;
+ BLASLONG j;
+ FLOAT temp1;
+ FLOAT temp2;
+
+#if 0
+ if ( m != offset )
+ printf("Symv_L: m=%d offset=%d\n",m,offset);
+#endif
+
+ jx = 0;
+ jy = 0;
+
+ for (j=0; j<offset; j++)
+ {
+ temp1 = alpha * x[jx];
+ temp2 = 0.0;
+ y[jy] += temp1 * a[j*lda+j];
+ iy = jy;
+ ix = jx;
+ for (i=j+1; i<m; i++)
+ {
+ ix += inc_x;
+ iy += inc_y;
+ y[iy] += temp1 * a[j*lda+i];
+ temp2 += a[j*lda+i] * x[ix];
+
+ }
+ y[jy] += alpha * temp2;
+ jx += inc_x;
+ jy += inc_y;
+ }
+ return(0);
+}
+
+
diff --git a/kernel/arm/symv_U.c b/kernel/arm/symv_U.c
new file mode 100644
index 000000000..b5a0c96e9
--- /dev/null
+++ b/kernel/arm/symv_U.c
@@ -0,0 +1,71 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+ BLASLONG i;
+ BLASLONG ix,iy;
+ BLASLONG jx,jy;
+ BLASLONG j;
+ FLOAT temp1;
+ FLOAT temp2;
+
+#if 0
+ if( m != offset )
+ printf("Symv_U: m=%d offset=%d\n",m,offset);
+#endif
+
+ BLASLONG m1 = m - offset;
+
+ jx = m1 * inc_x;
+ jy = m1 * inc_y;
+
+ for (j=m1; j<m; j++)
+ {
+ temp1 = alpha * x[jx];
+ temp2 = 0.0;
+ iy = 0;
+ ix = 0;
+ for (i=0; i<j; i++)
+ {
+ y[iy] += temp1 * a[j*lda+i];
+ temp2 += a[j*lda+i] * x[ix];
+ ix += inc_x;
+ iy += inc_y;
+
+ }
+ y[jy] += temp1 * a[j*lda+j] + alpha * temp2;
+ jx += inc_x;
+ jy += inc_y;
+ }
+ return(0);
+}
+
+
diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c
index b1beeae5c..0d7bbd4ac 100644
--- a/kernel/setparam-ref.c
+++ b/kernel/setparam-ref.c
@@ -293,6 +293,14 @@ gotoblas_t TABLE_NAME = {
#endif
chemm_outcopyTS, chemm_oltcopyTS,
+ 0, 0, 0,
+#ifdef CGEMM3M_DEFAULT_UNROLL_M
+ CGEMM3M_DEFAULT_UNROLL_M, CGEMM3M_DEFAULT_UNROLL_N, MAX(CGEMM3M_DEFAULT_UNROLL_M, CGEMM3M_DEFAULT_UNROLL_N),
+#else
+ SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N, MAX(SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N),
+#endif
+
+
cgemm3m_kernelTS,
cgemm3m_incopybTS, cgemm3m_incopyrTS,
@@ -391,6 +399,14 @@ gotoblas_t TABLE_NAME = {
#endif
zhemm_outcopyTS, zhemm_oltcopyTS,
+ 0, 0, 0,
+#ifdef ZGEMM3M_DEFAULT_UNROLL_M
+ ZGEMM3M_DEFAULT_UNROLL_M, ZGEMM3M_DEFAULT_UNROLL_N, MAX(ZGEMM3M_DEFAULT_UNROLL_M, ZGEMM3M_DEFAULT_UNROLL_N),
+#else
+ DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N, MAX(DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N),
+#endif
+
+
zgemm3m_kernelTS,
zgemm3m_incopybTS, zgemm3m_incopyrTS,
@@ -486,6 +502,9 @@ gotoblas_t TABLE_NAME = {
#endif
xhemm_outcopyTS, xhemm_oltcopyTS,
+ 0, 0, 0,
+ QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N, MAX(QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N),
+
xgemm3m_kernelTS,
xgemm3m_incopybTS, xgemm3m_incopyrTS,
@@ -661,9 +680,23 @@ static void init_parameter(void) {
TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q;
TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q;
TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q;
+
+#ifdef CGEMM3M_DEFAULT_Q
+ TABLE_NAME.cgemm3m_q = CGEMM3M_DEFAULT_Q;
+#else
+ TABLE_NAME.cgemm3m_q = SGEMM_DEFAULT_Q;
+#endif
+
+#ifdef ZGEMM3M_DEFAULT_Q
+ TABLE_NAME.zgemm3m_q = ZGEMM3M_DEFAULT_Q;
+#else
+ TABLE_NAME.zgemm3m_q = DGEMM_DEFAULT_Q;
+#endif
+
#ifdef EXPRECISION
TABLE_NAME.qgemm_q = QGEMM_DEFAULT_Q;
TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q;
+ TABLE_NAME.xgemm3m_q = QGEMM_DEFAULT_Q;
#endif
#if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH) || defined(CORE_ATHLON)
@@ -918,20 +951,56 @@ static void init_parameter(void) {
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
+
+
+
#ifdef EXPRECISION
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
#endif
+
+#endif
+
+
+#ifdef CGEMM3M_DEFAULT_P
+ TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P;
+#else
+ TABLE_NAME.cgemm3m_p = TABLE_NAME.sgemm_p;
+#endif
+
+#ifdef ZGEMM3M_DEFAULT_P
+ TABLE_NAME.zgemm3m_p = ZGEMM3M_DEFAULT_P;
+#else
+ TABLE_NAME.zgemm3m_p = TABLE_NAME.dgemm_p;
+#endif
+
+#ifdef EXPRECISION
+ TABLE_NAME.xgemm3m_p = TABLE_NAME.qgemm_p;
#endif
+
TABLE_NAME.sgemm_p = (TABLE_NAME.sgemm_p + SGEMM_DEFAULT_UNROLL_M - 1) & ~(SGEMM_DEFAULT_UNROLL_M - 1);
TABLE_NAME.dgemm_p = (TABLE_NAME.dgemm_p + DGEMM_DEFAULT_UNROLL_M - 1) & ~(DGEMM_DEFAULT_UNROLL_M - 1);
TABLE_NAME.cgemm_p = (TABLE_NAME.cgemm_p + CGEMM_DEFAULT_UNROLL_M - 1) & ~(CGEMM_DEFAULT_UNROLL_M - 1);
TABLE_NAME.zgemm_p = (TABLE_NAME.zgemm_p + ZGEMM_DEFAULT_UNROLL_M - 1) & ~(ZGEMM_DEFAULT_UNROLL_M - 1);
+
+#ifdef CGEMM3M_DEFAULT_UNROLL_M
+ TABLE_NAME.cgemm3m_p = (TABLE_NAME.cgemm3m_p + CGEMM3M_DEFAULT_UNROLL_M - 1) & ~(CGEMM3M_DEFAULT_UNROLL_M - 1);
+#else
+ TABLE_NAME.cgemm3m_p = (TABLE_NAME.cgemm3m_p + SGEMM_DEFAULT_UNROLL_M - 1) & ~(SGEMM_DEFAULT_UNROLL_M - 1);
+#endif
+
+#ifdef ZGEMM3M_DEFAULT_UNROLL_M
+ TABLE_NAME.zgemm3m_p = (TABLE_NAME.zgemm3m_p + ZGEMM3M_DEFAULT_UNROLL_M - 1) & ~(ZGEMM3M_DEFAULT_UNROLL_M - 1);
+#else
+ TABLE_NAME.zgemm3m_p = (TABLE_NAME.zgemm3m_p + DGEMM_DEFAULT_UNROLL_M - 1) & ~(DGEMM_DEFAULT_UNROLL_M - 1);
+#endif
+
#ifdef QUAD_PRECISION
TABLE_NAME.qgemm_p = (TABLE_NAME.qgemm_p + QGEMM_DEFAULT_UNROLL_M - 1) & ~(QGEMM_DEFAULT_UNROLL_M - 1);
TABLE_NAME.xgemm_p = (TABLE_NAME.xgemm_p + XGEMM_DEFAULT_UNROLL_M - 1) & ~(XGEMM_DEFAULT_UNROLL_M - 1);
+ TABLE_NAME.xgemm3m_p = (TABLE_NAME.xgemm3m_p + QGEMM_DEFAULT_UNROLL_M - 1) & ~(QGEMM_DEFAULT_UNROLL_M - 1);
#endif
#ifdef DEBUG
@@ -965,11 +1034,32 @@ static void init_parameter(void) {
+ TABLE_NAME.align) & ~TABLE_NAME.align)
) / (TABLE_NAME.zgemm_q * 16) - 15) & ~15);
+ TABLE_NAME.cgemm3m_r = (((BUFFER_SIZE -
+ ((TABLE_NAME.cgemm3m_p * TABLE_NAME.cgemm3m_q * 8 + TABLE_NAME.offsetA
+ + TABLE_NAME.align) & ~TABLE_NAME.align)
+ ) / (TABLE_NAME.cgemm3m_q * 8) - 15) & ~15);
+
+ TABLE_NAME.zgemm3m_r = (((BUFFER_SIZE -
+ ((TABLE_NAME.zgemm3m_p * TABLE_NAME.zgemm3m_q * 16 + TABLE_NAME.offsetA
+ + TABLE_NAME.align) & ~TABLE_NAME.align)
+ ) / (TABLE_NAME.zgemm3m_q * 16) - 15) & ~15);
+
+
+
+
#ifdef EXPRECISION
TABLE_NAME.xgemm_r = (((BUFFER_SIZE -
((TABLE_NAME.xgemm_p * TABLE_NAME.xgemm_q * 32 + TABLE_NAME.offsetA
+ TABLE_NAME.align) & ~TABLE_NAME.align)
) / (TABLE_NAME.xgemm_q * 32) - 15) & ~15);
+
+ TABLE_NAME.xgemm3m_r = (((BUFFER_SIZE -
+ ((TABLE_NAME.xgemm3m_p * TABLE_NAME.xgemm3m_q * 32 + TABLE_NAME.offsetA
+ + TABLE_NAME.align) & ~TABLE_NAME.align)
+ ) / (TABLE_NAME.xgemm3m_q * 32) - 15) & ~15);
+
#endif
+
+
}
diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER
index 19bf7fd32..289529772 100644
--- a/kernel/x86_64/KERNEL.BULLDOZER
+++ b/kernel/x86_64/KERNEL.BULLDOZER
@@ -1,8 +1,20 @@
-SGEMVNKERNEL = sgemv_n.c
-SGEMVTKERNEL = sgemv_t.c
+DAXPYKERNEL = daxpy.c
+CAXPYKERNEL = caxpy.c
+ZAXPYKERNEL = zaxpy.c
+
+SDOTKERNEL = sdot.c
+DDOTKERNEL = ddot.c
+
+DSYMV_U_KERNEL = dsymv_U.c
+DSYMV_L_KERNEL = dsymv_L.c
+SSYMV_U_KERNEL = ssymv_U.c
+SSYMV_L_KERNEL = ssymv_L.c
+
+SGEMVNKERNEL = sgemv_n_4.c
+SGEMVTKERNEL = sgemv_t_4.c
ZGEMVNKERNEL = zgemv_n_dup.S
-ZGEMVTKERNEL = zgemv_t.c
+ZGEMVTKERNEL = zgemv_t_4.c
DGEMVNKERNEL = dgemv_n_bulldozer.S
DGEMVTKERNEL = dgemv_t_bulldozer.S
diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL
index d0ac9c72f..a621b4484 100644
--- a/kernel/x86_64/KERNEL.HASWELL
+++ b/kernel/x86_64/KERNEL.HASWELL
@@ -1,14 +1,14 @@
-SGEMVNKERNEL = sgemv_n.c
-SGEMVTKERNEL = sgemv_t.c
+SGEMVNKERNEL = sgemv_n_4.c
+SGEMVTKERNEL = sgemv_t_4.c
-DGEMVNKERNEL = dgemv_n.c
-DGEMVTKERNEL = dgemv_t.c
+DGEMVNKERNEL = dgemv_n_4.c
+DGEMVTKERNEL = dgemv_t_4.c
-ZGEMVNKERNEL = zgemv_n.c
-ZGEMVTKERNEL = zgemv_t.c
+ZGEMVNKERNEL = zgemv_n_4.c
+ZGEMVTKERNEL = zgemv_t_4.c
-CGEMVNKERNEL = cgemv_n.c
-CGEMVTKERNEL = cgemv_t.c
+CGEMVNKERNEL = cgemv_n_4.c
+CGEMVTKERNEL = cgemv_t_4.c
SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM
index ca9ff252d..8feef5c31 100644
--- a/kernel/x86_64/KERNEL.NEHALEM
+++ b/kernel/x86_64/KERNEL.NEHALEM
@@ -1,5 +1,17 @@
-SGEMVNKERNEL = sgemv_n.c
-SGEMVTKERNEL = sgemv_t.c
+SAXPYKERNEL = saxpy.c
+DAXPYKERNEL = daxpy.c
+
+SDOTKERNEL = sdot.c
+DDOTKERNEL = ddot.c
+
+DSYMV_U_KERNEL = dsymv_U.c
+DSYMV_L_KERNEL = dsymv_L.c
+SSYMV_U_KERNEL = ssymv_U.c
+SSYMV_L_KERNEL = ssymv_L.c
+
+SGEMVNKERNEL = sgemv_n_4.c
+SGEMVTKERNEL = sgemv_t_4.c
+DGEMVNKERNEL = dgemv_n_4.c
SGEMMKERNEL = gemm_kernel_4x8_nehalem.S
SGEMMINCOPY = gemm_ncopy_4.S
diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER
index 146a8768b..55285e3d3 100644
--- a/kernel/x86_64/KERNEL.PILEDRIVER
+++ b/kernel/x86_64/KERNEL.PILEDRIVER
@@ -1,11 +1,12 @@
-SGEMVNKERNEL = sgemv_n.c
-SGEMVTKERNEL = sgemv_t.c
+SGEMVNKERNEL = sgemv_n_4.c
+SGEMVTKERNEL = sgemv_t_4.c
ZGEMVNKERNEL = zgemv_n_dup.S
-ZGEMVTKERNEL = zgemv_t.S
+ZGEMVTKERNEL = zgemv_t_4.c
DGEMVNKERNEL = dgemv_n_bulldozer.S
DGEMVTKERNEL = dgemv_t_bulldozer.S
+
DDOTKERNEL = ddot_bulldozer.S
DCOPYKERNEL = dcopy_bulldozer.S
diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE
index b654d3564..61e13a116 100644
--- a/kernel/x86_64/KERNEL.SANDYBRIDGE
+++ b/kernel/x86_64/KERNEL.SANDYBRIDGE
@@ -1,7 +1,7 @@
-SGEMVNKERNEL = sgemv_n.c
-SGEMVTKERNEL = sgemv_t.c
+SGEMVNKERNEL = sgemv_n_4.c
+SGEMVTKERNEL = sgemv_t_4.c
-ZGEMVNKERNEL = zgemv_n.c
+ZGEMVNKERNEL = zgemv_n_4.c
SGEMMKERNEL = sgemm_kernel_16x4_sandy.S
diff --git a/kernel/x86_64/caxpy.c b/kernel/x86_64/caxpy.c
new file mode 100644
index 000000000..fa8924ae9
--- /dev/null
+++ b/kernel/x86_64/caxpy.c
@@ -0,0 +1,131 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+
+#if defined(BULLDOZER)
+#include "caxpy_microk_bulldozer-2.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_8
+
+static void caxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+ BLASLONG register i = 0;
+ BLASLONG register ix = 0;
+ FLOAT da_r = alpha[0];
+ FLOAT da_i = alpha[1];
+
+
+ while(i < n)
+ {
+#if !defined(CONJ)
+ y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ;
+ y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ;
+ y[ix+2] += ( da_r * x[ix+2] - da_i * x[ix+3] ) ;
+ y[ix+3] += ( da_r * x[ix+3] + da_i * x[ix+2] ) ;
+#else
+ y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ;
+ y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ;
+ y[ix+2] += ( da_r * x[ix+2] + da_i * x[ix+3] ) ;
+ y[ix+3] -= ( da_r * x[ix+3] - da_i * x[ix+2] ) ;
+#endif
+
+ ix+=4 ;
+ i+=2 ;
+
+ }
+
+}
+
+#endif
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+ BLASLONG i=0;
+ BLASLONG ix=0,iy=0;
+ FLOAT da[2];
+
+ if ( n <= 0 ) return(0);
+
+ if ( (inc_x == 1) && (inc_y == 1) )
+ {
+
+ int n1 = n & -8;
+
+ if ( n1 )
+ {
+ da[0] = da_r;
+ da[1] = da_i;
+ caxpy_kernel_8(n1, x, y , &da );
+ ix = 2 * n1;
+ }
+ i = n1;
+ while(i < n)
+ {
+#if !defined(CONJ)
+ y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ;
+ y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ;
+#else
+ y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ;
+ y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ;
+#endif
+ i++ ;
+ ix += 2;
+
+ }
+ return(0);
+
+
+ }
+
+ inc_x *=2;
+ inc_y *=2;
+
+ while(i < n)
+ {
+
+#if !defined(CONJ)
+ y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ;
+ y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ;
+#else
+ y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ;
+ y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ;
+#endif
+ ix += inc_x ;
+ iy += inc_y ;
+ i++ ;
+
+ }
+ return(0);
+
+}
+
+
diff --git a/kernel/x86_64/caxpy_microk_bulldozer-2.c b/kernel/x86_64/caxpy_microk_bulldozer-2.c
new file mode 100644
index 000000000..86407028c
--- /dev/null
+++ b/kernel/x86_64/caxpy_microk_bulldozer-2.c
@@ -0,0 +1,135 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_8 1
+static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
+
+static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vbroadcastss (%4), %%xmm0 \n\t" // real part of alpha
+ "vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+
+ "prefetcht0 768(%2,%0,4) \n\t"
+ "vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x
+ "vmovups 16(%2,%0,4), %%xmm7 \n\t" // 2 complex values from x
+ "vmovups 32(%2,%0,4), %%xmm9 \n\t" // 2 complex values from x
+ "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 complex values from x
+ "prefetcht0 768(%3,%0,4) \n\t"
+
+#if !defined(CONJ)
+ "vfmaddps (%3,%0,4), %%xmm0 , %%xmm5, %%xmm12 \n\t"
+ "vpermilps $0xb1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part
+ "vmulps %%xmm1, %%xmm4 , %%xmm4 \n\t"
+
+ "vfmaddps 16(%3,%0,4), %%xmm0 , %%xmm7, %%xmm13 \n\t"
+ "vpermilps $0xb1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part
+ "vmulps %%xmm1, %%xmm6 , %%xmm6 \n\t"
+
+ "vfmaddps 32(%3,%0,4), %%xmm0 , %%xmm9, %%xmm14 \n\t"
+ "vpermilps $0xb1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part
+ "vmulps %%xmm1, %%xmm8 , %%xmm8 \n\t"
+
+ "vfmaddps 48(%3,%0,4), %%xmm0 , %%xmm11,%%xmm15 \n\t"
+ "vpermilps $0xb1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part
+ "vmulps %%xmm1, %%xmm10, %%xmm10 \n\t"
+
+ "vaddsubps %%xmm4, %%xmm12, %%xmm12 \n\t"
+ "vaddsubps %%xmm6, %%xmm13, %%xmm13 \n\t"
+ "vaddsubps %%xmm8, %%xmm14, %%xmm14 \n\t"
+ "vaddsubps %%xmm10,%%xmm15, %%xmm15 \n\t"
+
+#else
+
+ "vmulps %%xmm0, %%xmm5, %%xmm4 \n\t" // a_r*x_r, a_r*x_i
+ "vmulps %%xmm1, %%xmm5, %%xmm5 \n\t" // a_i*x_r, a_i*x_i
+ "vmulps %%xmm0, %%xmm7, %%xmm6 \n\t" // a_r*x_r, a_r*x_i
+ "vmulps %%xmm1, %%xmm7, %%xmm7 \n\t" // a_i*x_r, a_i*x_i
+ "vmulps %%xmm0, %%xmm9, %%xmm8 \n\t" // a_r*x_r, a_r*x_i
+ "vmulps %%xmm1, %%xmm9, %%xmm9 \n\t" // a_i*x_r, a_i*x_i
+ "vmulps %%xmm0, %%xmm11, %%xmm10 \n\t" // a_r*x_r, a_r*x_i
+ "vmulps %%xmm1, %%xmm11, %%xmm11 \n\t" // a_i*x_r, a_i*x_i
+
+ "vpermilps $0xb1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part
+ "vaddsubps %%xmm4 ,%%xmm5 , %%xmm4 \n\t"
+ "vpermilps $0xb1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part
+
+ "vpermilps $0xb1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part
+ "vaddsubps %%xmm6 ,%%xmm7 , %%xmm6 \n\t"
+ "vpermilps $0xb1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part
+
+ "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part
+ "vaddsubps %%xmm8 ,%%xmm9 , %%xmm8 \n\t"
+ "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part
+
+ "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part
+ "vaddsubps %%xmm10,%%xmm11, %%xmm10 \n\t"
+ "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part
+
+ "vaddps (%3,%0,4) ,%%xmm4 , %%xmm12 \n\t"
+ "vaddps 16(%3,%0,4) ,%%xmm6 , %%xmm13 \n\t"
+ "vaddps 32(%3,%0,4) ,%%xmm8 , %%xmm14 \n\t"
+ "vaddps 48(%3,%0,4) ,%%xmm10, %%xmm15 \n\t"
+
+
+#endif
+
+ "vmovups %%xmm12, (%3,%0,4) \n\t"
+ "vmovups %%xmm13, 16(%3,%0,4) \n\t"
+ "vmovups %%xmm14, 32(%3,%0,4) \n\t"
+ "vmovups %%xmm15, 48(%3,%0,4) \n\t"
+
+ "addq $16, %0 \n\t"
+ "subq $8 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (alpha) // 4
+ : "cc",
+ "%xmm0", "%xmm1",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
diff --git a/kernel/x86_64/cgemm_kernel_8x2_haswell.S b/kernel/x86_64/cgemm_kernel_8x2_haswell.S
index 98f40054e..a608071db 100644
--- a/kernel/x86_64/cgemm_kernel_8x2_haswell.S
+++ b/kernel/x86_64/cgemm_kernel_8x2_haswell.S
@@ -227,8 +227,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
VFMADDPS_I( %ymm7 ,%ymm3,%ymm1 )
- addq $6*SIZE, BO
- addq $16*SIZE, AO
+ addq $ 6*SIZE, BO
+ addq $ 16*SIZE, AO
decq %rax
.endm
@@ -356,8 +356,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
VFMADDPS_R( %ymm4 ,%ymm2,%ymm0 )
VFMADDPS_I( %ymm5 ,%ymm3,%ymm0 )
- addq $6*SIZE, BO
- addq $8*SIZE, AO
+ addq $ 6*SIZE, BO
+ addq $ 8*SIZE, AO
decq %rax
.endm
@@ -447,8 +447,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
VFMADDPS_R( %xmm4 ,%xmm2,%xmm0 )
VFMADDPS_I( %xmm5 ,%xmm3,%xmm0 )
- addq $6*SIZE, BO
- addq $4*SIZE, AO
+ addq $ 6*SIZE, BO
+ addq $ 4*SIZE, AO
decq %rax
.endm
@@ -540,8 +540,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
VFMADDPS_R( %xmm4 ,%xmm2,%xmm0 )
VFMADDPS_I( %xmm5 ,%xmm3,%xmm0 )
- addq $6*SIZE, BO
- addq $2*SIZE, AO
+ addq $ 6*SIZE, BO
+ addq $ 2*SIZE, AO
decq %rax
.endm
diff --git a/kernel/x86_64/cgemv_n.c b/kernel/x86_64/cgemv_n.c
deleted file mode 100644
index 47ef0d447..000000000
--- a/kernel/x86_64/cgemv_n.c
+++ /dev/null
@@ -1,255 +0,0 @@
-/***************************************************************************
-Copyright (c) 2014, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-#include <stdlib.h>
-#include <stdio.h>
-#include "common.h"
-
-#if defined(HASWELL)
-#include "cgemv_n_microk_haswell-2.c"
-#endif
-
-
-#define NBMAX 2048
-
-#ifndef HAVE_KERNEL_16x4
-
-static void cgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
-{
- BLASLONG i;
- FLOAT *a0,*a1,*a2,*a3;
- a0 = ap[0];
- a1 = ap[1];
- a2 = ap[2];
- a3 = ap[3];
-
- for ( i=0; i< 2*n; i+=2 )
- {
-#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
- y[i] += a0[i]*x[0] - a0[i+1] * x[1];
- y[i+1] += a0[i]*x[1] + a0[i+1] * x[0];
- y[i] += a1[i]*x[2] - a1[i+1] * x[3];
- y[i+1] += a1[i]*x[3] + a1[i+1] * x[2];
- y[i] += a2[i]*x[4] - a2[i+1] * x[5];
- y[i+1] += a2[i]*x[5] + a2[i+1] * x[4];
- y[i] += a3[i]*x[6] - a3[i+1] * x[7];
- y[i+1] += a3[i]*x[7] + a3[i+1] * x[6];
-#else
- y[i] += a0[i]*x[0] + a0[i+1] * x[1];
- y[i+1] += a0[i]*x[1] - a0[i+1] * x[0];
- y[i] += a1[i]*x[2] + a1[i+1] * x[3];
- y[i+1] += a1[i]*x[3] - a1[i+1] * x[2];
- y[i] += a2[i]*x[4] + a2[i+1] * x[5];
- y[i+1] += a2[i]*x[5] - a2[i+1] * x[4];
- y[i] += a3[i]*x[6] + a3[i+1] * x[7];
- y[i+1] += a3[i]*x[7] - a3[i+1] * x[6];
-#endif
- }
-}
-
-#endif
-
-static void cgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
-{
- BLASLONG i;
- FLOAT *a0;
- a0 = ap;
-
- for ( i=0; i< 2*n; i+=2 )
- {
-#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
- y[i] += a0[i]*x[0] - a0[i+1] * x[1];
- y[i+1] += a0[i]*x[1] + a0[i+1] * x[0];
-#else
- y[i] += a0[i]*x[0] + a0[i+1] * x[1];
- y[i+1] += a0[i]*x[1] - a0[i+1] * x[0];
-#endif
-
- }
-}
-
-
-static void zero_y(BLASLONG n, FLOAT *dest)
-{
- BLASLONG i;
- for ( i=0; i<2*n; i++ )
- {
- *dest = 0.0;
- dest++;
- }
-}
-
-
-
-static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i)
-{
- BLASLONG i;
- FLOAT temp_r;
- FLOAT temp_i;
- for ( i=0; i<n; i++ )
- {
-#if !defined(XCONJ)
- temp_r = alpha_r * src[0] - alpha_i * src[1];
- temp_i = alpha_r * src[1] + alpha_i * src[0];
-#else
- temp_r = alpha_r * src[0] + alpha_i * src[1];
- temp_i = -alpha_r * src[1] + alpha_i * src[0];
-#endif
-
- *dest += temp_r;
- *(dest+1) += temp_i;
-
- src+=2;
- dest += inc_dest;
- }
-}
-
-int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
-{
- BLASLONG i;
- BLASLONG j;
- FLOAT *a_ptr;
- FLOAT *x_ptr;
- FLOAT *y_ptr;
- FLOAT *ap[4];
- BLASLONG n1;
- BLASLONG m1;
- BLASLONG m2;
- BLASLONG n2;
- FLOAT xbuffer[8],*ybuffer;
-
-
-#if 0
-printf("%s %d %d %.16f %.16f %d %d %d\n","zgemv_n",m,n,alpha_r,alpha_i,lda,inc_x,inc_y);
-#endif
-
- if ( m < 1 ) return(0);
- if ( n < 1 ) return(0);
-
- ybuffer = buffer;
-
- inc_x *= 2;
- inc_y *= 2;
- lda *= 2;
-
- n1 = n / 4 ;
- n2 = n % 4 ;
-
- m1 = m - ( m % 16 );
- m2 = (m % NBMAX) - (m % 16) ;
-
- y_ptr = y;
-
- BLASLONG NB = NBMAX;
-
- while ( NB == NBMAX )
- {
-
- m1 -= NB;
- if ( m1 < 0)
- {
- if ( m2 == 0 ) break;
- NB = m2;
- }
-
- a_ptr = a;
- x_ptr = x;
- zero_y(NB,ybuffer);
- for( i = 0; i < n1 ; i++)
- {
-
- xbuffer[0] = x_ptr[0];
- xbuffer[1] = x_ptr[1];
- x_ptr += inc_x;
- xbuffer[2] = x_ptr[0];
- xbuffer[3] = x_ptr[1];
- x_ptr += inc_x;
- xbuffer[4] = x_ptr[0];
- xbuffer[5] = x_ptr[1];
- x_ptr += inc_x;
- xbuffer[6] = x_ptr[0];
- xbuffer[7] = x_ptr[1];
- x_ptr += inc_x;
-
- ap[0] = a_ptr;
- ap[1] = a_ptr + lda;
- ap[2] = ap[1] + lda;
- ap[3] = ap[2] + lda;
- cgemv_kernel_16x4(NB,ap,xbuffer,ybuffer);
- a_ptr += 4 * lda;
- }
-
- for( i = 0; i < n2 ; i++)
- {
- xbuffer[0] = x_ptr[0];
- xbuffer[1] = x_ptr[1];
- x_ptr += inc_x;
- cgemv_kernel_16x1(NB,a_ptr,xbuffer,ybuffer);
- a_ptr += 1 * lda;
-
- }
- add_y(NB,ybuffer,y_ptr,inc_y,alpha_r,alpha_i);
- a += 2 * NB;
- y_ptr += NB * inc_y;
- }
-
- j=0;
- while ( j < (m % 16))
- {
- a_ptr = a;
- x_ptr = x;
- FLOAT temp_r = 0.0;
- FLOAT temp_i = 0.0;
- for( i = 0; i < n; i++ )
- {
-#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
- temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
- temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
-#else
- temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
- temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
-#endif
-
- a_ptr += lda;
- x_ptr += inc_x;
- }
-
-#if !defined(XCONJ)
- y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
- y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
-#else
- y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
- y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
-#endif
- y_ptr += inc_y;
- a+=2;
- j++;
- }
- return(0);
-}
-
-
diff --git a/kernel/x86_64/cgemv_n_4.c b/kernel/x86_64/cgemv_n_4.c
new file mode 100644
index 000000000..ff8058549
--- /dev/null
+++ b/kernel/x86_64/cgemv_n_4.c
@@ -0,0 +1,623 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "common.h"
+
+#if defined(HASWELL)
+#include "cgemv_n_microk_haswell-4.c"
+#endif
+
+
+#define NBMAX 2048
+
+#ifndef HAVE_KERNEL_4x4
+
// Portable C fallback for the 4-column complex GEMV-N micro-kernel:
//   y[0..n) (complex) += A0*x01 + A1*x23 + A2*x45 + A3*x67
// ap[0..3]: pointers to four matrix columns, interleaved re/im floats.
// x: four complex scalars (8 floats); y: n complex accumulators.
// The CONJ/XCONJ combination selects whether the matrix entries are
// conjugated in the product.
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
	BLASLONG i;
	FLOAT *a0,*a1,*a2,*a3;
	a0 = ap[0];
	a1 = ap[1];
	a2 = ap[2];
	a3 = ap[3];

	for ( i=0; i< 2*n; i+=2 )  // i indexes interleaved floats: 2 per complex element
	{
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
		// plain complex multiply-accumulate: y += a_col * x_col
		y[i]   += a0[i]*x[0]   - a0[i+1] * x[1];
		y[i+1] += a0[i]*x[1]   + a0[i+1] * x[0];
		y[i]   += a1[i]*x[2]   - a1[i+1] * x[3];
		y[i+1] += a1[i]*x[3]   + a1[i+1] * x[2];
		y[i]   += a2[i]*x[4]   - a2[i+1] * x[5];
		y[i+1] += a2[i]*x[5]   + a2[i+1] * x[4];
		y[i]   += a3[i]*x[6]   - a3[i+1] * x[7];
		y[i+1] += a3[i]*x[7]   + a3[i+1] * x[6];
#else
		// conjugated matrix entries: y += conj(a_col) * x_col
		y[i]   += a0[i]*x[0]   + a0[i+1] * x[1];
		y[i+1] += a0[i]*x[1]   - a0[i+1] * x[0];
		y[i]   += a1[i]*x[2]   + a1[i+1] * x[3];
		y[i+1] += a1[i]*x[3]   - a1[i+1] * x[2];
		y[i]   += a2[i]*x[4]   + a2[i+1] * x[5];
		y[i+1] += a2[i]*x[5]   - a2[i+1] * x[4];
		y[i]   += a3[i]*x[6]   + a3[i+1] * x[7];
		y[i+1] += a3[i]*x[7]   - a3[i+1] * x[6];
#endif
	}
}
+
+#endif
+
+
+
+#ifndef HAVE_KERNEL_4x2
+
+static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
+{
+ BLASLONG i;
+ FLOAT *a0,*a1;
+ a0 = ap[0];
+ a1 = ap[1];
+
+ for ( i=0; i< 2*n; i+=2 )
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ y[i] += a0[i]*x[0] - a0[i+1] * x[1];
+ y[i+1] += a0[i]*x[1] + a0[i+1] * x[0];
+ y[i] += a1[i]*x[2] - a1[i+1] * x[3];
+ y[i+1] += a1[i]*x[3] + a1[i+1] * x[2];
+#else
+ y[i] += a0[i]*x[0] + a0[i+1] * x[1];
+ y[i+1] += a0[i]*x[1] - a0[i+1] * x[0];
+ y[i] += a1[i]*x[2] + a1[i+1] * x[3];
+ y[i+1] += a1[i]*x[3] - a1[i+1] * x[2];
+#endif
+ }
+}
+
+#endif
+
+
+
+
+#ifndef HAVE_KERNEL_4x1
+
+
+static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
+{
+ BLASLONG i;
+ FLOAT *a0;
+ a0 = ap;
+
+ for ( i=0; i< 2*n; i+=2 )
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ y[i] += a0[i]*x[0] - a0[i+1] * x[1];
+ y[i+1] += a0[i]*x[1] + a0[i+1] * x[0];
+#else
+ y[i] += a0[i]*x[0] + a0[i+1] * x[1];
+ y[i+1] += a0[i]*x[1] - a0[i+1] * x[0];
+#endif
+
+ }
+}
+
+
+#endif
+
+
+#ifndef HAVE_KERNEL_ADDY
+
// Scale the accumulated GEMV result by alpha and add it into the output
// vector:  dest += alpha * src  for n complex elements.
// src is contiguous interleaved re/im; inc_dest is the float stride of
// dest (2 == contiguous complex).  XCONJ selects conjugation of alpha.
// noinline keeps the hot caller loop compact.
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline));

static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i)
{
	BLASLONG i;

	// Generic strided path: one complex element per iteration.
	if ( inc_dest != 2 )
	{

		FLOAT temp_r;
		FLOAT temp_i;
		for ( i=0; i<n; i++ )
		{
#if !defined(XCONJ)
			temp_r = alpha_r * src[0] - alpha_i * src[1];
			temp_i = alpha_r * src[1] + alpha_i * src[0];
#else
			// conj(alpha) * src
			temp_r =  alpha_r * src[0] + alpha_i * src[1];
			temp_i = -alpha_r * src[1] + alpha_i * src[0];
#endif

			*dest += temp_r;
			*(dest+1) += temp_i;

			src+=2;
			dest += inc_dest;
		}
		return;
	}

	// Contiguous fast path, unrolled by 4 complex elements.
	// NOTE(review): assumes n is a multiple of 4 — the caller passes NB,
	// which is 4-aligned by construction; confirm before reusing elsewhere.
	FLOAT temp_r0;
	FLOAT temp_i0;
	FLOAT temp_r1;
	FLOAT temp_i1;
	FLOAT temp_r2;
	FLOAT temp_i2;
	FLOAT temp_r3;
	FLOAT temp_i3;
	for ( i=0; i<n; i+=4 )
	{
#if !defined(XCONJ)
		temp_r0 = alpha_r * src[0] - alpha_i * src[1];
		temp_i0 = alpha_r * src[1] + alpha_i * src[0];
		temp_r1 = alpha_r * src[2] - alpha_i * src[3];
		temp_i1 = alpha_r * src[3] + alpha_i * src[2];
		temp_r2 = alpha_r * src[4] - alpha_i * src[5];
		temp_i2 = alpha_r * src[5] + alpha_i * src[4];
		temp_r3 = alpha_r * src[6] - alpha_i * src[7];
		temp_i3 = alpha_r * src[7] + alpha_i * src[6];
#else
		temp_r0 =  alpha_r * src[0] + alpha_i * src[1];
		temp_i0 = -alpha_r * src[1] + alpha_i * src[0];
		temp_r1 =  alpha_r * src[2] + alpha_i * src[3];
		temp_i1 = -alpha_r * src[3] + alpha_i * src[2];
		temp_r2 =  alpha_r * src[4] + alpha_i * src[5];
		temp_i2 = -alpha_r * src[5] + alpha_i * src[4];
		temp_r3 =  alpha_r * src[6] + alpha_i * src[7];
		temp_i3 = -alpha_r * src[7] + alpha_i * src[6];
#endif

		dest[0] += temp_r0;
		dest[1] += temp_i0;
		dest[2] += temp_r1;
		dest[3] += temp_i1;
		dest[4] += temp_r2;
		dest[5] += temp_i2;
		dest[6] += temp_r3;
		dest[7] += temp_i3;

		src  += 8;
		dest += 8;
	}
	return;

}
+
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+ BLASLONG i;
+ BLASLONG j;
+ FLOAT *a_ptr;
+ FLOAT *x_ptr;
+ FLOAT *y_ptr;
+ FLOAT *ap[4];
+ BLASLONG n1;
+ BLASLONG m1;
+ BLASLONG m2;
+ BLASLONG m3;
+ BLASLONG n2;
+ BLASLONG lda4;
+ FLOAT xbuffer[8],*ybuffer;
+
+
+#if 0
+printf("%s %d %d %.16f %.16f %d %d %d\n","zgemv_n",m,n,alpha_r,alpha_i,lda,inc_x,inc_y);
+#endif
+
+ if ( m < 1 ) return(0);
+ if ( n < 1 ) return(0);
+
+ ybuffer = buffer;
+
+ inc_x *= 2;
+ inc_y *= 2;
+ lda *= 2;
+ lda4 = 4 * lda;
+
+ n1 = n / 4 ;
+ n2 = n % 4 ;
+
+ m3 = m % 4;
+ m1 = m - ( m % 4 );
+ m2 = (m % NBMAX) - (m % 4) ;
+
+ y_ptr = y;
+
+ BLASLONG NB = NBMAX;
+
+ while ( NB == NBMAX )
+ {
+
+ m1 -= NB;
+ if ( m1 < 0)
+ {
+ if ( m2 == 0 ) break;
+ NB = m2;
+ }
+
+ a_ptr = a;
+ ap[0] = a_ptr;
+ ap[1] = a_ptr + lda;
+ ap[2] = ap[1] + lda;
+ ap[3] = ap[2] + lda;
+ x_ptr = x;
+ //zero_y(NB,ybuffer);
+ memset(ybuffer,0,NB*8);
+
+ if ( inc_x == 2 )
+ {
+
+ for( i = 0; i < n1 ; i++)
+ {
+ cgemv_kernel_4x4(NB,ap,x_ptr,ybuffer);
+ ap[0] += lda4;
+ ap[1] += lda4;
+ ap[2] += lda4;
+ ap[3] += lda4;
+ a_ptr += lda4;
+ x_ptr += 8;
+ }
+
+ if ( n2 & 2 )
+ {
+ cgemv_kernel_4x2(NB,ap,x_ptr,ybuffer);
+ x_ptr += 4;
+ a_ptr += 2 * lda;
+
+ }
+
+ if ( n2 & 1 )
+ {
+ cgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer);
+ x_ptr += 2;
+ a_ptr += lda;
+
+ }
+ }
+ else
+ {
+
+ for( i = 0; i < n1 ; i++)
+ {
+
+ xbuffer[0] = x_ptr[0];
+ xbuffer[1] = x_ptr[1];
+ x_ptr += inc_x;
+ xbuffer[2] = x_ptr[0];
+ xbuffer[3] = x_ptr[1];
+ x_ptr += inc_x;
+ xbuffer[4] = x_ptr[0];
+ xbuffer[5] = x_ptr[1];
+ x_ptr += inc_x;
+ xbuffer[6] = x_ptr[0];
+ xbuffer[7] = x_ptr[1];
+ x_ptr += inc_x;
+
+ cgemv_kernel_4x4(NB,ap,xbuffer,ybuffer);
+ ap[0] += lda4;
+ ap[1] += lda4;
+ ap[2] += lda4;
+ ap[3] += lda4;
+ a_ptr += lda4;
+ }
+
+ for( i = 0; i < n2 ; i++)
+ {
+ xbuffer[0] = x_ptr[0];
+ xbuffer[1] = x_ptr[1];
+ x_ptr += inc_x;
+ cgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer);
+ a_ptr += 1 * lda;
+
+ }
+
+ }
+
+ add_y(NB,ybuffer,y_ptr,inc_y,alpha_r,alpha_i);
+ a += 2 * NB;
+ y_ptr += NB * inc_y;
+ }
+
+ if ( m3 == 0 ) return(0);
+
+ if ( m3 == 1 )
+ {
+ a_ptr = a;
+ x_ptr = x;
+ FLOAT temp_r = 0.0;
+ FLOAT temp_i = 0.0;
+
+ if ( lda == 2 && inc_x == 2 )
+ {
+
+
+ for( i=0 ; i < (n & -2); i+=2 )
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+ temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+ temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3];
+ temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2];
+#else
+ temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+ temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+ temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3];
+ temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2];
+#endif
+
+ a_ptr += 4;
+ x_ptr += 4;
+ }
+
+
+
+ for( ; i < n; i++ )
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+ temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+#else
+ temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+ temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+#endif
+
+ a_ptr += 2;
+ x_ptr += 2;
+ }
+
+
+ }
+ else
+ {
+
+ for( i = 0; i < n; i++ )
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+ temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+#else
+ temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+ temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+#endif
+
+ a_ptr += lda;
+ x_ptr += inc_x;
+ }
+
+ }
+#if !defined(XCONJ)
+ y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
+ y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
+#else
+ y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
+ y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
+#endif
+ return(0);
+ }
+
+ if ( m3 == 2 )
+ {
+ a_ptr = a;
+ x_ptr = x;
+ FLOAT temp_r0 = 0.0;
+ FLOAT temp_i0 = 0.0;
+ FLOAT temp_r1 = 0.0;
+ FLOAT temp_i1 = 0.0;
+
+ if ( lda == 4 && inc_x == 2 )
+ {
+
+ for( i = 0; i < (n & -2); i+=2 )
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+
+ temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+ temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+ temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
+ temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
+
+ temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3];
+ temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2];
+ temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3];
+ temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2];
+
+#else
+ temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+ temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+ temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+ temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
+
+ temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3];
+ temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2];
+ temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3];
+ temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2];
+
+#endif
+
+ a_ptr += 8;
+ x_ptr += 4;
+ }
+
+
+ for( ; i < n; i++ )
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+ temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+ temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
+ temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
+#else
+ temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+ temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+ temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+ temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
+#endif
+
+ a_ptr += 4;
+ x_ptr += 2;
+ }
+
+
+ }
+ else
+ {
+
+ for( i=0 ; i < n; i++ )
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+ temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+ temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
+ temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
+#else
+ temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+ temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+ temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+ temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
+#endif
+
+ a_ptr += lda;
+ x_ptr += inc_x;
+ }
+
+
+ }
+#if !defined(XCONJ)
+ y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
+ y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
+ y_ptr += inc_y;
+ y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
+ y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
+#else
+ y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
+ y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
+ y_ptr += inc_y;
+ y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
+ y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
+#endif
+ return(0);
+ }
+
+
+ if ( m3 == 3 )
+ {
+ a_ptr = a;
+ x_ptr = x;
+ FLOAT temp_r0 = 0.0;
+ FLOAT temp_i0 = 0.0;
+ FLOAT temp_r1 = 0.0;
+ FLOAT temp_i1 = 0.0;
+ FLOAT temp_r2 = 0.0;
+ FLOAT temp_i2 = 0.0;
+
+ if ( lda == 6 && inc_x == 2 )
+ {
+
+ for( i=0 ; i < n; i++ )
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+ temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+ temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
+ temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
+ temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
+ temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
+#else
+ temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+ temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+ temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+ temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
+ temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
+ temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
+#endif
+
+ a_ptr += 6;
+ x_ptr += 2;
+ }
+
+
+ }
+ else
+ {
+
+ for( i = 0; i < n; i++ )
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+ temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+ temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
+ temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
+ temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
+ temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
+#else
+ temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+ temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+ temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+ temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
+ temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
+ temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
+#endif
+
+ a_ptr += lda;
+ x_ptr += inc_x;
+ }
+
+ }
+#if !defined(XCONJ)
+ y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
+ y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
+ y_ptr += inc_y;
+ y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
+ y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
+ y_ptr += inc_y;
+ y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2;
+ y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2;
+#else
+ y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
+ y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
+ y_ptr += inc_y;
+ y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
+ y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
+ y_ptr += inc_y;
+ y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2;
+ y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2;
+#endif
+ return(0);
+ }
+
+
+
+
+
+ return(0);
+}
+
+
diff --git a/kernel/x86_64/cgemv_n_microk_haswell-2.c b/kernel/x86_64/cgemv_n_microk_haswell-2.c
deleted file mode 100644
index 9b1501013..000000000
--- a/kernel/x86_64/cgemv_n_microk_haswell-2.c
+++ /dev/null
@@ -1,137 +0,0 @@
-/***************************************************************************
-Copyright (c) 2014, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-#define HAVE_KERNEL_16x4 1
-static void cgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-
-static void cgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
-{
-
- BLASLONG register i = 0;
-
- __asm__ __volatile__
- (
- "vzeroupper \n\t"
-
- "vbroadcastss (%2), %%ymm0 \n\t" // real part x0
- "vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0
- "vbroadcastss 8(%2), %%ymm2 \n\t" // real part x1
- "vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1
- "vbroadcastss 16(%2), %%ymm4 \n\t" // real part x2
- "vbroadcastss 20(%2), %%ymm5 \n\t" // imag part x2
- "vbroadcastss 24(%2), %%ymm6 \n\t" // real part x3
- "vbroadcastss 28(%2), %%ymm7 \n\t" // imag part x3
-
-
- ".align 16 \n\t"
- ".L01LOOP%=: \n\t"
- "prefetcht0 320(%4,%0,4) \n\t"
- "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
- "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
-
- "prefetcht0 320(%5,%0,4) \n\t"
- "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
- "vmovups 32(%5,%0,4), %%ymm11 \n\t" // 4 complex values form a1
-
- "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
- "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
- "vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
- "vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
-
- "prefetcht0 320(%6,%0,4) \n\t"
- "vmovups (%6,%0,4), %%ymm8 \n\t" // 4 complex values form a2
- "vmovups 32(%6,%0,4), %%ymm9 \n\t" // 4 complex values form a2
-
- "vfmadd231ps %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
- "vfmadd231ps %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
- "vfmadd231ps %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
- "vfmadd231ps %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
-
- "prefetcht0 320(%7,%0,4) \n\t"
- "vmovups (%7,%0,4), %%ymm10 \n\t" // 4 complex values form a3
- "vmovups 32(%7,%0,4), %%ymm11 \n\t" // 4 complex values form a3
-
- "vfmadd231ps %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
- "vfmadd231ps %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
- "vfmadd231ps %%ymm9 , %%ymm4, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
- "vfmadd231ps %%ymm9 , %%ymm5, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
-
- "vfmadd231ps %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
- "vfmadd231ps %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
- "vfmadd231ps %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
- "vfmadd231ps %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
-
- "prefetcht0 320(%3,%0,4) \n\t"
- "vmovups (%3,%0,4), %%ymm10 \n\t"
- "vmovups 32(%3,%0,4), %%ymm11 \n\t"
-
-#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
- "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
- "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t"
- "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
- "vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t"
-#else
- "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
- "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t"
- "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
- "vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t"
- "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
- "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
-#endif
-
- "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
- "vaddps %%ymm9, %%ymm11, %%ymm13 \n\t"
-
- "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
- "vmovups %%ymm13, 32(%3,%0,4) \n\t"
-
- "addq $16, %0 \n\t"
- "subq $8 , %1 \n\t"
- "jnz .L01LOOP%= \n\t"
- "vzeroupper \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (n), // 1
- "r" (x), // 2
- "r" (y), // 3
- "r" (ap[0]), // 4
- "r" (ap[1]), // 5
- "r" (ap[2]), // 6
- "r" (ap[3]) // 7
- : "cc",
- "%xmm0", "%xmm1", "%xmm2", "%xmm3",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
- "memory"
- );
-
-}
-
-
diff --git a/kernel/x86_64/cgemv_n_microk_haswell-4.c b/kernel/x86_64/cgemv_n_microk_haswell-4.c
new file mode 100644
index 000000000..24417ba36
--- /dev/null
+++ b/kernel/x86_64/cgemv_n_microk_haswell-4.c
@@ -0,0 +1,542 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_4x4 1
+static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
+{
+
+ BLASLONG register i = 0;
+ BLASLONG register n1 = n & -8 ;
+ BLASLONG register n2 = n & 4 ;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+
+ "vbroadcastss (%2), %%ymm0 \n\t" // real part x0
+ "vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0
+ "vbroadcastss 8(%2), %%ymm2 \n\t" // real part x1
+ "vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1
+ "vbroadcastss 16(%2), %%ymm4 \n\t" // real part x2
+ "vbroadcastss 20(%2), %%ymm5 \n\t" // imag part x2
+ "vbroadcastss 24(%2), %%ymm6 \n\t" // real part x3
+ "vbroadcastss 28(%2), %%ymm7 \n\t" // imag part x3
+
+ "cmpq $0 , %1 \n\t"
+ "je .L01END%= \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ "prefetcht0 320(%4,%0,4) \n\t"
+ "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
+ "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
+
+ "prefetcht0 320(%5,%0,4) \n\t"
+ "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
+ "vmovups 32(%5,%0,4), %%ymm11 \n\t" // 4 complex values form a1
+
+ "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "prefetcht0 320(%6,%0,4) \n\t"
+ "vmovups (%6,%0,4), %%ymm8 \n\t" // 4 complex values form a2
+ "vmovups 32(%6,%0,4), %%ymm9 \n\t" // 4 complex values form a2
+
+ "vfmadd231ps %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmadd231ps %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vfmadd231ps %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vfmadd231ps %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "prefetcht0 320(%7,%0,4) \n\t"
+ "vmovups (%7,%0,4), %%ymm10 \n\t" // 4 complex values form a3
+ "vmovups 32(%7,%0,4), %%ymm11 \n\t" // 4 complex values form a3
+
+ "vfmadd231ps %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmadd231ps %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vfmadd231ps %%ymm9 , %%ymm4, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vfmadd231ps %%ymm9 , %%ymm5, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "vfmadd231ps %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmadd231ps %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vfmadd231ps %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vfmadd231ps %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "prefetcht0 320(%3,%0,4) \n\t"
+ "vmovups (%3,%0,4), %%ymm10 \n\t"
+ "vmovups 32(%3,%0,4), %%ymm11 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
+ "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t"
+ "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
+ "vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t"
+#else
+ "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
+ "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t"
+ "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t"
+ "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
+#endif
+
+ "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
+ "vaddps %%ymm9, %%ymm11, %%ymm13 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
+ "vmovups %%ymm13, 32(%3,%0,4) \n\t"
+
+ "addq $16, %0 \n\t"
+ "subq $8 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ ".L01END%=: \n\t"
+
+ "cmpq $4, %8 \n\t"
+ "jne .L02END%= \n\t"
+
+ "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
+ "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
+
+ "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vfmadd231ps %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmadd231ps %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vmovups (%6,%0,4), %%ymm8 \n\t" // 4 complex values form a2
+ "vmovups (%7,%0,4), %%ymm10 \n\t" // 4 complex values form a3
+
+ "vfmadd231ps %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmadd231ps %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vfmadd231ps %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmadd231ps %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vmovups (%3,%0,4), %%ymm10 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
+ "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
+#else
+ "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
+ "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
+#endif
+
+ "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
+
+ ".L02END%=: \n\t"
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n1), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (ap[2]), // 6
+ "r" (ap[3]), // 7
+ "r" (n2) // 8
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
+#define HAVE_KERNEL_4x2 1
+static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
+{
+
+ BLASLONG register i = 0;
+ BLASLONG register n1 = n & -8 ;
+ BLASLONG register n2 = n & 4 ;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+
+ "vbroadcastss (%2), %%ymm0 \n\t" // real part x0
+ "vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0
+ "vbroadcastss 8(%2), %%ymm2 \n\t" // real part x1
+ "vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1
+
+ "cmpq $0 , %1 \n\t"
+ "je .L01END%= \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ "prefetcht0 320(%4,%0,4) \n\t"
+ "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
+ "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
+
+ "prefetcht0 320(%5,%0,4) \n\t"
+ "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
+ "vmovups 32(%5,%0,4), %%ymm11 \n\t" // 4 complex values form a1
+
+ "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "vfmadd231ps %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmadd231ps %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vfmadd231ps %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vfmadd231ps %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "prefetcht0 320(%3,%0,4) \n\t"
+ "vmovups (%3,%0,4), %%ymm10 \n\t"
+ "vmovups 32(%3,%0,4), %%ymm11 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
+ "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t"
+ "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
+ "vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t"
+#else
+ "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
+ "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t"
+ "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t"
+ "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
+#endif
+
+ "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
+ "vaddps %%ymm9, %%ymm11, %%ymm13 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
+ "vmovups %%ymm13, 32(%3,%0,4) \n\t"
+
+ "addq $16, %0 \n\t"
+ "subq $8 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ ".L01END%=: \n\t"
+
+ "cmpq $4, %6 \n\t"
+ "jne .L02END%= \n\t"
+
+ "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
+ "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
+
+ "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vfmadd231ps %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmadd231ps %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vmovups (%3,%0,4), %%ymm10 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
+ "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
+#else
+ "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
+ "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
+#endif
+
+ "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
+
+ ".L02END%=: \n\t"
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n1), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (n2) // 6
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
+#define HAVE_KERNEL_4x1 1
+static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
+{
+
+ BLASLONG register i = 0;
+ BLASLONG register n1 = n & -8 ;
+ BLASLONG register n2 = n & 4 ;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+
+ "vbroadcastss (%2), %%ymm0 \n\t" // real part x0
+ "vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0
+
+ "cmpq $0 , %1 \n\t"
+ "je .L01END%= \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ "prefetcht0 320(%4,%0,4) \n\t"
+ "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
+ "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
+
+ "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "prefetcht0 320(%3,%0,4) \n\t"
+ "vmovups (%3,%0,4), %%ymm10 \n\t"
+ "vmovups 32(%3,%0,4), %%ymm11 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
+ "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t"
+ "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
+ "vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t"
+#else
+ "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
+ "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t"
+ "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t"
+ "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
+#endif
+
+ "addq $16, %0 \n\t"
+ "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
+ "vaddps %%ymm9, %%ymm11, %%ymm13 \n\t"
+
+ "subq $8 , %1 \n\t"
+ "vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y
+ "vmovups %%ymm13,-32(%3,%0,4) \n\t"
+
+ "jnz .L01LOOP%= \n\t"
+
+ ".L01END%=: \n\t"
+
+ "cmpq $4, %5 \n\t"
+ "jne .L02END%= \n\t"
+
+ "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
+
+ "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vmovups (%3,%0,4), %%ymm10 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
+ "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
+#else
+ "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
+ "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
+#endif
+
+ "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
+
+ ".L02END%=: \n\t"
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n1), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap), // 4
+ "r" (n2) // 5
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
+#define HAVE_KERNEL_ADDY 1
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline));
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i)
+{
+ BLASLONG i;
+
+ if ( inc_dest != 2 )
+ {
+
+ FLOAT temp_r;
+ FLOAT temp_i;
+ for ( i=0; i<n; i++ )
+ {
+#if !defined(XCONJ)
+ temp_r = alpha_r * src[0] - alpha_i * src[1];
+ temp_i = alpha_r * src[1] + alpha_i * src[0];
+#else
+ temp_r = alpha_r * src[0] + alpha_i * src[1];
+ temp_i = -alpha_r * src[1] + alpha_i * src[0];
+#endif
+
+ *dest += temp_r;
+ *(dest+1) += temp_i;
+
+ src+=2;
+ dest += inc_dest;
+ }
+ return;
+ }
+
+ i=0;
+ BLASLONG register n1 = n & -8 ;
+ BLASLONG register n2 = n & 4 ;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+
+ "vbroadcastss (%4), %%ymm0 \n\t" // alpha_r
+ "vbroadcastss (%5), %%ymm1 \n\t" // alpha_i
+
+ "cmpq $0 , %1 \n\t"
+ "je .L01END%= \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ "vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values from src
+ "vmovups 32(%2,%0,4), %%ymm9 \n\t"
+
+ "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "vmovups (%3,%0,4), %%ymm10 \n\t" // 4 complex values from dest
+ "vmovups 32(%3,%0,4), %%ymm11 \n\t"
+
+#if !defined(XCONJ)
+ "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
+ "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t"
+ "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
+ "vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t"
+#else
+ "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
+ "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t"
+ "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t"
+ "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
+#endif
+
+ "addq $16, %0 \n\t"
+ "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
+ "vaddps %%ymm9, %%ymm11, %%ymm13 \n\t"
+
+ "subq $8 , %1 \n\t"
+ "vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y
+ "vmovups %%ymm13,-32(%3,%0,4) \n\t"
+
+ "jnz .L01LOOP%= \n\t"
+
+ ".L01END%=: \n\t"
+
+ "cmpq $4, %6 \n\t"
+ "jne .L02END%= \n\t"
+
+ "vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values src
+
+ "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vmovups (%3,%0,4), %%ymm10 \n\t"
+
+#if !defined(XCONJ)
+ "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
+ "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
+#else
+ "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
+ "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
+#endif
+
+ "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
+
+ ".L02END%=: \n\t"
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n1), // 1
+ "r" (src), // 2
+ "r" (dest), // 3
+ "r" (&alpha_r), // 4
+ "r" (&alpha_i), // 5
+ "r" (n2) // 6
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+ return;
+
+}
+
diff --git a/kernel/x86_64/cgemv_t.c b/kernel/x86_64/cgemv_t.c
deleted file mode 100644
index e40fd349e..000000000
--- a/kernel/x86_64/cgemv_t.c
+++ /dev/null
@@ -1,265 +0,0 @@
-/***************************************************************************
-Copyright (c) 2014, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-
-#include "common.h"
-
-#if defined(HASWELL)
-#include "cgemv_t_microk_haswell-2.c"
-#endif
-
-#define NBMAX 2048
-
-#ifndef HAVE_KERNEL_16x4
-
-static void cgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
-{
- BLASLONG i;
- FLOAT *a0,*a1,*a2,*a3;
- a0 = ap[0];
- a1 = ap[1];
- a2 = ap[2];
- a3 = ap[3];
- FLOAT temp_r0 = 0.0;
- FLOAT temp_r1 = 0.0;
- FLOAT temp_r2 = 0.0;
- FLOAT temp_r3 = 0.0;
- FLOAT temp_i0 = 0.0;
- FLOAT temp_i1 = 0.0;
- FLOAT temp_i2 = 0.0;
- FLOAT temp_i3 = 0.0;
-
-
- for ( i=0; i< 2*n; i+=2 )
- {
-#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
- temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1];
- temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i];
- temp_r1 += a1[i]*x[i] - a1[i+1]*x[i+1];
- temp_i1 += a1[i]*x[i+1] + a1[i+1]*x[i];
- temp_r2 += a2[i]*x[i] - a2[i+1]*x[i+1];
- temp_i2 += a2[i]*x[i+1] + a2[i+1]*x[i];
- temp_r3 += a3[i]*x[i] - a3[i+1]*x[i+1];
- temp_i3 += a3[i]*x[i+1] + a3[i+1]*x[i];
-#else
- temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1];
- temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i];
- temp_r1 += a1[i]*x[i] + a1[i+1]*x[i+1];
- temp_i1 += a1[i]*x[i+1] - a1[i+1]*x[i];
- temp_r2 += a2[i]*x[i] + a2[i+1]*x[i+1];
- temp_i2 += a2[i]*x[i+1] - a2[i+1]*x[i];
- temp_r3 += a3[i]*x[i] + a3[i+1]*x[i+1];
- temp_i3 += a3[i]*x[i+1] - a3[i+1]*x[i];
-#endif
- }
- y[0] = temp_r0;
- y[1] = temp_i0;
- y[2] = temp_r1;
- y[3] = temp_i1;
- y[4] = temp_r2;
- y[5] = temp_i2;
- y[6] = temp_r3;
- y[7] = temp_i3;
-}
-
-#endif
-
-static void cgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
-{
- BLASLONG i;
- FLOAT *a0;
- a0 = ap;
- FLOAT temp_r = 0.0;
- FLOAT temp_i = 0.0;
-
- for ( i=0; i< 2*n; i+=2 )
- {
-#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
- temp_r += a0[i]*x[i] - a0[i+1]*x[i+1];
- temp_i += a0[i]*x[i+1] + a0[i+1]*x[i];
-#else
- temp_r += a0[i]*x[i] + a0[i+1]*x[i+1];
- temp_i += a0[i]*x[i+1] - a0[i+1]*x[i];
-#endif
- }
- *y = temp_r;
- *(y+1) = temp_i;
-}
-
-static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
-{
- BLASLONG i;
- for ( i=0; i<n; i++ )
- {
- *dest = *src;
- *(dest+1) = *(src+1);
- dest+=2;
- src += inc_src;
- }
-}
-
-
-int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
-{
- BLASLONG i;
- BLASLONG j;
- FLOAT *a_ptr;
- FLOAT *x_ptr;
- FLOAT *y_ptr;
- FLOAT *ap[8];
- BLASLONG n1;
- BLASLONG m1;
- BLASLONG m2;
- BLASLONG n2;
- FLOAT ybuffer[8],*xbuffer;
-
- inc_x *= 2;
- inc_y *= 2;
- lda *= 2;
-
- xbuffer = buffer;
-
- n1 = n / 4 ;
- n2 = n % 4 ;
-
- m1 = m - ( m % 16 );
- m2 = (m % NBMAX) - (m % 16) ;
-
-
- BLASLONG NB = NBMAX;
-
- while ( NB == NBMAX )
- {
-
- m1 -= NB;
- if ( m1 < 0)
- {
- if ( m2 == 0 ) break;
- NB = m2;
- }
-
- y_ptr = y;
- a_ptr = a;
- x_ptr = x;
- copy_x(NB,x_ptr,xbuffer,inc_x);
- for( i = 0; i < n1 ; i++)
- {
- ap[0] = a_ptr;
- ap[1] = a_ptr + lda;
- ap[2] = ap[1] + lda;
- ap[3] = ap[2] + lda;
- cgemv_kernel_16x4(NB,ap,xbuffer,ybuffer);
- a_ptr += 4 * lda;
-
-#if !defined(XCONJ)
- y_ptr[0] += alpha_r * ybuffer[0] - alpha_i * ybuffer[1];
- y_ptr[1] += alpha_r * ybuffer[1] + alpha_i * ybuffer[0];
- y_ptr += inc_y;
- y_ptr[0] += alpha_r * ybuffer[2] - alpha_i * ybuffer[3];
- y_ptr[1] += alpha_r * ybuffer[3] + alpha_i * ybuffer[2];
- y_ptr += inc_y;
- y_ptr[0] += alpha_r * ybuffer[4] - alpha_i * ybuffer[5];
- y_ptr[1] += alpha_r * ybuffer[5] + alpha_i * ybuffer[4];
- y_ptr += inc_y;
- y_ptr[0] += alpha_r * ybuffer[6] - alpha_i * ybuffer[7];
- y_ptr[1] += alpha_r * ybuffer[7] + alpha_i * ybuffer[6];
- y_ptr += inc_y;
-#else
- y_ptr[0] += alpha_r * ybuffer[0] + alpha_i * ybuffer[1];
- y_ptr[1] -= alpha_r * ybuffer[1] - alpha_i * ybuffer[0];
- y_ptr += inc_y;
- y_ptr[0] += alpha_r * ybuffer[2] + alpha_i * ybuffer[3];
- y_ptr[1] -= alpha_r * ybuffer[3] - alpha_i * ybuffer[2];
- y_ptr += inc_y;
- y_ptr[0] += alpha_r * ybuffer[4] + alpha_i * ybuffer[5];
- y_ptr[1] -= alpha_r * ybuffer[5] - alpha_i * ybuffer[4];
- y_ptr += inc_y;
- y_ptr[0] += alpha_r * ybuffer[6] + alpha_i * ybuffer[7];
- y_ptr[1] -= alpha_r * ybuffer[7] - alpha_i * ybuffer[6];
- y_ptr += inc_y;
-#endif
- }
-
- for( i = 0; i < n2 ; i++)
- {
- cgemv_kernel_16x1(NB,a_ptr,xbuffer,ybuffer);
- a_ptr += 1 * lda;
-
-#if !defined(XCONJ)
- y_ptr[0] += alpha_r * ybuffer[0] - alpha_i * ybuffer[1];
- y_ptr[1] += alpha_r * ybuffer[1] + alpha_i * ybuffer[0];
- y_ptr += inc_y;
-#else
- y_ptr[0] += alpha_r * ybuffer[0] + alpha_i * ybuffer[1];
- y_ptr[1] -= alpha_r * ybuffer[1] - alpha_i * ybuffer[0];
- y_ptr += inc_y;
-#endif
-
- }
- a += 2* NB;
- x += NB * inc_x;
- }
-
- BLASLONG m3 = m % 16;
- if ( m3 == 0 ) return(0);
-
- x_ptr = x;
- copy_x(m3,x_ptr,xbuffer,inc_x);
- j=0;
- a_ptr = a;
- y_ptr = y;
- while ( j < n)
- {
- FLOAT temp_r = 0.0;
- FLOAT temp_i = 0.0;
- for( i = 0; i < m3*2; i+=2 )
- {
-#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
- temp_r += a_ptr[i] * xbuffer[i] - a_ptr[i+1] * xbuffer[i+1];
- temp_i += a_ptr[i] * xbuffer[i+1] + a_ptr[i+1] * xbuffer[i];
-#else
- temp_r += a_ptr[i] * xbuffer[i] + a_ptr[i+1] * xbuffer[i+1];
- temp_i += a_ptr[i] * xbuffer[i+1] - a_ptr[i+1] * xbuffer[i];
-#endif
- }
- a_ptr += lda;
-
-#if !defined(XCONJ)
- y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
- y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
-#else
- y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
- y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
-#endif
-
- y_ptr += inc_y;
- j++;
- }
- return(0);
-}
-
-
diff --git a/kernel/x86_64/cgemv_t_4.c b/kernel/x86_64/cgemv_t_4.c
new file mode 100644
index 000000000..b383a4869
--- /dev/null
+++ b/kernel/x86_64/cgemv_t_4.c
@@ -0,0 +1,579 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+#if defined(HASWELL)
+#include "cgemv_t_microk_haswell-4.c"
+#endif
+
+#define NBMAX 2048
+
+#ifndef HAVE_KERNEL_4x4
+
+static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+ BLASLONG i;
+ FLOAT *a0,*a1,*a2,*a3;
+ a0 = ap[0];
+ a1 = ap[1];
+ a2 = ap[2];
+ a3 = ap[3];
+ FLOAT alpha_r = alpha[0];
+ FLOAT alpha_i = alpha[1];
+ FLOAT temp_r0 = 0.0;
+ FLOAT temp_r1 = 0.0;
+ FLOAT temp_r2 = 0.0;
+ FLOAT temp_r3 = 0.0;
+ FLOAT temp_i0 = 0.0;
+ FLOAT temp_i1 = 0.0;
+ FLOAT temp_i2 = 0.0;
+ FLOAT temp_i3 = 0.0;
+
+
+ for ( i=0; i< 2*n; i+=2 )
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1];
+ temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i];
+ temp_r1 += a1[i]*x[i] - a1[i+1]*x[i+1];
+ temp_i1 += a1[i]*x[i+1] + a1[i+1]*x[i];
+ temp_r2 += a2[i]*x[i] - a2[i+1]*x[i+1];
+ temp_i2 += a2[i]*x[i+1] + a2[i+1]*x[i];
+ temp_r3 += a3[i]*x[i] - a3[i+1]*x[i+1];
+ temp_i3 += a3[i]*x[i+1] + a3[i+1]*x[i];
+#else
+ temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1];
+ temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i];
+ temp_r1 += a1[i]*x[i] + a1[i+1]*x[i+1];
+ temp_i1 += a1[i]*x[i+1] - a1[i+1]*x[i];
+ temp_r2 += a2[i]*x[i] + a2[i+1]*x[i+1];
+ temp_i2 += a2[i]*x[i+1] - a2[i+1]*x[i];
+ temp_r3 += a3[i]*x[i] + a3[i+1]*x[i+1];
+ temp_i3 += a3[i]*x[i+1] - a3[i+1]*x[i];
+#endif
+ }
+
+#if !defined(XCONJ)
+
+ y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
+ y[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
+ y[2] += alpha_r * temp_r1 - alpha_i * temp_i1;
+ y[3] += alpha_r * temp_i1 + alpha_i * temp_r1;
+ y[4] += alpha_r * temp_r2 - alpha_i * temp_i2;
+ y[5] += alpha_r * temp_i2 + alpha_i * temp_r2;
+ y[6] += alpha_r * temp_r3 - alpha_i * temp_i3;
+ y[7] += alpha_r * temp_i3 + alpha_i * temp_r3;
+
+#else
+
+ y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
+ y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
+ y[2] += alpha_r * temp_r1 + alpha_i * temp_i1;
+ y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1;
+ y[4] += alpha_r * temp_r2 + alpha_i * temp_i2;
+ y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2;
+ y[6] += alpha_r * temp_r3 + alpha_i * temp_i3;
+ y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3;
+
+#endif
+}
+
+#endif
+
+#ifndef HAVE_KERNEL_4x2
+
+static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+ BLASLONG i;
+ FLOAT *a0,*a1;
+ a0 = ap[0];
+ a1 = ap[1];
+ FLOAT alpha_r = alpha[0];
+ FLOAT alpha_i = alpha[1];
+ FLOAT temp_r0 = 0.0;
+ FLOAT temp_r1 = 0.0;
+ FLOAT temp_i0 = 0.0;
+ FLOAT temp_i1 = 0.0;
+
+
+ for ( i=0; i< 2*n; i+=2 )
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1];
+ temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i];
+ temp_r1 += a1[i]*x[i] - a1[i+1]*x[i+1];
+ temp_i1 += a1[i]*x[i+1] + a1[i+1]*x[i];
+#else
+ temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1];
+ temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i];
+ temp_r1 += a1[i]*x[i] + a1[i+1]*x[i+1];
+ temp_i1 += a1[i]*x[i+1] - a1[i+1]*x[i];
+#endif
+ }
+
+#if !defined(XCONJ)
+
+ y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
+ y[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
+ y[2] += alpha_r * temp_r1 - alpha_i * temp_i1;
+ y[3] += alpha_r * temp_i1 + alpha_i * temp_r1;
+
+#else
+
+ y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
+ y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
+ y[2] += alpha_r * temp_r1 + alpha_i * temp_i1;
+ y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1;
+
+#endif
+}
+
+#endif
+
+
+#ifndef HAVE_KERNEL_4x1
+
+static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+ BLASLONG i;
+ FLOAT *a0;
+ a0 = ap;
+ FLOAT alpha_r = alpha[0];
+ FLOAT alpha_i = alpha[1];
+ FLOAT temp_r0 = 0.0;
+ FLOAT temp_i0 = 0.0;
+
+ for ( i=0; i< 2*n; i+=2 )
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1];
+ temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i];
+#else
+ temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1];
+ temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i];
+#endif
+ }
+
+#if !defined(XCONJ)
+
+ y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
+ y[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
+
+#else
+
+ y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
+ y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
+
+#endif
+
+
+}
+
+#endif
+
+
+static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
+{
+ BLASLONG i;
+ for ( i=0; i<n; i++ )
+ {
+ *dest = *src;
+ *(dest+1) = *(src+1);
+ dest+=2;
+ src += inc_src;
+ }
+}
+
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+ BLASLONG i;
+ BLASLONG j;
+ FLOAT *a_ptr;
+ FLOAT *x_ptr;
+ FLOAT *y_ptr;
+ FLOAT *ap[8];
+ BLASLONG n1;
+ BLASLONG m1;
+ BLASLONG m2;
+ BLASLONG m3;
+ BLASLONG n2;
+ BLASLONG lda4;
+ FLOAT ybuffer[8],*xbuffer;
+ FLOAT alpha[2];
+
+ if ( m < 1 ) return(0);
+ if ( n < 1 ) return(0);
+
+ inc_x <<= 1;
+ inc_y <<= 1;
+ lda <<= 1;
+ lda4 = lda << 2;
+
+ xbuffer = buffer;
+
+ n1 = n >> 2 ;
+ n2 = n & 3 ;
+
+ m3 = m & 3 ;
+ m1 = m - m3;
+ m2 = (m & (NBMAX-1)) - m3 ;
+
+ alpha[0] = alpha_r;
+ alpha[1] = alpha_i;
+
+ BLASLONG NB = NBMAX;
+
+ while ( NB == NBMAX )
+ {
+
+ m1 -= NB;
+ if ( m1 < 0)
+ {
+ if ( m2 == 0 ) break;
+ NB = m2;
+ }
+
+ y_ptr = y;
+ a_ptr = a;
+ x_ptr = x;
+ ap[0] = a_ptr;
+ ap[1] = a_ptr + lda;
+ ap[2] = ap[1] + lda;
+ ap[3] = ap[2] + lda;
+ if ( inc_x != 2 )
+ copy_x(NB,x_ptr,xbuffer,inc_x);
+ else
+ xbuffer = x_ptr;
+
+ if ( inc_y == 2 )
+ {
+
+ for( i = 0; i < n1 ; i++)
+ {
+ cgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha);
+ ap[0] += lda4;
+ ap[1] += lda4;
+ ap[2] += lda4;
+ ap[3] += lda4;
+ a_ptr += lda4;
+ y_ptr += 8;
+
+ }
+
+ if ( n2 & 2 )
+ {
+ cgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha);
+ a_ptr += lda * 2;
+ y_ptr += 4;
+
+ }
+
+ if ( n2 & 1 )
+ {
+ cgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha);
+ a_ptr += lda;
+ y_ptr += 2;
+
+ }
+
+ }
+ else
+ {
+
+ for( i = 0; i < n1 ; i++)
+ {
+ memset(ybuffer,0,32);
+ cgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha);
+ ap[0] += lda4;
+ ap[1] += lda4;
+ ap[2] += lda4;
+ ap[3] += lda4;
+ a_ptr += lda4;
+
+ y_ptr[0] += ybuffer[0];
+ y_ptr[1] += ybuffer[1];
+ y_ptr += inc_y;
+ y_ptr[0] += ybuffer[2];
+ y_ptr[1] += ybuffer[3];
+ y_ptr += inc_y;
+ y_ptr[0] += ybuffer[4];
+ y_ptr[1] += ybuffer[5];
+ y_ptr += inc_y;
+ y_ptr[0] += ybuffer[6];
+ y_ptr[1] += ybuffer[7];
+ y_ptr += inc_y;
+
+ }
+
+ for( i = 0; i < n2 ; i++)
+ {
+ memset(ybuffer,0,32);
+ cgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha);
+ a_ptr += lda;
+ y_ptr[0] += ybuffer[0];
+ y_ptr[1] += ybuffer[1];
+ y_ptr += inc_y;
+
+ }
+
+ }
+ a += 2 * NB;
+ x += NB * inc_x;
+ }
+
+
+
+ if ( m3 == 0 ) return(0);
+
+ x_ptr = x;
+ j=0;
+ a_ptr = a;
+ y_ptr = y;
+
+ if ( m3 == 3 )
+ {
+
+ FLOAT temp_r ;
+ FLOAT temp_i ;
+ FLOAT x0 = x_ptr[0];
+ FLOAT x1 = x_ptr[1];
+ x_ptr += inc_x;
+ FLOAT x2 = x_ptr[0];
+ FLOAT x3 = x_ptr[1];
+ x_ptr += inc_x;
+ FLOAT x4 = x_ptr[0];
+ FLOAT x5 = x_ptr[1];
+ while ( j < n)
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
+ temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
+ temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
+ temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
+ temp_r += a_ptr[4] * x4 - a_ptr[5] * x5;
+ temp_i += a_ptr[4] * x5 + a_ptr[5] * x4;
+#else
+
+ temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
+ temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
+ temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
+ temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
+ temp_r += a_ptr[4] * x4 + a_ptr[5] * x5;
+ temp_i += a_ptr[4] * x5 - a_ptr[5] * x4;
+#endif
+
+#if !defined(XCONJ)
+ y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
+ y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
+#else
+ y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
+ y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
+#endif
+
+ a_ptr += lda;
+ y_ptr += inc_y;
+ j++;
+ }
+ return(0);
+ }
+
+
+ if ( m3 == 2 )
+ {
+
+ FLOAT temp_r ;
+ FLOAT temp_i ;
+ FLOAT temp_r1 ;
+ FLOAT temp_i1 ;
+ FLOAT x0 = x_ptr[0];
+ FLOAT x1 = x_ptr[1];
+ x_ptr += inc_x;
+ FLOAT x2 = x_ptr[0];
+ FLOAT x3 = x_ptr[1];
+ FLOAT ar = alpha[0];
+ FLOAT ai = alpha[1];
+
+ while ( j < ( n & -2 ))
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
+ temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
+ temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
+ temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
+ a_ptr += lda;
+ temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1;
+ temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0;
+ temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3;
+ temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2;
+#else
+
+ temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
+ temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
+ temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
+ temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
+ a_ptr += lda;
+ temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1;
+ temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0;
+ temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3;
+ temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2;
+#endif
+
+#if !defined(XCONJ)
+ y_ptr[0] += ar * temp_r - ai * temp_i;
+ y_ptr[1] += ar * temp_i + ai * temp_r;
+ y_ptr += inc_y;
+ y_ptr[0] += ar * temp_r1 - ai * temp_i1;
+ y_ptr[1] += ar * temp_i1 + ai * temp_r1;
+#else
+ y_ptr[0] += ar * temp_r + ai * temp_i;
+ y_ptr[1] -= ar * temp_i - ai * temp_r;
+ y_ptr += inc_y;
+ y_ptr[0] += ar * temp_r1 + ai * temp_i1;
+ y_ptr[1] -= ar * temp_i1 - ai * temp_r1;
+#endif
+
+ a_ptr += lda;
+ y_ptr += inc_y;
+ j+=2;
+ }
+
+
+ while ( j < n)
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
+ temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
+ temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
+ temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
+#else
+
+ temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
+ temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
+ temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
+ temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
+#endif
+
+#if !defined(XCONJ)
+ y_ptr[0] += ar * temp_r - ai * temp_i;
+ y_ptr[1] += ar * temp_i + ai * temp_r;
+#else
+ y_ptr[0] += ar * temp_r + ai * temp_i;
+ y_ptr[1] -= ar * temp_i - ai * temp_r;
+#endif
+
+ a_ptr += lda;
+ y_ptr += inc_y;
+ j++;
+ }
+
+ return(0);
+ }
+
+
+ if ( m3 == 1 )
+ {
+
+ FLOAT temp_r ;
+ FLOAT temp_i ;
+ FLOAT temp_r1 ;
+ FLOAT temp_i1 ;
+ FLOAT x0 = x_ptr[0];
+ FLOAT x1 = x_ptr[1];
+ FLOAT ar = alpha[0];
+ FLOAT ai = alpha[1];
+
+ while ( j < ( n & -2 ))
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
+ temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
+ a_ptr += lda;
+ temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1;
+ temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0;
+#else
+
+ temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
+ temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
+ a_ptr += lda;
+ temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1;
+ temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0;
+#endif
+
+#if !defined(XCONJ)
+ y_ptr[0] += ar * temp_r - ai * temp_i;
+ y_ptr[1] += ar * temp_i + ai * temp_r;
+ y_ptr += inc_y;
+ y_ptr[0] += ar * temp_r1 - ai * temp_i1;
+ y_ptr[1] += ar * temp_i1 + ai * temp_r1;
+#else
+ y_ptr[0] += ar * temp_r + ai * temp_i;
+ y_ptr[1] -= ar * temp_i - ai * temp_r;
+ y_ptr += inc_y;
+ y_ptr[0] += ar * temp_r1 + ai * temp_i1;
+ y_ptr[1] -= ar * temp_i1 - ai * temp_r1;
+#endif
+
+ a_ptr += lda;
+ y_ptr += inc_y;
+ j+=2;
+ }
+
+ while ( j < n)
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
+ temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
+#else
+
+ temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
+ temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
+#endif
+
+#if !defined(XCONJ)
+ y_ptr[0] += ar * temp_r - ai * temp_i;
+ y_ptr[1] += ar * temp_i + ai * temp_r;
+#else
+ y_ptr[0] += ar * temp_r + ai * temp_i;
+ y_ptr[1] -= ar * temp_i - ai * temp_r;
+#endif
+
+ a_ptr += lda;
+ y_ptr += inc_y;
+ j++;
+ }
+ return(0);
+ }
+
+ return(0);
+
+
+}
+
+
diff --git a/kernel/x86_64/cgemv_t_microk_haswell-2.c b/kernel/x86_64/cgemv_t_microk_haswell-2.c
deleted file mode 100644
index 0d79714af..000000000
--- a/kernel/x86_64/cgemv_t_microk_haswell-2.c
+++ /dev/null
@@ -1,171 +0,0 @@
-/***************************************************************************
-Copyright (c) 2014, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary froms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary from must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-#define HAVE_KERNEL_16x4 1
-static void cgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-
-static void cgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
-{
-
- BLASLONG register i = 0;
-
- __asm__ __volatile__
- (
- "vzeroupper \n\t"
-
- "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp
- "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp
- "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" // temp
- "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp
- "vxorps %%ymm12, %%ymm12, %%ymm12 \n\t" // temp
- "vxorps %%ymm13, %%ymm13, %%ymm13 \n\t"
- "vxorps %%ymm14, %%ymm14, %%ymm14 \n\t"
- "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t"
-
- ".align 16 \n\t"
- ".L01LOOP%=: \n\t"
- "prefetcht0 192(%4,%0,4) \n\t"
- "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
- "prefetcht0 192(%5,%0,4) \n\t"
- "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
-
- "prefetcht0 192(%2,%0,4) \n\t"
- "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
- "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
- "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
- "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
-
- "prefetcht0 192(%6,%0,4) \n\t"
- "vmovups (%6,%0,4), %%ymm6 \n\t" // 4 complex values from a2
- "prefetcht0 192(%7,%0,4) \n\t"
- "vmovups (%7,%0,4), %%ymm7 \n\t" // 4 complex values from a3
-
- "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
- "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
- "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
- "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
- "vfmadd231ps %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
- "vfmadd231ps %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
- "vfmadd231ps %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
- "vfmadd231ps %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
-
- "vmovups 32(%4,%0,4), %%ymm4 \n\t" // 2 complex values from a0
- "vmovups 32(%5,%0,4), %%ymm5 \n\t" // 2 complex values from a1
-
- "vmovups 32(%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
- "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
- "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
- "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
-
- "vmovups 32(%6,%0,4), %%ymm6 \n\t" // 2 complex values from a2
- "vmovups 32(%7,%0,4), %%ymm7 \n\t" // 2 complex values from a3
-
- "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
- "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
- "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
- "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
- "vfmadd231ps %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
- "vfmadd231ps %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
- "vfmadd231ps %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
- "vfmadd231ps %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
-
- "addq $16 , %0 \n\t"
- "subq $8 , %1 \n\t"
- "jnz .L01LOOP%= \n\t"
-
-#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
- "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
- "vpermilps $0xb1 , %%ymm11, %%ymm11 \n\t"
- "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
- "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t"
- "vaddsubps %%ymm9 , %%ymm8, %%ymm8 \n\t"
- "vaddsubps %%ymm11, %%ymm10, %%ymm10 \n\t"
- "vaddsubps %%ymm13, %%ymm12, %%ymm12 \n\t"
- "vaddsubps %%ymm15, %%ymm14, %%ymm14 \n\t"
-#else
- "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
- "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t"
- "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
- "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t"
- "vaddsubps %%ymm8 , %%ymm9 , %%ymm8 \n\t"
- "vaddsubps %%ymm10, %%ymm11, %%ymm10 \n\t"
- "vaddsubps %%ymm12, %%ymm13, %%ymm12 \n\t"
- "vaddsubps %%ymm14, %%ymm15, %%ymm14 \n\t"
- "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
- "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t"
- "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
- "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t"
-#endif
-
- "vextractf128 $1, %%ymm8 , %%xmm9 \n\t"
- "vextractf128 $1, %%ymm10, %%xmm11 \n\t"
- "vextractf128 $1, %%ymm12, %%xmm13 \n\t"
- "vextractf128 $1, %%ymm14, %%xmm15 \n\t"
-
- "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
- "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t"
- "vaddps %%xmm12, %%xmm13, %%xmm12 \n\t"
- "vaddps %%xmm14, %%xmm15, %%xmm14 \n\t"
-
- "vshufpd $0x1, %%xmm8 , %%xmm8 , %%xmm9 \n\t"
- "vshufpd $0x1, %%xmm10, %%xmm10, %%xmm11 \n\t"
- "vshufpd $0x1, %%xmm12, %%xmm12, %%xmm13 \n\t"
- "vshufpd $0x1, %%xmm14, %%xmm14, %%xmm15 \n\t"
-
- "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
- "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t"
- "vaddps %%xmm12, %%xmm13, %%xmm12 \n\t"
- "vaddps %%xmm14, %%xmm15, %%xmm14 \n\t"
-
- "vmovsd %%xmm8 , (%3) \n\t"
- "vmovsd %%xmm10, 8(%3) \n\t"
- "vmovsd %%xmm12, 16(%3) \n\t"
- "vmovsd %%xmm14, 24(%3) \n\t"
-
- "vzeroupper \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (n), // 1
- "r" (x), // 2
- "r" (y), // 3
- "r" (ap[0]), // 4
- "r" (ap[1]), // 5
- "r" (ap[2]), // 6
- "r" (ap[3]) // 7
- : "cc",
- "%xmm0", "%xmm1", "%xmm2", "%xmm3",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
- "memory"
- );
-
-}
-
-
diff --git a/kernel/x86_64/cgemv_t_microk_haswell-4.c b/kernel/x86_64/cgemv_t_microk_haswell-4.c
new file mode 100644
index 000000000..2c506c9e9
--- /dev/null
+++ b/kernel/x86_64/cgemv_t_microk_haswell-4.c
@@ -0,0 +1,539 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_4x4 1
+static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
+
+static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+
+ "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp
+ "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp
+ "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" // temp
+ "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp
+ "vxorps %%ymm12, %%ymm12, %%ymm12 \n\t" // temp
+ "vxorps %%ymm13, %%ymm13, %%ymm13 \n\t"
+ "vxorps %%ymm14, %%ymm14, %%ymm14 \n\t"
+ "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t"
+
+ "testq $0x04, %1 \n\t"
+ "jz .L08LABEL%= \n\t"
+
+ "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
+ "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
+
+ "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
+ "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
+ "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
+ "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
+
+ "vmovups (%6,%0,4), %%ymm6 \n\t" // 4 complex values from a2
+ "vmovups (%7,%0,4), %%ymm7 \n\t" // 4 complex values from a3
+
+ "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+ "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+ "vfmadd231ps %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231ps %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+ "vfmadd231ps %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231ps %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+
+ "addq $8 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+
+ ".L08LABEL%=: \n\t"
+ "cmpq $0, %1 \n\t"
+ "je .L08END%= \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ "prefetcht0 192(%4,%0,4) \n\t"
+ "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
+ "prefetcht0 192(%5,%0,4) \n\t"
+ "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
+
+ "prefetcht0 192(%2,%0,4) \n\t"
+ "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
+ "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
+ "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
+ "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
+
+ "prefetcht0 192(%6,%0,4) \n\t"
+ "vmovups (%6,%0,4), %%ymm6 \n\t" // 4 complex values from a2
+ "prefetcht0 192(%7,%0,4) \n\t"
+ "vmovups (%7,%0,4), %%ymm7 \n\t" // 4 complex values from a3
+
+ "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+ "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+ "vfmadd231ps %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231ps %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+ "vfmadd231ps %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231ps %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+
+ "vmovups 32(%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
+ "vmovups 32(%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
+
+ "vmovups 32(%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
+ "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
+ "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
+ "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
+
+ "vmovups 32(%6,%0,4), %%ymm6 \n\t" // 4 complex values from a2
+ "vmovups 32(%7,%0,4), %%ymm7 \n\t" // 4 complex values from a3
+
+ "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+ "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+ "vfmadd231ps %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231ps %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+ "vfmadd231ps %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231ps %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+
+ "addq $16 , %0 \n\t"
+ "subq $8 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ ".L08END%=: \n\t"
+
+ "vbroadcastss (%8) , %%xmm0 \n\t" // value from alpha
+ "vbroadcastss 4(%8) , %%xmm1 \n\t" // value from alpha
+
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
+ "vpermilps $0xb1 , %%ymm11, %%ymm11 \n\t"
+ "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
+ "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t"
+ "vaddsubps %%ymm9 , %%ymm8, %%ymm8 \n\t"
+ "vaddsubps %%ymm11, %%ymm10, %%ymm10 \n\t"
+ "vaddsubps %%ymm13, %%ymm12, %%ymm12 \n\t"
+ "vaddsubps %%ymm15, %%ymm14, %%ymm14 \n\t"
+#else
+ "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t"
+ "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
+ "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t"
+ "vaddsubps %%ymm8 , %%ymm9 , %%ymm8 \n\t"
+ "vaddsubps %%ymm10, %%ymm11, %%ymm10 \n\t"
+ "vaddsubps %%ymm12, %%ymm13, %%ymm12 \n\t"
+ "vaddsubps %%ymm14, %%ymm15, %%ymm14 \n\t"
+ "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t"
+ "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
+ "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t"
+#endif
+
+ "vmovsd (%3), %%xmm4 \n\t" // read y
+ "vmovsd 8(%3), %%xmm5 \n\t"
+ "vmovsd 16(%3), %%xmm6 \n\t"
+ "vmovsd 24(%3), %%xmm7 \n\t"
+
+ "vextractf128 $1, %%ymm8 , %%xmm9 \n\t"
+ "vextractf128 $1, %%ymm10, %%xmm11 \n\t"
+ "vextractf128 $1, %%ymm12, %%xmm13 \n\t"
+ "vextractf128 $1, %%ymm14, %%xmm15 \n\t"
+
+ "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
+ "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t"
+ "vaddps %%xmm12, %%xmm13, %%xmm12 \n\t"
+ "vaddps %%xmm14, %%xmm15, %%xmm14 \n\t"
+
+ "vshufpd $0x1, %%xmm8 , %%xmm8 , %%xmm9 \n\t"
+ "vshufpd $0x1, %%xmm10, %%xmm10, %%xmm11 \n\t"
+ "vshufpd $0x1, %%xmm12, %%xmm12, %%xmm13 \n\t"
+ "vshufpd $0x1, %%xmm14, %%xmm14, %%xmm15 \n\t"
+
+ "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
+ "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t"
+ "vaddps %%xmm12, %%xmm13, %%xmm12 \n\t"
+ "vaddps %%xmm14, %%xmm15, %%xmm14 \n\t"
+
+
+ "vmulps %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i
+ "vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r
+ "vmulps %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i
+ "vmulps %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r
+ "vmulps %%xmm12, %%xmm1 , %%xmm13 \n\t" // t_r * alpha_i , t_i * alpha_i
+ "vmulps %%xmm12, %%xmm0 , %%xmm12 \n\t" // t_r * alpha_r , t_i * alpha_r
+ "vmulps %%xmm14, %%xmm1 , %%xmm15 \n\t" // t_r * alpha_i , t_i * alpha_i
+ "vmulps %%xmm14, %%xmm0 , %%xmm14 \n\t" // t_r * alpha_r , t_i * alpha_r
+
+#if !defined(XCONJ)
+ "vpermilps $0xb1 , %%xmm9 , %%xmm9 \n\t"
+ "vpermilps $0xb1 , %%xmm11, %%xmm11 \n\t"
+ "vpermilps $0xb1 , %%xmm13, %%xmm13 \n\t"
+ "vpermilps $0xb1 , %%xmm15, %%xmm15 \n\t"
+ "vaddsubps %%xmm9 , %%xmm8, %%xmm8 \n\t"
+ "vaddsubps %%xmm11, %%xmm10, %%xmm10 \n\t"
+ "vaddsubps %%xmm13, %%xmm12, %%xmm12 \n\t"
+ "vaddsubps %%xmm15, %%xmm14, %%xmm14 \n\t"
+#else
+ "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t"
+ "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t"
+ "vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t"
+ "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t"
+ "vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
+ "vaddsubps %%xmm10, %%xmm11, %%xmm10 \n\t"
+ "vaddsubps %%xmm12, %%xmm13, %%xmm12 \n\t"
+ "vaddsubps %%xmm14, %%xmm15, %%xmm14 \n\t"
+ "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t"
+ "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t"
+ "vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t"
+ "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t"
+#endif
+
+
+ "vaddps %%xmm8 , %%xmm4 , %%xmm8 \n\t"
+ "vaddps %%xmm10, %%xmm5 , %%xmm10 \n\t"
+ "vaddps %%xmm12, %%xmm6 , %%xmm12 \n\t"
+ "vaddps %%xmm14, %%xmm7 , %%xmm14 \n\t"
+
+ "vmovsd %%xmm8 , (%3) \n\t"
+ "vmovsd %%xmm10, 8(%3) \n\t"
+ "vmovsd %%xmm12, 16(%3) \n\t"
+ "vmovsd %%xmm14, 24(%3) \n\t"
+
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (ap[2]), // 6
+ "r" (ap[3]), // 7
+ "r" (alpha) // 8
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
+#define HAVE_KERNEL_4x2 1
+static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
+
+static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+
+ "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp
+ "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp
+ "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" // temp
+ "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp
+
+ "testq $0x04, %1 \n\t"
+ "jz .L08LABEL%= \n\t"
+
+ "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
+ "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
+
+ "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
+ "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
+ "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
+ "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
+
+
+ "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+ "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+
+ "addq $8 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+
+ ".L08LABEL%=: \n\t"
+ "cmpq $0, %1 \n\t"
+ "je .L08END%= \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ "prefetcht0 192(%4,%0,4) \n\t"
+ "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
+ "prefetcht0 192(%5,%0,4) \n\t"
+ "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
+
+ "prefetcht0 192(%2,%0,4) \n\t"
+ "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
+ "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
+ "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
+ "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
+
+ "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+ "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+
+ "vmovups 32(%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
+ "vmovups 32(%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
+
+ "vmovups 32(%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
+ "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
+ "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
+ "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
+
+ "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+ "vfmadd231ps %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231ps %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+
+ "addq $16 , %0 \n\t"
+ "subq $8 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ ".L08END%=: \n\t"
+
+ "vbroadcastss (%6) , %%xmm0 \n\t" // value from alpha
+ "vbroadcastss 4(%6) , %%xmm1 \n\t" // value from alpha
+
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
+ "vpermilps $0xb1 , %%ymm11, %%ymm11 \n\t"
+ "vaddsubps %%ymm9 , %%ymm8, %%ymm8 \n\t"
+ "vaddsubps %%ymm11, %%ymm10, %%ymm10 \n\t"
+#else
+ "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t"
+ "vaddsubps %%ymm8 , %%ymm9 , %%ymm8 \n\t"
+ "vaddsubps %%ymm10, %%ymm11, %%ymm10 \n\t"
+ "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilps $0xb1 , %%ymm10, %%ymm10 \n\t"
+#endif
+
+ "vmovsd (%3), %%xmm4 \n\t" // read y
+ "vmovsd 8(%3), %%xmm5 \n\t"
+
+ "vextractf128 $1, %%ymm8 , %%xmm9 \n\t"
+ "vextractf128 $1, %%ymm10, %%xmm11 \n\t"
+
+ "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
+ "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t"
+
+ "vshufpd $0x1, %%xmm8 , %%xmm8 , %%xmm9 \n\t"
+ "vshufpd $0x1, %%xmm10, %%xmm10, %%xmm11 \n\t"
+
+ "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
+ "vaddps %%xmm10, %%xmm11, %%xmm10 \n\t"
+
+ "vmulps %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i
+ "vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r
+ "vmulps %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i
+ "vmulps %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r
+
+#if !defined(XCONJ)
+ "vpermilps $0xb1 , %%xmm9 , %%xmm9 \n\t"
+ "vpermilps $0xb1 , %%xmm11, %%xmm11 \n\t"
+ "vaddsubps %%xmm9 , %%xmm8, %%xmm8 \n\t"
+ "vaddsubps %%xmm11, %%xmm10, %%xmm10 \n\t"
+#else
+ "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t"
+ "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t"
+ "vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
+ "vaddsubps %%xmm10, %%xmm11, %%xmm10 \n\t"
+ "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t"
+ "vpermilps $0xb1 , %%xmm10, %%xmm10 \n\t"
+#endif
+
+
+ "vaddps %%xmm8 , %%xmm4 , %%xmm8 \n\t"
+ "vaddps %%xmm10, %%xmm5 , %%xmm10 \n\t"
+
+ "vmovsd %%xmm8 , (%3) \n\t"
+ "vmovsd %%xmm10, 8(%3) \n\t"
+
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (alpha) // 6
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
+#define HAVE_KERNEL_4x1 1
+static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
+
+static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+
+ "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp
+ "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp
+
+ "testq $0x04, %1 \n\t"
+ "jz .L08LABEL%= \n\t"
+
+ "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
+
+ "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
+ "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
+ "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
+ "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
+
+
+ "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+
+ "addq $8 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+
+ ".L08LABEL%=: \n\t"
+ "cmpq $0, %1 \n\t"
+ "je .L08END%= \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ "prefetcht0 192(%4,%0,4) \n\t"
+ "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
+
+ "prefetcht0 192(%2,%0,4) \n\t"
+ "vmovups (%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
+ "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
+ "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
+ "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
+
+ "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+
+ "vmovups 32(%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
+
+ "vmovups 32(%2,%0,4) , %%ymm6 \n\t" // 4 complex values from x
+ "vpermilps $0xb1, %%ymm6, %%ymm7 \n\t" // exchange real and imap parts
+ "vblendps $0x55, %%ymm6, %%ymm7, %%ymm0 \n\t" // only the real parts
+ "vblendps $0x55, %%ymm7, %%ymm6, %%ymm1 \n\t" // only the imag parts
+
+ "vfmadd231ps %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231ps %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+
+ "addq $16 , %0 \n\t"
+ "subq $8 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ ".L08END%=: \n\t"
+
+ "vbroadcastss (%5) , %%xmm0 \n\t" // value from alpha
+ "vbroadcastss 4(%5) , %%xmm1 \n\t" // value from alpha
+
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
+ "vaddsubps %%ymm9 , %%ymm8, %%ymm8 \n\t"
+#else
+ "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
+ "vaddsubps %%ymm8 , %%ymm9 , %%ymm8 \n\t"
+ "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
+#endif
+
+ "vmovsd (%3), %%xmm4 \n\t" // read y
+
+ "vextractf128 $1, %%ymm8 , %%xmm9 \n\t"
+
+ "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
+
+ "vshufpd $0x1, %%xmm8 , %%xmm8 , %%xmm9 \n\t"
+
+ "vaddps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
+
+ "vmulps %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i
+ "vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r
+
+#if !defined(XCONJ)
+ "vpermilps $0xb1 , %%xmm9 , %%xmm9 \n\t"
+ "vaddsubps %%xmm9 , %%xmm8, %%xmm8 \n\t"
+#else
+ "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t"
+ "vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t"
+ "vpermilps $0xb1 , %%xmm8 , %%xmm8 \n\t"
+#endif
+
+
+ "vaddps %%xmm8 , %%xmm4 , %%xmm8 \n\t"
+
+ "vmovsd %%xmm8 , (%3) \n\t"
+
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap), // 4
+ "r" (alpha) // 5
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c
new file mode 100644
index 000000000..f1d50c909
--- /dev/null
+++ b/kernel/x86_64/daxpy.c
@@ -0,0 +1,105 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+
+#if defined(NEHALEM)
+#include "daxpy_microk_nehalem-2.c"
+#elif defined(BULLDOZER)
+#include "daxpy_microk_bulldozer-2.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_8
+
+static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+ BLASLONG register i = 0;
+ FLOAT a = *alpha;
+
+ while(i < n)
+ {
+ y[i] += a * x[i];
+ y[i+1] += a * x[i+1];
+ y[i+2] += a * x[i+2];
+ y[i+3] += a * x[i+3];
+ y[i+4] += a * x[i+4];
+ y[i+5] += a * x[i+5];
+ y[i+6] += a * x[i+6];
+ y[i+7] += a * x[i+7];
+ i+=8 ;
+
+ }
+
+}
+
+#endif
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+ BLASLONG i=0;
+ BLASLONG ix=0,iy=0;
+
+ if ( n <= 0 ) return(0);
+
+ if ( (inc_x == 1) && (inc_y == 1) )
+ {
+
+ int n1 = n & -8;
+
+ if ( n1 )
+ daxpy_kernel_8(n1, x, y , &da );
+
+ i = n1;
+ while(i < n)
+ {
+
+ y[i] += da * x[i] ;
+ i++ ;
+
+ }
+ return(0);
+
+
+ }
+
+ while(i < n)
+ {
+
+ y[iy] += da * x[ix] ;
+ ix += inc_x ;
+ iy += inc_y ;
+ i++ ;
+
+ }
+ return(0);
+
+}
+
+
diff --git a/kernel/x86_64/daxpy_microk_bulldozer-2.c b/kernel/x86_64/daxpy_microk_bulldozer-2.c
new file mode 100644
index 000000000..b1ef84a18
--- /dev/null
+++ b/kernel/x86_64/daxpy_microk_bulldozer-2.c
@@ -0,0 +1,82 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_8 1
+static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
+
+static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vmovddup (%4), %%xmm0 \n\t" // alpha
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+
+ "prefetcht0 768(%3,%0,8) \n\t"
+ "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x
+ "vfmaddpd (%3,%0,8), %%xmm0 , %%xmm12, %%xmm8 \n\t" // y += alpha * x
+ "vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x
+ ".align 2 \n\t"
+ "vmovups %%xmm8 , (%3,%0,8) \n\t"
+ "vfmaddpd 16(%3,%0,8), %%xmm0 , %%xmm13, %%xmm9 \n\t" // y += alpha * x
+ ".align 2 \n\t"
+ "vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x
+ "vmovups %%xmm9 , 16(%3,%0,8) \n\t"
+ "prefetcht0 768(%2,%0,8) \n\t"
+ ".align 2 \n\t"
+ "vfmaddpd 32(%3,%0,8), %%xmm0 , %%xmm14, %%xmm10 \n\t" // y += alpha * x
+ "vmovups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x
+ "vmovups %%xmm10, 32(%3,%0,8) \n\t"
+ "vfmaddpd 48(%3,%0,8), %%xmm0 , %%xmm15, %%xmm11 \n\t" // y += alpha * x
+ "vmovups %%xmm11, 48(%3,%0,8) \n\t"
+
+ "addq $8 , %0 \n\t"
+ "subq $8 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (alpha) // 4
+ : "cc",
+ "%xmm0",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
diff --git a/kernel/x86_64/daxpy_microk_nehalem-2.c b/kernel/x86_64/daxpy_microk_nehalem-2.c
new file mode 100644
index 000000000..32ed1857c
--- /dev/null
+++ b/kernel/x86_64/daxpy_microk_nehalem-2.c
@@ -0,0 +1,91 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_8 1
+static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
+
+static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "movsd (%4), %%xmm0 \n\t" // alpha
+ "shufpd $0, %%xmm0, %%xmm0 \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ // "prefetcht0 192(%2,%0,8) \n\t"
+ // "prefetcht0 192(%3,%0,8) \n\t"
+
+ "movups (%2,%0,8), %%xmm12 \n\t" // 2 * x
+ "movups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x
+ "movups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x
+ "movups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x
+ "movups (%3,%0,8), %%xmm8 \n\t" // 2 * y
+ "movups 16(%3,%0,8), %%xmm9 \n\t" // 2 * y
+ "movups 32(%3,%0,8), %%xmm10 \n\t" // 2 * y
+ "movups 48(%3,%0,8), %%xmm11 \n\t" // 2 * y
+
+ "mulpd %%xmm0 , %%xmm12 \n\t" // alpha * x
+ "mulpd %%xmm0 , %%xmm13 \n\t"
+ "mulpd %%xmm0 , %%xmm14 \n\t"
+ "mulpd %%xmm0 , %%xmm15 \n\t"
+
+ "addpd %%xmm12, %%xmm8 \n\t" // y += alpha *x
+ "addpd %%xmm13, %%xmm9 \n\t"
+ "addpd %%xmm14, %%xmm10 \n\t"
+ "addpd %%xmm15, %%xmm11 \n\t"
+
+ "movups %%xmm8 , (%3,%0,8) \n\t"
+ "movups %%xmm9 , 16(%3,%0,8) \n\t"
+ "movups %%xmm10, 32(%3,%0,8) \n\t"
+ "movups %%xmm11, 48(%3,%0,8) \n\t"
+
+ "addq $8 , %0 \n\t"
+ "subq $8 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (alpha) // 4
+ : "cc",
+ "%xmm0",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c
new file mode 100644
index 000000000..b3aad438f
--- /dev/null
+++ b/kernel/x86_64/ddot.c
@@ -0,0 +1,110 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+
+#if defined(BULLDOZER) || defined(PILEDRIVER)
+#include "ddot_microk_bulldozer-2.c"
+#elif defined(NEHALEM)
+#include "ddot_microk_nehalem-2.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_8
+
+static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
+{
+ BLASLONG register i = 0;
+ FLOAT dot = 0.0;
+
+ while(i < n)
+ {
+ dot += y[i] * x[i]
+ + y[i+1] * x[i+1]
+ + y[i+2] * x[i+2]
+ + y[i+3] * x[i+3]
+ + y[i+4] * x[i+4]
+ + y[i+5] * x[i+5]
+ + y[i+6] * x[i+6]
+ + y[i+7] * x[i+7] ;
+
+ i+=8 ;
+
+ }
+ *d += dot;
+
+}
+
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+ BLASLONG i=0;
+ BLASLONG ix=0,iy=0;
+
+ FLOAT dot = 0.0 ;
+
+ if ( n <= 0 ) return(dot);
+
+ if ( (inc_x == 1) && (inc_y == 1) )
+ {
+
+ int n1 = n & -8;
+
+ if ( n1 )
+ ddot_kernel_8(n1, x, y , &dot );
+
+
+ i = n1;
+ while(i < n)
+ {
+
+ dot += y[i] * x[i] ;
+ i++ ;
+
+ }
+ return(dot);
+
+
+ }
+
+ while(i < n)
+ {
+
+ dot += y[iy] * x[ix] ;
+ ix += inc_x ;
+ iy += inc_y ;
+ i++ ;
+
+ }
+ return(dot);
+
+}
+
+
diff --git a/kernel/x86_64/sgemv_n_microk_haswell-2.c b/kernel/x86_64/ddot_microk_bulldozer-2.c
index 19888d150..0c77b6349 100644
--- a/kernel/x86_64/sgemv_n_microk_haswell-2.c
+++ b/kernel/x86_64/ddot_microk_bulldozer-2.c
@@ -25,47 +25,45 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#define HAVE_KERNEL_16x4 1
-static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+#define HAVE_KERNEL_8 1
+static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
-static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
+static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
{
+
BLASLONG register i = 0;
__asm__ __volatile__
(
- "vzeroupper \n\t"
- "vbroadcastss (%2), %%ymm12 \n\t" // x0
- "vbroadcastss 4(%2), %%ymm13 \n\t" // x1
- "vbroadcastss 8(%2), %%ymm14 \n\t" // x2
- "vbroadcastss 12(%2), %%ymm15 \n\t" // x3
+ "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t"
+ "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t"
+ "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t"
+ "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x
+ "vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x
+ "vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x
+ "vmovups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x
+
+ "vfmaddpd %%xmm4, (%3,%0,8), %%xmm12, %%xmm4 \n\t" // 2 * y
+ "vfmaddpd %%xmm5, 16(%3,%0,8), %%xmm13, %%xmm5 \n\t" // 2 * y
+ "vfmaddpd %%xmm6, 32(%3,%0,8), %%xmm14, %%xmm6 \n\t" // 2 * y
+ "vfmaddpd %%xmm7, 48(%3,%0,8), %%xmm15, %%xmm7 \n\t" // 2 * y
- ".align 16 \n\t"
- ".L01LOOP%=: \n\t"
- "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y
- "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y
+ "addq $8 , %0 \n\t"
+ "subq $8 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
- "prefetcht0 192(%4,%0,4) \n\t"
- "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
- "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t"
- "prefetcht0 192(%5,%0,4) \n\t"
- "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t"
- "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t"
- "prefetcht0 192(%6,%0,4) \n\t"
- "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
- "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t"
- "prefetcht0 192(%7,%0,4) \n\t"
- "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t"
- "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t"
+ "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t"
+ "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t"
+ "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t"
- "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y
- "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y
+ "vhaddpd %%xmm4, %%xmm4, %%xmm4 \n\t"
- "addq $16, %0 \n\t"
- "subq $16, %1 \n\t"
- "jnz .L01LOOP%= \n\t"
- "vzeroupper \n\t"
+ "vmovsd %%xmm4, (%4) \n\t"
:
:
@@ -73,12 +71,10 @@ static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
- "r" (ap[0]), // 4
- "r" (ap[1]), // 5
- "r" (ap[2]), // 6
- "r" (ap[3]) // 7
+ "r" (dot) // 4
: "cc",
"%xmm4", "%xmm5",
+ "%xmm6", "%xmm7",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
diff --git a/kernel/x86_64/ddot_microk_nehalem-2.c b/kernel/x86_64/ddot_microk_nehalem-2.c
new file mode 100644
index 000000000..dd05053f7
--- /dev/null
+++ b/kernel/x86_64/ddot_microk_nehalem-2.c
@@ -0,0 +1,94 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_8 1
+static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
+
+static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
+{
+
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "xorpd %%xmm4, %%xmm4 \n\t"
+ "xorpd %%xmm5, %%xmm5 \n\t"
+ "xorpd %%xmm6, %%xmm6 \n\t"
+ "xorpd %%xmm7, %%xmm7 \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+
+ "movups (%2,%0,8), %%xmm12 \n\t" // 2 * x
+ "movups (%3,%0,8), %%xmm8 \n\t" // 2 * y
+ "movups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x
+ "movups 16(%3,%0,8), %%xmm9 \n\t" // 2 * y
+ "movups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x
+ "movups 32(%3,%0,8), %%xmm10 \n\t" // 2 * y
+ "movups 48(%2,%0,8), %%xmm15 \n\t" // 2 * x
+ "movups 48(%3,%0,8), %%xmm11 \n\t" // 2 * y
+
+ "mulpd %%xmm8 , %%xmm12 \n\t"
+ "mulpd %%xmm9 , %%xmm13 \n\t"
+ "mulpd %%xmm10, %%xmm14 \n\t"
+ "mulpd %%xmm11, %%xmm15 \n\t"
+
+ "addpd %%xmm12, %%xmm4 \n\t"
+ "addpd %%xmm13, %%xmm5 \n\t"
+ "addpd %%xmm14, %%xmm6 \n\t"
+ "addpd %%xmm15, %%xmm7 \n\t"
+
+ "addq $8 , %0 \n\t"
+ "subq $8 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ "addpd %%xmm5, %%xmm4 \n\t"
+ "addpd %%xmm7, %%xmm6 \n\t"
+ "addpd %%xmm6, %%xmm4 \n\t"
+
+ "haddpd %%xmm4, %%xmm4 \n\t"
+
+ "movsd %%xmm4, (%4) \n\t"
+
+	:
+        "+r" (i),	// 0
+	 "+r" (n)  	// 1
+	:
+        "r" (x),      // 2
+        "r" (y),      // 3
+        "r" (dot)     // 4
+ : "cc",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
diff --git a/kernel/x86_64/dgemv_n.c b/kernel/x86_64/dgemv_n.c
deleted file mode 100644
index 5d826dc63..000000000
--- a/kernel/x86_64/dgemv_n.c
+++ /dev/null
@@ -1,206 +0,0 @@
-/***************************************************************************
-Copyright (c) 2014, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-
-#include "common.h"
-
-
-#if defined(HASWELL)
-#include "dgemv_n_microk_haswell-2.c"
-#endif
-
-
-#define NBMAX 2048
-
-#ifndef HAVE_KERNEL_16x4
-
-static void dgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
-{
- BLASLONG i;
- FLOAT *a0,*a1,*a2,*a3;
- a0 = ap[0];
- a1 = ap[1];
- a2 = ap[2];
- a3 = ap[3];
-
- for ( i=0; i< n; i+=4 )
- {
- y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3];
- y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3];
- y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3];
- y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3];
- }
-}
-
-#endif
-
-static void dgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
-{
- BLASLONG i;
- FLOAT *a0;
- a0 = ap;
-
- for ( i=0; i< n; i+=4 )
- {
- y[i] += a0[i]*x[0];
- y[i+1] += a0[i+1]*x[0];
- y[i+2] += a0[i+2]*x[0];
- y[i+3] += a0[i+3]*x[0];
- }
-}
-
-
-static void zero_y(BLASLONG n, FLOAT *dest)
-{
- BLASLONG i;
- for ( i=0; i<n; i++ )
- {
- *dest = 0.0;
- dest++;
- }
-}
-
-
-
-static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
-{
- BLASLONG i;
- if ( inc_dest == 1 )
- {
- for ( i=0; i<n; i+=4 )
- {
- dest[i] += src[i];
- dest[i+1] += src[i+1];
- dest[i+2] += src[i+2];
- dest[i+3] += src[i+3];
- }
-
- }
- else
- {
- for ( i=0; i<n; i++ )
- {
- *dest += *src;
- src++;
- dest += inc_dest;
- }
- }
-}
-
-int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
-{
- BLASLONG i;
- BLASLONG j;
- FLOAT *a_ptr;
- FLOAT *x_ptr;
- FLOAT *y_ptr;
- FLOAT *ap[4];
- BLASLONG n1;
- BLASLONG m1;
- BLASLONG m2;
- BLASLONG n2;
- FLOAT xbuffer[4],*ybuffer;
-
- if ( m < 1 ) return(0);
- if ( n < 1 ) return(0);
-
- ybuffer = buffer;
-
- n1 = n / 4 ;
- n2 = n % 4 ;
-
- m1 = m - ( m % 16 );
- m2 = (m % NBMAX) - (m % 16) ;
-
- y_ptr = y;
-
- BLASLONG NB = NBMAX;
-
- while ( NB == NBMAX )
- {
-
- m1 -= NB;
- if ( m1 < 0)
- {
- if ( m2 == 0 ) break;
- NB = m2;
- }
-
- a_ptr = a;
- x_ptr = x;
- zero_y(NB,ybuffer);
- for( i = 0; i < n1 ; i++)
- {
- xbuffer[0] = alpha * x_ptr[0];
- x_ptr += inc_x;
- xbuffer[1] = alpha * x_ptr[0];
- x_ptr += inc_x;
- xbuffer[2] = alpha * x_ptr[0];
- x_ptr += inc_x;
- xbuffer[3] = alpha * x_ptr[0];
- x_ptr += inc_x;
- ap[0] = a_ptr;
- ap[1] = a_ptr + lda;
- ap[2] = ap[1] + lda;
- ap[3] = ap[2] + lda;
- dgemv_kernel_16x4(NB,ap,xbuffer,ybuffer);
- a_ptr += 4 * lda;
- }
-
- for( i = 0; i < n2 ; i++)
- {
- xbuffer[0] = alpha * x_ptr[0];
- x_ptr += inc_x;
- dgemv_kernel_16x1(NB,a_ptr,xbuffer,ybuffer);
- a_ptr += 1 * lda;
-
- }
- add_y(NB,ybuffer,y_ptr,inc_y);
- a += NB;
- y_ptr += NB * inc_y;
- }
- j=0;
- while ( j < (m % 16))
- {
- a_ptr = a;
- x_ptr = x;
- FLOAT temp = 0.0;
- for( i = 0; i < n; i++ )
- {
- temp += a_ptr[0] * x_ptr[0];
- a_ptr += lda;
- x_ptr += inc_x;
- }
- y_ptr[0] += alpha * temp;
- y_ptr += inc_y;
- a++;
- j++;
- }
- return(0);
-}
-
-
diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c
new file mode 100644
index 000000000..371fd73ee
--- /dev/null
+++ b/kernel/x86_64/dgemv_n_4.c
@@ -0,0 +1,548 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+
+#if defined(NEHALEM)
+#include "dgemv_n_microk_nehalem-4.c"
+#elif defined(HASWELL)
+#include "dgemv_n_microk_haswell-4.c"
+#endif
+
+
+#define NBMAX 2048
+
+#ifndef HAVE_KERNEL_4x8
+
+static void dgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
+{
+ BLASLONG i;
+ FLOAT *a0,*a1,*a2,*a3;
+ FLOAT *b0,*b1,*b2,*b3;
+ FLOAT *x4;
+ FLOAT x[8];
+ a0 = ap[0];
+ a1 = ap[1];
+ a2 = ap[2];
+ a3 = ap[3];
+ b0 = a0 + lda4 ;
+ b1 = a1 + lda4 ;
+ b2 = a2 + lda4 ;
+ b3 = a3 + lda4 ;
+ x4 = x + 4;
+
+ for ( i=0; i<8; i++)
+ x[i] = xo[i] * *alpha;
+
+ for ( i=0; i< n; i+=4 )
+ {
+
+ y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3];
+ y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3];
+ y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3];
+ y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3];
+
+ y[i] += b0[i]*x4[0] + b1[i]*x4[1] + b2[i]*x4[2] + b3[i]*x4[3];
+ y[i+1] += b0[i+1]*x4[0] + b1[i+1]*x4[1] + b2[i+1]*x4[2] + b3[i+1]*x4[3];
+ y[i+2] += b0[i+2]*x4[0] + b1[i+2]*x4[1] + b2[i+2]*x4[2] + b3[i+2]*x4[3];
+ y[i+3] += b0[i+3]*x4[0] + b1[i+3]*x4[1] + b2[i+3]*x4[2] + b3[i+3]*x4[3];
+
+ }
+}
+
+#endif
+
+
+#ifndef HAVE_KERNEL_4x4
+
+static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
+{
+ BLASLONG i;
+ FLOAT *a0,*a1,*a2,*a3;
+ FLOAT x[4];
+ a0 = ap[0];
+ a1 = ap[1];
+ a2 = ap[2];
+ a3 = ap[3];
+
+ for ( i=0; i<4; i++)
+ x[i] = xo[i] * *alpha;
+
+ for ( i=0; i< n; i+=4 )
+ {
+ y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3];
+ y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3];
+ y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3];
+ y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3];
+ }
+}
+
+#endif
+
+#ifndef HAVE_KERNEL_4x2
+
+static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
+
+static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "movsd (%2) , %%xmm12 \n\t" // x0
+ "movsd (%6) , %%xmm4 \n\t" // alpha
+ "movsd 8(%2) , %%xmm13 \n\t" // x1
+ "mulsd %%xmm4 , %%xmm12 \n\t" // alpha
+ "mulsd %%xmm4 , %%xmm13 \n\t" // alpha
+ "shufpd $0, %%xmm12, %%xmm12 \n\t"
+ "shufpd $0, %%xmm13, %%xmm13 \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y
+ "movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y
+
+ "movups (%4,%0,8), %%xmm8 \n\t"
+ "movups (%5,%0,8), %%xmm9 \n\t"
+ "mulpd %%xmm12, %%xmm8 \n\t"
+ "mulpd %%xmm13, %%xmm9 \n\t"
+ "addpd %%xmm8 , %%xmm4 \n\t"
+ "addpd %%xmm9 , %%xmm4 \n\t"
+
+ "movups 16(%4,%0,8), %%xmm8 \n\t"
+ "movups 16(%5,%0,8), %%xmm9 \n\t"
+ "mulpd %%xmm12, %%xmm8 \n\t"
+ "mulpd %%xmm13, %%xmm9 \n\t"
+ "addpd %%xmm8 , %%xmm5 \n\t"
+ "addpd %%xmm9 , %%xmm5 \n\t"
+
+ "movups %%xmm4 , (%3,%0,8) \n\t" // 2 * y
+ "movups %%xmm5 , 16(%3,%0,8) \n\t" // 2 * y
+
+ "addq $4 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+	:
+        "+r" (i),	// 0
+	 "+r" (n)  	// 1
+	:
+        "r" (x),      // 2
+        "r" (y),      // 3
+        "r" (ap[0]),  // 4
+        "r" (ap[1]),  // 5
+        "r" (alpha)   // 6
+ : "cc",
+ "%xmm4", "%xmm5",
+ "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+#endif
+
+#ifndef HAVE_KERNEL_4x1
+
+static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
+
+static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "movsd (%2), %%xmm12 \n\t" // x0
+ "mulsd (%5), %%xmm12 \n\t" // alpha
+ "shufpd $0, %%xmm12, %%xmm12 \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ "movups (%4,%0,8), %%xmm8 \n\t" // 2 * a
+ "movups 16(%4,%0,8), %%xmm9 \n\t" // 2 * a
+ "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y
+ "movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y
+ "mulpd %%xmm12, %%xmm8 \n\t"
+ "mulpd %%xmm12, %%xmm9 \n\t"
+ "addpd %%xmm8 , %%xmm4 \n\t"
+ "addpd %%xmm9 , %%xmm5 \n\t"
+
+ "movups %%xmm4 , (%3,%0,8) \n\t" // 2 * y
+ "movups %%xmm5 , 16(%3,%0,8) \n\t" // 2 * y
+
+ "addq $4 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+
+ "jnz .L01LOOP%= \n\t"
+
+	:
+        "+r" (i),	// 0
+	 "+r" (n)  	// 1
+	:
+        "r" (x),      // 2
+        "r" (y),      // 3
+        "r" (ap),     // 4
+        "r" (alpha)   // 5
+ : "cc",
+ "%xmm4", "%xmm5",
+ "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+#endif
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline));
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
+{
+ BLASLONG i;
+ if ( inc_dest != 1 )
+ {
+ for ( i=0; i<n; i++ )
+ {
+ *dest += *src;
+ src++;
+ dest += inc_dest;
+ }
+ return;
+ }
+
+}
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+ BLASLONG i;
+ BLASLONG j;
+ FLOAT *a_ptr;
+ FLOAT *x_ptr;
+ FLOAT *y_ptr;
+ FLOAT *ap[4];
+ BLASLONG n1;
+ BLASLONG m1;
+ BLASLONG m2;
+ BLASLONG m3;
+ BLASLONG n2;
+ BLASLONG lda4 = lda << 2;
+ BLASLONG lda8 = lda << 3;
+ FLOAT xbuffer[8],*ybuffer;
+
+ if ( m < 1 ) return(0);
+ if ( n < 1 ) return(0);
+
+ ybuffer = buffer;
+
+ if ( inc_x == 1 )
+ {
+ n1 = n >> 3 ;
+ n2 = n & 7 ;
+ }
+ else
+ {
+ n1 = n >> 2 ;
+ n2 = n & 3 ;
+
+ }
+
+ m3 = m & 3 ;
+ m1 = m & -4 ;
+ m2 = (m & (NBMAX-1)) - m3 ;
+
+
+ y_ptr = y;
+
+ BLASLONG NB = NBMAX;
+
+ while ( NB == NBMAX )
+ {
+
+ m1 -= NB;
+ if ( m1 < 0)
+ {
+ if ( m2 == 0 ) break;
+ NB = m2;
+ }
+
+ a_ptr = a;
+ x_ptr = x;
+
+ ap[0] = a_ptr;
+ ap[1] = a_ptr + lda;
+ ap[2] = ap[1] + lda;
+ ap[3] = ap[2] + lda;
+
+ if ( inc_y != 1 )
+ memset(ybuffer,0,NB*8);
+ else
+ ybuffer = y_ptr;
+
+ if ( inc_x == 1 )
+ {
+
+
+ for( i = 0; i < n1 ; i++)
+ {
+ dgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha);
+ ap[0] += lda8;
+ ap[1] += lda8;
+ ap[2] += lda8;
+ ap[3] += lda8;
+ a_ptr += lda8;
+ x_ptr += 8;
+ }
+
+
+ if ( n2 & 4 )
+ {
+ dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha);
+ ap[0] += lda4;
+ ap[1] += lda4;
+ a_ptr += lda4;
+ x_ptr += 4;
+ }
+
+ if ( n2 & 2 )
+ {
+ dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha);
+ a_ptr += lda*2;
+ x_ptr += 2;
+ }
+
+
+ if ( n2 & 1 )
+ {
+ dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha);
+ a_ptr += lda;
+ x_ptr += 1;
+
+ }
+
+
+ }
+ else
+ {
+
+ for( i = 0; i < n1 ; i++)
+ {
+ xbuffer[0] = x_ptr[0];
+ x_ptr += inc_x;
+ xbuffer[1] = x_ptr[0];
+ x_ptr += inc_x;
+ xbuffer[2] = x_ptr[0];
+ x_ptr += inc_x;
+ xbuffer[3] = x_ptr[0];
+ x_ptr += inc_x;
+ dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha);
+ ap[0] += lda4;
+ ap[1] += lda4;
+ ap[2] += lda4;
+ ap[3] += lda4;
+ a_ptr += lda4;
+ }
+
+ for( i = 0; i < n2 ; i++)
+ {
+ xbuffer[0] = x_ptr[0];
+ x_ptr += inc_x;
+ dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha);
+ a_ptr += lda;
+
+ }
+
+ }
+
+ a += NB;
+ if ( inc_y != 1 )
+ {
+ add_y(NB,ybuffer,y_ptr,inc_y);
+ y_ptr += NB * inc_y;
+ }
+ else
+ y_ptr += NB ;
+
+ }
+
+ if ( m3 == 0 ) return(0);
+
+ if ( m3 == 3 )
+ {
+ a_ptr = a;
+ x_ptr = x;
+ FLOAT temp0 = 0.0;
+ FLOAT temp1 = 0.0;
+ FLOAT temp2 = 0.0;
+ if ( lda == 3 && inc_x ==1 )
+ {
+
+ for( i = 0; i < ( n & -4 ); i+=4 )
+ {
+
+ temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+ temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
+ temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];
+
+ temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3];
+ temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
+ temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];
+
+ a_ptr += 12;
+ x_ptr += 4;
+ }
+
+ for( ; i < n; i++ )
+ {
+ temp0 += a_ptr[0] * x_ptr[0];
+ temp1 += a_ptr[1] * x_ptr[0];
+ temp2 += a_ptr[2] * x_ptr[0];
+ a_ptr += 3;
+ x_ptr ++;
+ }
+
+ }
+ else
+ {
+
+ for( i = 0; i < n; i++ )
+ {
+ temp0 += a_ptr[0] * x_ptr[0];
+ temp1 += a_ptr[1] * x_ptr[0];
+ temp2 += a_ptr[2] * x_ptr[0];
+ a_ptr += lda;
+ x_ptr += inc_x;
+
+
+ }
+
+ }
+ y_ptr[0] += alpha * temp0;
+ y_ptr += inc_y;
+ y_ptr[0] += alpha * temp1;
+ y_ptr += inc_y;
+ y_ptr[0] += alpha * temp2;
+ return(0);
+ }
+
+
+ if ( m3 == 2 )
+ {
+ a_ptr = a;
+ x_ptr = x;
+ FLOAT temp0 = 0.0;
+ FLOAT temp1 = 0.0;
+ if ( lda == 2 && inc_x ==1 )
+ {
+
+ for( i = 0; i < (n & -4) ; i+=4 )
+ {
+ temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
+ temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+ temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
+ temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
+ a_ptr += 8;
+ x_ptr += 4;
+
+ }
+
+
+ for( ; i < n; i++ )
+ {
+ temp0 += a_ptr[0] * x_ptr[0];
+ temp1 += a_ptr[1] * x_ptr[0];
+ a_ptr += 2;
+ x_ptr ++;
+ }
+
+ }
+ else
+ {
+
+ for( i = 0; i < n; i++ )
+ {
+ temp0 += a_ptr[0] * x_ptr[0];
+ temp1 += a_ptr[1] * x_ptr[0];
+ a_ptr += lda;
+ x_ptr += inc_x;
+
+
+ }
+
+ }
+ y_ptr[0] += alpha * temp0;
+ y_ptr += inc_y;
+ y_ptr[0] += alpha * temp1;
+ return(0);
+ }
+
+ if ( m3 == 1 )
+ {
+ a_ptr = a;
+ x_ptr = x;
+ FLOAT temp = 0.0;
+ if ( lda == 1 && inc_x ==1 )
+ {
+
+ for( i = 0; i < (n & -4); i+=4 )
+ {
+ temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
+
+ }
+
+ for( ; i < n; i++ )
+ {
+ temp += a_ptr[i] * x_ptr[i];
+ }
+
+ }
+ else
+ {
+
+ for( i = 0; i < n; i++ )
+ {
+ temp += a_ptr[0] * x_ptr[0];
+ a_ptr += lda;
+ x_ptr += inc_x;
+ }
+
+ }
+ y_ptr[0] += alpha * temp;
+ return(0);
+ }
+
+
+ return(0);
+}
+
+
diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c
new file mode 100644
index 000000000..2c77f3469
--- /dev/null
+++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c
@@ -0,0 +1,247 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+
+#define HAVE_KERNEL_4x8 1
+static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
+
+static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+ "vbroadcastsd (%2), %%ymm12 \n\t" // x0
+ "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1
+ "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2
+ "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3
+ "vbroadcastsd 32(%2), %%ymm0 \n\t" // x4
+ "vbroadcastsd 40(%2), %%ymm1 \n\t" // x5
+ "vbroadcastsd 48(%2), %%ymm2 \n\t" // x6
+ "vbroadcastsd 56(%2), %%ymm3 \n\t" // x7
+
+ "vbroadcastsd (%9), %%ymm6 \n\t" // alpha
+
+ "testq $0x04, %1 \n\t"
+ "jz .L8LABEL%= \n\t"
+
+ "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
+
+ "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
+ "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t"
+ "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
+ "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t"
+
+ "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t"
+ "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t"
+ "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t"
+ "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t"
+
+ "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
+ "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
+ "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"
+
+
+ "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y
+
+ "addq $4 , %8 \n\t"
+ "addq $4 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+
+ ".L8LABEL%=: \n\t"
+
+ "cmpq $0, %1 \n\t"
+ "je .L16END%= \n\t"
+
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+
+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
+ "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
+ "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y
+
+ "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
+ "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t"
+ "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t"
+ "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
+ "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
+ "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t"
+ "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t"
+ "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t"
+
+ "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t"
+ "addq $8 , %0 \n\t"
+ "vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t"
+ "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t"
+ "vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t"
+ "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t"
+ "vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t"
+ "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t"
+ "vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t"
+
+ "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
+ "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
+
+ "addq $8 , %8 \n\t"
+ "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y
+ "subq $8 , %1 \n\t"
+ "vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y
+
+ "jnz .L01LOOP%= \n\t"
+
+ ".L16END%=: \n\t"
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (ap[2]), // 6
+ "r" (ap[3]), // 7
+ "r" (lda4), // 8
+ "r" (alpha) // 9
+ : "cc",
+ "%xmm0", "%xmm1",
+ "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5",
+ "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
+
+#define HAVE_KERNEL_4x4 1
+static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
+
+static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+ "vbroadcastsd (%2), %%ymm12 \n\t" // x0
+ "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1
+ "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2
+ "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3
+
+ "vbroadcastsd (%8), %%ymm6 \n\t" // alpha
+
+ "testq $0x04, %1 \n\t"
+ "jz .L8LABEL%= \n\t"
+
+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
+ "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
+
+ "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
+ "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t"
+ "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
+ "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t"
+
+ "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
+ "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
+ "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"
+
+ "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y
+
+ "addq $4 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+
+ ".L8LABEL%=: \n\t"
+
+ "cmpq $0, %1 \n\t"
+ "je .L8END%= \n\t"
+
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
+ "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
+ "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y
+
+ "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
+ "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t"
+ "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t"
+ "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
+ "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
+ "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t"
+ "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t"
+ "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t"
+
+ "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
+ "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
+
+ "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y
+ "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y
+
+ "addq $8 , %0 \n\t"
+ "subq $8 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ ".L8END%=: \n\t"
+ "vzeroupper \n\t"
+
+	:
+        "+r" (i),	// 0
+	 "+r" (n)  	// 1
+	:
+        "r" (x),      // 2
+        "r" (y),      // 3
+        "r" (ap[0]),  // 4
+        "r" (ap[1]),  // 5
+        "r" (ap[2]),  // 6
+        "r" (ap[3]),  // 7
+        "r" (alpha)   // 8
+ : "cc",
+ "%xmm4", "%xmm5",
+ "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
diff --git a/kernel/x86_64/dgemv_n_microk_nehalem-4.c b/kernel/x86_64/dgemv_n_microk_nehalem-4.c
new file mode 100644
index 000000000..e311326f1
--- /dev/null
+++ b/kernel/x86_64/dgemv_n_microk_nehalem-4.c
@@ -0,0 +1,265 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+
+#define HAVE_KERNEL_4x8 1
+static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
+
+static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "movsd (%2), %%xmm12 \n\t" // x0
+ "movsd 8(%2), %%xmm13 \n\t" // x1
+ "movsd 16(%2), %%xmm14 \n\t" // x2
+ "movsd 24(%2), %%xmm15 \n\t" // x3
+ "shufpd $0, %%xmm12, %%xmm12\n\t"
+ "shufpd $0, %%xmm13, %%xmm13\n\t"
+ "shufpd $0, %%xmm14, %%xmm14\n\t"
+ "shufpd $0, %%xmm15, %%xmm15\n\t"
+
+ "movsd 32(%2), %%xmm0 \n\t" // x4
+ "movsd 40(%2), %%xmm1 \n\t" // x5
+ "movsd 48(%2), %%xmm2 \n\t" // x6
+ "movsd 56(%2), %%xmm3 \n\t" // x7
+ "shufpd $0, %%xmm0 , %%xmm0 \n\t"
+ "shufpd $0, %%xmm1 , %%xmm1 \n\t"
+ "shufpd $0, %%xmm2 , %%xmm2 \n\t"
+ "shufpd $0, %%xmm3 , %%xmm3 \n\t"
+
+ "movsd (%9), %%xmm6 \n\t" // alpha
+ "shufpd $0, %%xmm6 , %%xmm6 \n\t"
+
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ "xorpd %%xmm4 , %%xmm4 \n\t"
+ "xorpd %%xmm5 , %%xmm5 \n\t"
+ "movups (%3,%0,8), %%xmm7 \n\t" // 2 * y
+
+ ".align 2 \n\t"
+ "movups (%4,%0,8), %%xmm8 \n\t"
+ "movups (%5,%0,8), %%xmm9 \n\t"
+ "movups (%6,%0,8), %%xmm10 \n\t"
+ "movups (%7,%0,8), %%xmm11 \n\t"
+ ".align 2 \n\t"
+ "mulpd %%xmm12, %%xmm8 \n\t"
+ "mulpd %%xmm13, %%xmm9 \n\t"
+ "mulpd %%xmm14, %%xmm10 \n\t"
+ "mulpd %%xmm15, %%xmm11 \n\t"
+ "addpd %%xmm8 , %%xmm4 \n\t"
+ "addpd %%xmm9 , %%xmm5 \n\t"
+ "addpd %%xmm10, %%xmm4 \n\t"
+ "addpd %%xmm11, %%xmm5 \n\t"
+
+ "movups (%4,%8,8), %%xmm8 \n\t"
+ "movups (%5,%8,8), %%xmm9 \n\t"
+ "movups (%6,%8,8), %%xmm10 \n\t"
+ "movups (%7,%8,8), %%xmm11 \n\t"
+ ".align 2 \n\t"
+ "mulpd %%xmm0 , %%xmm8 \n\t"
+ "mulpd %%xmm1 , %%xmm9 \n\t"
+ "mulpd %%xmm2 , %%xmm10 \n\t"
+ "mulpd %%xmm3 , %%xmm11 \n\t"
+ "addpd %%xmm8 , %%xmm4 \n\t"
+ "addpd %%xmm9 , %%xmm5 \n\t"
+ "addpd %%xmm10, %%xmm4 \n\t"
+ "addpd %%xmm11, %%xmm5 \n\t"
+
+ "addpd %%xmm5 , %%xmm4 \n\t"
+ "mulpd %%xmm6 , %%xmm4 \n\t"
+ "addpd %%xmm4 , %%xmm7 \n\t"
+
+ "movups %%xmm7 , (%3,%0,8) \n\t" // 2 * y
+
+ "xorpd %%xmm4 , %%xmm4 \n\t"
+ "xorpd %%xmm5 , %%xmm5 \n\t"
+ "movups 16(%3,%0,8), %%xmm7 \n\t" // 2 * y
+
+ ".align 2 \n\t"
+ "movups 16(%4,%0,8), %%xmm8 \n\t"
+ "movups 16(%5,%0,8), %%xmm9 \n\t"
+ "movups 16(%6,%0,8), %%xmm10 \n\t"
+ "movups 16(%7,%0,8), %%xmm11 \n\t"
+ ".align 2 \n\t"
+ "mulpd %%xmm12, %%xmm8 \n\t"
+ "mulpd %%xmm13, %%xmm9 \n\t"
+ "mulpd %%xmm14, %%xmm10 \n\t"
+ "mulpd %%xmm15, %%xmm11 \n\t"
+ "addpd %%xmm8 , %%xmm4 \n\t"
+ "addpd %%xmm9 , %%xmm5 \n\t"
+ "addpd %%xmm10, %%xmm4 \n\t"
+ "addpd %%xmm11, %%xmm5 \n\t"
+
+ "movups 16(%4,%8,8), %%xmm8 \n\t"
+ "movups 16(%5,%8,8), %%xmm9 \n\t"
+ "movups 16(%6,%8,8), %%xmm10 \n\t"
+ "movups 16(%7,%8,8), %%xmm11 \n\t"
+ ".align 2 \n\t"
+ "mulpd %%xmm0 , %%xmm8 \n\t"
+ "mulpd %%xmm1 , %%xmm9 \n\t"
+ "mulpd %%xmm2 , %%xmm10 \n\t"
+ "mulpd %%xmm3 , %%xmm11 \n\t"
+ "addpd %%xmm8 , %%xmm4 \n\t"
+ "addpd %%xmm9 , %%xmm5 \n\t"
+ "addpd %%xmm10, %%xmm4 \n\t"
+ "addpd %%xmm11, %%xmm5 \n\t"
+
+ "addq $4 , %8 \n\t"
+ "addpd %%xmm5 , %%xmm4 \n\t"
+ "mulpd %%xmm6 , %%xmm4 \n\t"
+ "addpd %%xmm4 , %%xmm7 \n\t"
+
+ "movups %%xmm7 , 16(%3,%0,8) \n\t" // 2 * y
+
+ "addq $4 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (ap[2]), // 6
+ "r" (ap[3]), // 7
+ "r" (lda4), // 8
+ "r" (alpha) // 9
+ : "cc",
+ "%xmm0", "%xmm1",
+ "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5",
+ "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
+
+
+#define HAVE_KERNEL_4x4 1
+static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
+
+static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "movsd (%2), %%xmm12 \n\t" // x0
+ "movsd 8(%2), %%xmm13 \n\t" // x1
+ "movsd 16(%2), %%xmm14 \n\t" // x2
+ "movsd 24(%2), %%xmm15 \n\t" // x3
+ "shufpd $0, %%xmm12, %%xmm12\n\t"
+ "shufpd $0, %%xmm13, %%xmm13\n\t"
+ "shufpd $0, %%xmm14, %%xmm14\n\t"
+ "shufpd $0, %%xmm15, %%xmm15\n\t"
+
+ "movsd (%8), %%xmm6 \n\t" // alpha
+ "shufpd $0, %%xmm6 , %%xmm6 \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ "xorpd %%xmm4 , %%xmm4 \n\t"
+ "xorpd %%xmm5 , %%xmm5 \n\t"
+ "movups (%3,%0,8), %%xmm7 \n\t" // 2 * y
+
+ "movups (%4,%0,8), %%xmm8 \n\t"
+ "movups (%5,%0,8), %%xmm9 \n\t"
+ "movups (%6,%0,8), %%xmm10 \n\t"
+ "movups (%7,%0,8), %%xmm11 \n\t"
+ "mulpd %%xmm12, %%xmm8 \n\t"
+ "mulpd %%xmm13, %%xmm9 \n\t"
+ "mulpd %%xmm14, %%xmm10 \n\t"
+ "mulpd %%xmm15, %%xmm11 \n\t"
+ "addpd %%xmm8 , %%xmm4 \n\t"
+ "addpd %%xmm9 , %%xmm4 \n\t"
+ "addpd %%xmm10 , %%xmm4 \n\t"
+ "addpd %%xmm4 , %%xmm11 \n\t"
+
+ "mulpd %%xmm6 , %%xmm11 \n\t"
+ "addpd %%xmm7 , %%xmm11 \n\t"
+ "movups %%xmm11, (%3,%0,8) \n\t" // 2 * y
+
+ "xorpd %%xmm4 , %%xmm4 \n\t"
+ "xorpd %%xmm5 , %%xmm5 \n\t"
+ "movups 16(%3,%0,8), %%xmm7 \n\t" // 2 * y
+
+ "movups 16(%4,%0,8), %%xmm8 \n\t"
+ "movups 16(%5,%0,8), %%xmm9 \n\t"
+ "movups 16(%6,%0,8), %%xmm10 \n\t"
+ "movups 16(%7,%0,8), %%xmm11 \n\t"
+ "mulpd %%xmm12, %%xmm8 \n\t"
+ "mulpd %%xmm13, %%xmm9 \n\t"
+ "mulpd %%xmm14, %%xmm10 \n\t"
+ "mulpd %%xmm15, %%xmm11 \n\t"
+ "addpd %%xmm8 , %%xmm4 \n\t"
+ "addpd %%xmm9 , %%xmm4 \n\t"
+ "addpd %%xmm10 , %%xmm4 \n\t"
+ "addpd %%xmm4 , %%xmm11 \n\t"
+
+ "mulpd %%xmm6 , %%xmm11 \n\t"
+ "addpd %%xmm7 , %%xmm11 \n\t"
+ "movups %%xmm11, 16(%3,%0,8) \n\t" // 2 * y
+
+ "addq $4 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (ap[2]), // 6
+ "r" (ap[3]), // 7
+ "r" (alpha) // 8
+ : "cc",
+ "%xmm4", "%xmm5",
+ "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
diff --git a/kernel/x86_64/dgemv_t.c b/kernel/x86_64/dgemv_t.c
deleted file mode 100644
index 0fa8378fe..000000000
--- a/kernel/x86_64/dgemv_t.c
+++ /dev/null
@@ -1,191 +0,0 @@
-/***************************************************************************
-Copyright (c) 2014, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-
-#include "common.h"
-
-#if defined(HASWELL)
-#include "dgemv_t_microk_haswell-2.c"
-#endif
-
-#define NBMAX 2048
-
-#ifndef HAVE_KERNEL_16x4
-
-static void dgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
-{
- BLASLONG i;
- FLOAT *a0,*a1,*a2,*a3;
- a0 = ap[0];
- a1 = ap[1];
- a2 = ap[2];
- a3 = ap[3];
- FLOAT temp0 = 0.0;
- FLOAT temp1 = 0.0;
- FLOAT temp2 = 0.0;
- FLOAT temp3 = 0.0;
-
- for ( i=0; i< n; i+=4 )
- {
- temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3];
- temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3];
- temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3];
- temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3];
- }
- y[0] = temp0;
- y[1] = temp1;
- y[2] = temp2;
- y[3] = temp3;
-}
-
-#endif
-
-static void dgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
-{
- BLASLONG i;
- FLOAT *a0;
- a0 = ap;
- FLOAT temp = 0.0;
-
- for ( i=0; i< n; i+=4 )
- {
- temp += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3];
- }
- *y = temp;
-}
-
-static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
-{
- BLASLONG i;
- for ( i=0; i<n; i++ )
- {
- *dest = *src;
- dest++;
- src += inc_src;
- }
-}
-
-
-int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
-{
- BLASLONG i;
- BLASLONG j;
- FLOAT *a_ptr;
- FLOAT *x_ptr;
- FLOAT *y_ptr;
- FLOAT *ap[4];
- BLASLONG n1;
- BLASLONG m1;
- BLASLONG m2;
- BLASLONG n2;
- FLOAT ybuffer[4],*xbuffer;
-
- if ( m < 1 ) return(0);
- if ( n < 1 ) return(0);
-
- xbuffer = buffer;
-
- n1 = n / 4 ;
- n2 = n % 4 ;
-
- m1 = m - ( m % 16 );
- m2 = (m % NBMAX) - (m % 16) ;
-
-
- BLASLONG NB = NBMAX;
-
- while ( NB == NBMAX )
- {
-
- m1 -= NB;
- if ( m1 < 0)
- {
- if ( m2 == 0 ) break;
- NB = m2;
- }
-
- y_ptr = y;
- a_ptr = a;
- x_ptr = x;
- copy_x(NB,x_ptr,xbuffer,inc_x);
- for( i = 0; i < n1 ; i++)
- {
- ap[0] = a_ptr;
- ap[1] = a_ptr + lda;
- ap[2] = ap[1] + lda;
- ap[3] = ap[2] + lda;
- dgemv_kernel_16x4(NB,ap,xbuffer,ybuffer);
- a_ptr += 4 * lda;
- *y_ptr += ybuffer[0]*alpha;
- y_ptr += inc_y;
- *y_ptr += ybuffer[1]*alpha;
- y_ptr += inc_y;
- *y_ptr += ybuffer[2]*alpha;
- y_ptr += inc_y;
- *y_ptr += ybuffer[3]*alpha;
- y_ptr += inc_y;
- }
-
- for( i = 0; i < n2 ; i++)
- {
- dgemv_kernel_16x1(NB,a_ptr,xbuffer,ybuffer);
- a_ptr += 1 * lda;
- *y_ptr += ybuffer[0]*alpha;
- y_ptr += inc_y;
-
- }
- a += NB;
- x += NB * inc_x;
- }
-
- BLASLONG m3 = m % 16;
- if ( m3 == 0 ) return(0);
- x_ptr = x;
- for ( i=0; i< m3; i++ )
- {
- xbuffer[i] = *x_ptr;
- x_ptr += inc_x;
- }
- j=0;
- a_ptr = a;
- y_ptr = y;
- while ( j < n)
- {
- FLOAT temp = 0.0;
- for( i = 0; i < m3; i++ )
- {
- temp += a_ptr[i] * xbuffer[i];
- }
- a_ptr += lda;
- y_ptr[0] += alpha * temp;
- y_ptr += inc_y;
- j++;
- }
- return(0);
-}
-
-
diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c
new file mode 100644
index 000000000..ebec7d2c3
--- /dev/null
+++ b/kernel/x86_64/dgemv_t_4.c
@@ -0,0 +1,615 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+#if defined(HASWELL)
+#include "dgemv_t_microk_haswell-4.c"
+#endif
+
+#define NBMAX 2048
+
+#ifndef HAVE_KERNEL_4x4
+
+static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
+{
+ BLASLONG i;
+ FLOAT *a0,*a1,*a2,*a3;
+ a0 = ap[0];
+ a1 = ap[1];
+ a2 = ap[2];
+ a3 = ap[3];
+ FLOAT temp0 = 0.0;
+ FLOAT temp1 = 0.0;
+ FLOAT temp2 = 0.0;
+ FLOAT temp3 = 0.0;
+
+ for ( i=0; i< n; i+=4 )
+ {
+ temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3];
+ temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3];
+ temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3];
+ temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3];
+ }
+ y[0] = temp0;
+ y[1] = temp1;
+ y[2] = temp2;
+ y[3] = temp3;
+}
+
+#endif
+
+static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y)
+{
+ BLASLONG i;
+
+ i=0;
+
+ __asm__ __volatile__
+ (
+ "xorpd %%xmm10 , %%xmm10 \n\t"
+ "xorpd %%xmm11 , %%xmm11 \n\t"
+
+ "testq $2 , %1 \n\t"
+ "jz .L01LABEL%= \n\t"
+
+ "movups (%5,%0,8) , %%xmm14 \n\t" // x
+ "movups (%3,%0,8) , %%xmm12 \n\t" // ap0
+ "movups (%4,%0,8) , %%xmm13 \n\t" // ap1
+ "mulpd %%xmm14 , %%xmm12 \n\t"
+ "mulpd %%xmm14 , %%xmm13 \n\t"
+ "addq $2 , %0 \n\t"
+ "addpd %%xmm12 , %%xmm10 \n\t"
+ "subq $2 , %1 \n\t"
+ "addpd %%xmm13 , %%xmm11 \n\t"
+
+ ".L01LABEL%=: \n\t"
+
+ "cmpq $0, %1 \n\t"
+ "je .L01END%= \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+
+ "movups (%5,%0,8) , %%xmm14 \n\t" // x
+ "movups (%3,%0,8) , %%xmm12 \n\t" // ap0
+ "movups (%4,%0,8) , %%xmm13 \n\t" // ap1
+ "mulpd %%xmm14 , %%xmm12 \n\t"
+ "mulpd %%xmm14 , %%xmm13 \n\t"
+ "addpd %%xmm12 , %%xmm10 \n\t"
+ "addpd %%xmm13 , %%xmm11 \n\t"
+
+ "movups 16(%5,%0,8) , %%xmm14 \n\t" // x
+ "movups 16(%3,%0,8) , %%xmm12 \n\t" // ap0
+ "movups 16(%4,%0,8) , %%xmm13 \n\t" // ap1
+ "mulpd %%xmm14 , %%xmm12 \n\t"
+ "mulpd %%xmm14 , %%xmm13 \n\t"
+ "addpd %%xmm12 , %%xmm10 \n\t"
+ "addpd %%xmm13 , %%xmm11 \n\t"
+
+ "addq $4 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ ".L01END%=: \n\t"
+
+ "haddpd %%xmm10, %%xmm10 \n\t"
+ "haddpd %%xmm11, %%xmm11 \n\t"
+
+ "movsd %%xmm10, (%2) \n\t"
+ "movsd %%xmm11,8(%2) \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (y), // 2
+ "r" (ap0), // 3
+ "r" (ap1), // 4
+ "r" (x) // 5
+ : "cc",
+ "%xmm4", "%xmm5", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+
+}
+
+static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
+{
+ BLASLONG i;
+
+ i=0;
+
+ __asm__ __volatile__
+ (
+ "xorpd %%xmm9 , %%xmm9 \n\t"
+ "xorpd %%xmm10 , %%xmm10 \n\t"
+
+ "testq $2 , %1 \n\t"
+ "jz .L01LABEL%= \n\t"
+
+ "movups (%3,%0,8) , %%xmm12 \n\t"
+ "movups (%4,%0,8) , %%xmm11 \n\t"
+ "mulpd %%xmm11 , %%xmm12 \n\t"
+ "addq $2 , %0 \n\t"
+ "addpd %%xmm12 , %%xmm10 \n\t"
+ "subq $2 , %1 \n\t"
+
+ ".L01LABEL%=: \n\t"
+
+ "cmpq $0, %1 \n\t"
+ "je .L01END%= \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+
+ "movups (%3,%0,8) , %%xmm12 \n\t"
+ "movups 16(%3,%0,8) , %%xmm14 \n\t"
+ "movups (%4,%0,8) , %%xmm11 \n\t"
+ "movups 16(%4,%0,8) , %%xmm13 \n\t"
+ "mulpd %%xmm11 , %%xmm12 \n\t"
+ "mulpd %%xmm13 , %%xmm14 \n\t"
+ "addq $4 , %0 \n\t"
+ "addpd %%xmm12 , %%xmm10 \n\t"
+ "subq $4 , %1 \n\t"
+ "addpd %%xmm14 , %%xmm9 \n\t"
+
+ "jnz .L01LOOP%= \n\t"
+
+ ".L01END%=: \n\t"
+
+ "addpd %%xmm9 , %%xmm10 \n\t"
+ "haddpd %%xmm10, %%xmm10 \n\t"
+
+ "movsd %%xmm10, (%2) \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (y), // 2
+ "r" (ap), // 3
+ "r" (x) // 4
+ : "cc",
+ "%xmm9", "%xmm10" ,
+ "%xmm11", "%xmm12", "%xmm13", "%xmm14",
+ "memory"
+ );
+
+
+}
+
+static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
+{
+ BLASLONG i;
+ for ( i=0; i<n; i++ )
+ {
+ *dest = *src;
+ dest++;
+ src += inc_src;
+ }
+}
+
+static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline));
+
+static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
+{
+
+ BLASLONG i;
+
+ if ( inc_dest != 1 )
+ {
+ for ( i=0; i<n; i++ )
+ {
+ *dest += src[i] * da;
+ dest += inc_dest;
+ }
+ return;
+ }
+
+ i=0;
+
+ __asm__ __volatile__
+ (
+ "movsd (%2) , %%xmm10 \n\t"
+ "shufpd $0 , %%xmm10 , %%xmm10 \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+
+ "movups (%3,%0,8) , %%xmm12 \n\t"
+ "movups (%4,%0,8) , %%xmm11 \n\t"
+ "mulpd %%xmm10 , %%xmm12 \n\t"
+ "addq $2 , %0 \n\t"
+ "addpd %%xmm12 , %%xmm11 \n\t"
+ "subq $2 , %1 \n\t"
+ "movups %%xmm11, -16(%4,%0,8) \n\t"
+
+ "jnz .L01LOOP%= \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (&da), // 2
+ "r" (src), // 3
+ "r" (dest) // 4
+ : "cc",
+ "%xmm10", "%xmm11", "%xmm12",
+ "memory"
+ );
+
+
+}
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+ BLASLONG register i;
+ BLASLONG register j;
+ FLOAT *a_ptr;
+ FLOAT *x_ptr;
+ FLOAT *y_ptr;
+ BLASLONG n0;
+ BLASLONG n1;
+ BLASLONG m1;
+ BLASLONG m2;
+ BLASLONG m3;
+ BLASLONG n2;
+ FLOAT ybuffer[4],*xbuffer;
+ FLOAT *ytemp;
+
+ if ( m < 1 ) return(0);
+ if ( n < 1 ) return(0);
+
+ xbuffer = buffer;
+ ytemp = buffer + NBMAX;
+
+ n0 = n / NBMAX;
+ n1 = (n % NBMAX) >> 2 ;
+ n2 = n & 3 ;
+
+ m3 = m & 3 ;
+ m1 = m & -4 ;
+ m2 = (m & (NBMAX-1)) - m3 ;
+
+
+ BLASLONG NB = NBMAX;
+
+ while ( NB == NBMAX )
+ {
+
+ m1 -= NB;
+ if ( m1 < 0)
+ {
+ if ( m2 == 0 ) break;
+ NB = m2;
+ }
+
+ y_ptr = y;
+ a_ptr = a;
+ x_ptr = x;
+
+ if ( inc_x == 1 )
+ xbuffer = x_ptr;
+ else
+ copy_x(NB,x_ptr,xbuffer,inc_x);
+
+
+ FLOAT *ap[4];
+ FLOAT *yp;
+ BLASLONG register lda4 = 4 * lda;
+ ap[0] = a_ptr;
+ ap[1] = a_ptr + lda;
+ ap[2] = ap[1] + lda;
+ ap[3] = ap[2] + lda;
+
+ if ( n0 > 0 )
+ {
+ BLASLONG nb1 = NBMAX / 4;
+ for( j=0; j<n0; j++)
+ {
+
+ yp = ytemp;
+ for( i = 0; i < nb1 ; i++)
+ {
+ dgemv_kernel_4x4(NB,ap,xbuffer,yp);
+ ap[0] += lda4 ;
+ ap[1] += lda4 ;
+ ap[2] += lda4 ;
+ ap[3] += lda4 ;
+ yp += 4;
+ }
+ add_y(nb1*4, alpha, ytemp, y_ptr, inc_y );
+ y_ptr += nb1 * inc_y * 4;
+ a_ptr += nb1 * lda4 ;
+
+ }
+
+ }
+
+
+ yp = ytemp;
+
+ for( i = 0; i < n1 ; i++)
+ {
+ dgemv_kernel_4x4(NB,ap,xbuffer,yp);
+ ap[0] += lda4 ;
+ ap[1] += lda4 ;
+ ap[2] += lda4 ;
+ ap[3] += lda4 ;
+ yp += 4;
+ }
+ if ( n1 > 0 )
+ {
+ add_y(n1*4, alpha, ytemp, y_ptr, inc_y );
+ y_ptr += n1 * inc_y * 4;
+ a_ptr += n1 * lda4 ;
+ }
+
+ if ( n2 & 2 )
+ {
+
+ dgemv_kernel_4x2(NB,ap[0],ap[1],xbuffer,ybuffer);
+ a_ptr += lda * 2;
+ *y_ptr += ybuffer[0] * alpha;
+ y_ptr += inc_y;
+ *y_ptr += ybuffer[1] * alpha;
+ y_ptr += inc_y;
+
+ }
+
+ if ( n2 & 1 )
+ {
+
+ dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer);
+ a_ptr += lda;
+ *y_ptr += ybuffer[0] * alpha;
+ y_ptr += inc_y;
+
+ }
+ a += NB;
+ x += NB * inc_x;
+ }
+
+ if ( m3 == 0 ) return(0);
+
+ x_ptr = x;
+ a_ptr = a;
+ if ( m3 == 3 )
+ {
+ FLOAT xtemp0 = *x_ptr * alpha;
+ x_ptr += inc_x;
+ FLOAT xtemp1 = *x_ptr * alpha;
+ x_ptr += inc_x;
+ FLOAT xtemp2 = *x_ptr * alpha;
+
+ FLOAT *aj = a_ptr;
+ y_ptr = y;
+
+ if ( lda == 3 && inc_y == 1 )
+ {
+
+ for ( j=0; j< ( n & -4) ; j+=4 )
+ {
+
+ y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
+ y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2;
+ y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2;
+ y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;
+ aj += 12;
+ }
+
+ for ( ; j<n; j++ )
+ {
+ y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
+ aj += 3;
+ }
+
+ }
+ else
+ {
+
+ if ( inc_y == 1 )
+ {
+
+ BLASLONG register lda2 = lda << 1;
+ BLASLONG register lda4 = lda << 2;
+ BLASLONG register lda3 = lda2 + lda;
+
+ for ( j=0; j< ( n & -4 ); j+=4 )
+ {
+
+ y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2;
+ y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 + *(aj+lda+2) * xtemp2;
+ y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 + *(aj+lda2+2) * xtemp2;
+ y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 + *(aj+lda3+2) * xtemp2;
+ aj += lda4;
+ }
+
+ for ( ; j< n ; j++ )
+ {
+
+ y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2 ;
+ aj += lda;
+ }
+
+ }
+ else
+ {
+
+ for ( j=0; j<n; j++ )
+ {
+ *y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2;
+ y_ptr += inc_y;
+ aj += lda;
+ }
+
+
+ }
+
+ }
+ return(0);
+ }
+
+ if ( m3 == 2 )
+ {
+ FLOAT xtemp0 = *x_ptr * alpha;
+ x_ptr += inc_x;
+ FLOAT xtemp1 = *x_ptr * alpha;
+
+ FLOAT *aj = a_ptr;
+ y_ptr = y;
+
+ if ( lda == 2 && inc_y == 1 )
+ {
+
+ for ( j=0; j< ( n & -4) ; j+=4 )
+ {
+ y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ;
+ y_ptr[j+1] += aj[2] * xtemp0 + aj[3] * xtemp1 ;
+ y_ptr[j+2] += aj[4] * xtemp0 + aj[5] * xtemp1 ;
+ y_ptr[j+3] += aj[6] * xtemp0 + aj[7] * xtemp1 ;
+ aj += 8;
+
+ }
+
+ for ( ; j<n; j++ )
+ {
+ y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ;
+ aj += 2;
+ }
+
+ }
+ else
+ {
+ if ( inc_y == 1 )
+ {
+
+ BLASLONG register lda2 = lda << 1;
+ BLASLONG register lda4 = lda << 2;
+ BLASLONG register lda3 = lda2 + lda;
+
+ for ( j=0; j< ( n & -4 ); j+=4 )
+ {
+
+ y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ;
+ y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 ;
+ y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 ;
+ y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 ;
+ aj += lda4;
+ }
+
+ for ( ; j< n ; j++ )
+ {
+
+ y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ;
+ aj += lda;
+ }
+
+ }
+ else
+ {
+ for ( j=0; j<n; j++ )
+ {
+ *y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 ;
+ y_ptr += inc_y;
+ aj += lda;
+ }
+ }
+
+ }
+ return(0);
+
+ }
+
+ FLOAT xtemp = *x_ptr * alpha;
+ FLOAT *aj = a_ptr;
+ y_ptr = y;
+ if ( lda == 1 && inc_y == 1 )
+ {
+ for ( j=0; j< ( n & -4) ; j+=4 )
+ {
+ y_ptr[j] += aj[j] * xtemp;
+ y_ptr[j+1] += aj[j+1] * xtemp;
+ y_ptr[j+2] += aj[j+2] * xtemp;
+ y_ptr[j+3] += aj[j+3] * xtemp;
+ }
+ for ( ; j<n ; j++ )
+ {
+ y_ptr[j] += aj[j] * xtemp;
+ }
+
+
+
+ }
+ else
+ {
+ if ( inc_y == 1 )
+ {
+
+ BLASLONG register lda2 = lda << 1;
+ BLASLONG register lda4 = lda << 2;
+ BLASLONG register lda3 = lda2 + lda;
+ for ( j=0; j< ( n & -4 ); j+=4 )
+ {
+ y_ptr[j] += *aj * xtemp;
+ y_ptr[j+1] += *(aj+lda) * xtemp;
+ y_ptr[j+2] += *(aj+lda2) * xtemp;
+ y_ptr[j+3] += *(aj+lda3) * xtemp;
+ aj += lda4 ;
+ }
+
+ for ( ; j<n; j++ )
+ {
+ y_ptr[j] += *aj * xtemp;
+ aj += lda;
+ }
+
+ }
+ else
+ {
+ for ( j=0; j<n; j++ )
+ {
+ *y_ptr += *aj * xtemp;
+ y_ptr += inc_y;
+ aj += lda;
+ }
+
+ }
+ }
+
+ return(0);
+}
+
+
diff --git a/kernel/x86_64/dgemv_t_microk_haswell-2.c b/kernel/x86_64/dgemv_t_microk_haswell-4.c
index 1a4ba37d7..33b43515d 100644
--- a/kernel/x86_64/dgemv_t_microk_haswell-2.c
+++ b/kernel/x86_64/dgemv_t_microk_haswell-4.c
@@ -25,10 +25,10 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#define HAVE_KERNEL_16x4 1
-static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+#define HAVE_KERNEL_4x4 1
+static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
+static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
BLASLONG register i = 0;
@@ -41,29 +41,49 @@ static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"vxorpd %%ymm6 , %%ymm6, %%ymm6 \n\t"
"vxorpd %%ymm7 , %%ymm7, %%ymm7 \n\t"
+ "testq $0x04, %1 \n\t"
+ "jz .L08LABEL%= \n\t"
+
+ "vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x
+
+ "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
+ "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm5 \n\t"
+ "vfmadd231pd (%6,%0,8), %%ymm12, %%ymm6 \n\t"
+ "vfmadd231pd (%7,%0,8), %%ymm12, %%ymm7 \n\t"
+
+ "addq $4 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+
+ ".L08LABEL%=: \n\t"
+
+ "cmpq $0, %1 \n\t"
+ "je .L16END%= \n\t"
+
".align 16 \n\t"
".L01LOOP%=: \n\t"
- "prefetcht0 384(%2,%0,8) \n\t"
+ // "prefetcht0 384(%2,%0,8) \n\t"
"vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x
"vmovups 32(%2,%0,8), %%ymm13 \n\t" // 4 * x
- "prefetcht0 384(%4,%0,8) \n\t"
+ // "prefetcht0 384(%4,%0,8) \n\t"
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
"vfmadd231pd (%5,%0,8), %%ymm12, %%ymm5 \n\t"
- "prefetcht0 384(%5,%0,8) \n\t"
- "vfmadd231pd 32(%4,%0,8), %%ymm13, %%ymm4 \n\t"
- "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
- "prefetcht0 384(%6,%0,8) \n\t"
+ // "prefetcht0 384(%5,%0,8) \n\t"
"vfmadd231pd (%6,%0,8), %%ymm12, %%ymm6 \n\t"
"vfmadd231pd (%7,%0,8), %%ymm12, %%ymm7 \n\t"
- "prefetcht0 384(%7,%0,8) \n\t"
- "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm6 \n\t"
- "vfmadd231pd 32(%7,%0,8), %%ymm13, %%ymm7 \n\t"
+ // "prefetcht0 384(%6,%0,8) \n\t"
+ "vfmadd231pd 32(%4,%0,8), %%ymm13, %%ymm4 \n\t"
+ "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
+ "addq $8 , %0 \n\t"
+ // "prefetcht0 384(%7,%0,8) \n\t"
+ "vfmadd231pd -32(%6,%0,8), %%ymm13, %%ymm6 \n\t"
+ "subq $8 , %1 \n\t"
+ "vfmadd231pd -32(%7,%0,8), %%ymm13, %%ymm7 \n\t"
- "addq $8 , %0 \n\t"
- "subq $8 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
+ ".L16END%=: \n\t"
+
"vextractf128 $1 , %%ymm4, %%xmm12 \n\t"
"vextractf128 $1 , %%ymm5, %%xmm13 \n\t"
"vextractf128 $1 , %%ymm6, %%xmm14 \n\t"
diff --git a/kernel/x86_64/dsymv_L.c b/kernel/x86_64/dsymv_L.c
new file mode 100644
index 000000000..8d1337746
--- /dev/null
+++ b/kernel/x86_64/dsymv_L.c
@@ -0,0 +1,299 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+#if defined(BULLDOZER)
+#include "dsymv_L_microk_bulldozer-2.c"
+#elif defined(NEHALEM)
+#include "dsymv_L_microk_nehalem-2.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_4x4
+
+static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *tmp1, FLOAT *temp2)
+{
+ FLOAT tmp2[4] = { 0.0, 0.0, 0.0, 0.0 };
+ BLASLONG i;
+
+ for (i=from; i<to; i+=4)
+ {
+
+ y[i] += tmp1[0] * ap[0][i];
+ tmp2[0] += ap[0][i] * x[i];
+ y[i] += tmp1[1] * ap[1][i];
+ tmp2[1] += ap[1][i] * x[i];
+ y[i] += tmp1[2] * ap[2][i];
+ tmp2[2] += ap[2][i] * x[i];
+ y[i] += tmp1[3] * ap[3][i];
+ tmp2[3] += ap[3][i] * x[i];
+
+ y[i+1] += tmp1[0] * ap[0][i+1];
+ tmp2[0] += ap[0][i+1] * x[i+1];
+ y[i+1] += tmp1[1] * ap[1][i+1];
+ tmp2[1] += ap[1][i+1] * x[i+1];
+ y[i+1] += tmp1[2] * ap[2][i+1];
+ tmp2[2] += ap[2][i+1] * x[i+1];
+ y[i+1] += tmp1[3] * ap[3][i+1];
+ tmp2[3] += ap[3][i+1] * x[i+1];
+
+ y[i+2] += tmp1[0] * ap[0][i+2];
+ tmp2[0] += ap[0][i+2] * x[i+2];
+ y[i+2] += tmp1[1] * ap[1][i+2];
+ tmp2[1] += ap[1][i+2] * x[i+2];
+ y[i+2] += tmp1[2] * ap[2][i+2];
+ tmp2[2] += ap[2][i+2] * x[i+2];
+ y[i+2] += tmp1[3] * ap[3][i+2];
+ tmp2[3] += ap[3][i+2] * x[i+2];
+
+ y[i+3] += tmp1[0] * ap[0][i+3];
+ tmp2[0] += ap[0][i+3] * x[i+3];
+ y[i+3] += tmp1[1] * ap[1][i+3];
+ tmp2[1] += ap[1][i+3] * x[i+3];
+ y[i+3] += tmp1[2] * ap[2][i+3];
+ tmp2[2] += ap[2][i+3] * x[i+3];
+ y[i+3] += tmp1[3] * ap[3][i+3];
+ tmp2[3] += ap[3][i+3] * x[i+3];
+
+ }
+
+ temp2[0] += tmp2[0];
+ temp2[1] += tmp2[1];
+ temp2[2] += tmp2[2];
+ temp2[3] += tmp2[3];
+}
+
+#endif
+
+
+
+
+int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+ BLASLONG i;
+ BLASLONG ix,iy;
+ BLASLONG jx,jy;
+ BLASLONG j;
+ FLOAT temp1;
+ FLOAT temp2;
+ FLOAT tmp1[4];
+ FLOAT tmp2[4];
+ FLOAT *ap[4];
+
+#if 0
+ if ( m != offset )
+ printf("Symv_L: m=%d offset=%d\n",m,offset);
+#endif
+
+
+ if ( (inc_x != 1) || (inc_y != 1) )
+ {
+
+ jx = 0;
+ jy = 0;
+
+ for (j=0; j<offset; j++)
+ {
+ temp1 = alpha * x[jx];
+ temp2 = 0.0;
+ y[jy] += temp1 * a[j*lda+j];
+ iy = jy;
+ ix = jx;
+ for (i=j+1; i<m; i++)
+ {
+ ix += inc_x;
+ iy += inc_y;
+ y[iy] += temp1 * a[j*lda+i];
+ temp2 += a[j*lda+i] * x[ix];
+
+ }
+ y[jy] += alpha * temp2;
+ jx += inc_x;
+ jy += inc_y;
+ }
+ return(0);
+ }
+
+ BLASLONG offset1 = (offset/4)*4;
+
+ for (j=0; j<offset1; j+=4)
+ {
+ tmp1[0] = alpha * x[j];
+ tmp1[1] = alpha * x[j+1];
+ tmp1[2] = alpha * x[j+2];
+ tmp1[3] = alpha * x[j+3];
+ tmp2[0] = 0.0;
+ tmp2[1] = 0.0;
+ tmp2[2] = 0.0;
+ tmp2[3] = 0.0;
+ ap[0] = &a[j*lda];
+ ap[1] = ap[0] + lda;
+ ap[2] = ap[1] + lda;
+ ap[3] = ap[2] + lda;
+ y[j] += tmp1[0] * ap[0][j];
+ y[j+1] += tmp1[1] * ap[1][j+1];
+ y[j+2] += tmp1[2] * ap[2][j+2];
+ y[j+3] += tmp1[3] * ap[3][j+3];
+ BLASLONG from = j+1;
+ if ( m - from >=12 )
+ {
+ BLASLONG m2 = (m/4)*4;
+ for (i=j+1; i<j+4; i++)
+ {
+ y[i] += tmp1[0] * ap[0][i];
+ tmp2[0] += ap[0][i] * x[i];
+ }
+
+ for (i=j+2; i<j+4; i++)
+ {
+ y[i] += tmp1[1] * ap[1][i];
+ tmp2[1] += ap[1][i] * x[i];
+ }
+
+ for (i=j+3; i<j+4; i++)
+ {
+ y[i] += tmp1[2] * ap[2][i];
+ tmp2[2] += ap[2][i] * x[i];
+ }
+
+ if ( m2 > j+4 )
+ dsymv_kernel_4x4(j+4,m2,ap,x,y,tmp1,tmp2);
+
+
+ for (i=m2; i<m; i++)
+ {
+ y[i] += tmp1[0] * ap[0][i];
+ tmp2[0] += ap[0][i] * x[i];
+
+ y[i] += tmp1[1] * ap[1][i];
+ tmp2[1] += ap[1][i] * x[i];
+
+ y[i] += tmp1[2] * ap[2][i];
+ tmp2[2] += ap[2][i] * x[i];
+
+ y[i] += tmp1[3] * ap[3][i];
+ tmp2[3] += ap[3][i] * x[i];
+
+ }
+
+
+ }
+ else
+ {
+
+ for (i=j+1; i<j+4; i++)
+ {
+ y[i] += tmp1[0] * ap[0][i];
+ tmp2[0] += ap[0][i] * x[i];
+ }
+
+ for (i=j+2; i<j+4; i++)
+ {
+ y[i] += tmp1[1] * ap[1][i];
+ tmp2[1] += ap[1][i] * x[i];
+ }
+
+ for (i=j+3; i<j+4; i++)
+ {
+ y[i] += tmp1[2] * ap[2][i];
+ tmp2[2] += ap[2][i] * x[i];
+ }
+
+ for (i=j+4; i<m; i++)
+ {
+ y[i] += tmp1[0] * ap[0][i];
+ tmp2[0] += ap[0][i] * x[i];
+
+ y[i] += tmp1[1] * ap[1][i];
+ tmp2[1] += ap[1][i] * x[i];
+
+ y[i] += tmp1[2] * ap[2][i];
+ tmp2[2] += ap[2][i] * x[i];
+
+ y[i] += tmp1[3] * ap[3][i];
+ tmp2[3] += ap[3][i] * x[i];
+
+ }
+
+ }
+ y[j] += alpha * tmp2[0];
+ y[j+1] += alpha * tmp2[1];
+ y[j+2] += alpha * tmp2[2];
+ y[j+3] += alpha * tmp2[3];
+ }
+
+
+ for (j=offset1; j<offset; j++)
+ {
+ temp1 = alpha * x[j];
+ temp2 = 0.0;
+ y[j] += temp1 * a[j*lda+j];
+ BLASLONG from = j+1;
+ if ( m - from >=8 )
+ {
+ BLASLONG j1 = ((from + 4)/4)*4;
+ BLASLONG j2 = (m/4)*4;
+ for (i=from; i<j1; i++)
+ {
+ y[i] += temp1 * a[j*lda+i];
+ temp2 += a[j*lda+i] * x[i];
+
+ }
+
+ for (i=j1; i<j2; i++)
+ {
+ y[i] += temp1 * a[j*lda+i];
+ temp2 += a[j*lda+i] * x[i];
+
+ }
+
+ for (i=j2; i<m; i++)
+ {
+ y[i] += temp1 * a[j*lda+i];
+ temp2 += a[j*lda+i] * x[i];
+
+ }
+
+ }
+ else
+ {
+ for (i=from; i<m; i++)
+ {
+ y[i] += temp1 * a[j*lda+i];
+ temp2 += a[j*lda+i] * x[i];
+
+ }
+
+ }
+ y[j] += alpha * temp2;
+ }
+ return(0);
+}
+
+
diff --git a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c
new file mode 100644
index 000000000..70d8df36b
--- /dev/null
+++ b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c
@@ -0,0 +1,137 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_4x4 1
+static void dsymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));
+
+static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
+{
+
+
+ __asm__ __volatile__
+ (
+ "vxorpd %%xmm0 , %%xmm0 , %%xmm0 \n\t" // temp2[0]
+ "vxorpd %%xmm1 , %%xmm1 , %%xmm1 \n\t" // temp2[1]
+ "vxorpd %%xmm2 , %%xmm2 , %%xmm2 \n\t" // temp2[2]
+ "vxorpd %%xmm3 , %%xmm3 , %%xmm3 \n\t" // temp2[3]
+ "vmovddup (%8), %%xmm4 \n\t" // temp1[0]
+ "vmovddup 8(%8), %%xmm5 \n\t" // temp1[1]
+	"vmovddup	16(%8),	 %%xmm6	\n\t"	// temp1[2]
+	"vmovddup	24(%8),	 %%xmm7	\n\t"	// temp1[3]
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+
+ "vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a
+ "vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x
+ "vmovups (%3,%0,8), %%xmm9 \n\t" // 2 * y
+
+ "vmovups (%5,%0,8), %%xmm13 \n\t" // 2 * a
+
+ "vfmaddpd %%xmm0 , %%xmm8, %%xmm12 , %%xmm0 \n\t" // temp2 += x * a
+ "vfmaddpd %%xmm9 , %%xmm4, %%xmm12 , %%xmm9 \n\t" // y += temp1 * a
+ "vmovups (%6,%0,8), %%xmm14 \n\t" // 2 * a
+
+ "vfmaddpd %%xmm1 , %%xmm8, %%xmm13 , %%xmm1 \n\t" // temp2 += x * a
+ "vfmaddpd %%xmm9 , %%xmm5, %%xmm13 , %%xmm9 \n\t" // y += temp1 * a
+ "vmovups (%7,%0,8), %%xmm15 \n\t" // 2 * a
+
+ "vmovups 16(%3,%0,8), %%xmm11 \n\t" // 2 * y
+ "vfmaddpd %%xmm2 , %%xmm8, %%xmm14 , %%xmm2 \n\t" // temp2 += x * a
+ "vmovups 16(%4,%0,8), %%xmm12 \n\t" // 2 * a
+ "vfmaddpd %%xmm9 , %%xmm6, %%xmm14 , %%xmm9 \n\t" // y += temp1 * a
+ "vmovups 16(%2,%0,8), %%xmm10 \n\t" // 2 * x
+
+ "vfmaddpd %%xmm3 , %%xmm8, %%xmm15 , %%xmm3 \n\t" // temp2 += x * a
+ "vfmaddpd %%xmm9 , %%xmm7, %%xmm15 , %%xmm9 \n\t" // y += temp1 * a
+
+ "vmovups 16(%5,%0,8), %%xmm13 \n\t" // 2 * a
+ "vmovups 16(%6,%0,8), %%xmm14 \n\t" // 2 * a
+
+ "vfmaddpd %%xmm0 , %%xmm10, %%xmm12 , %%xmm0 \n\t" // temp2 += x * a
+ "vfmaddpd %%xmm11 , %%xmm4, %%xmm12 , %%xmm11 \n\t" // y += temp1 * a
+
+ "vmovups 16(%7,%0,8), %%xmm15 \n\t" // 2 * a
+ "vfmaddpd %%xmm1 , %%xmm10, %%xmm13 , %%xmm1 \n\t" // temp2 += x * a
+ "vfmaddpd %%xmm11 , %%xmm5, %%xmm13 , %%xmm11 \n\t" // y += temp1 * a
+
+ "vfmaddpd %%xmm2 , %%xmm10, %%xmm14 , %%xmm2 \n\t" // temp2 += x * a
+ "vfmaddpd %%xmm11 , %%xmm6, %%xmm14 , %%xmm11 \n\t" // y += temp1 * a
+
+ "vfmaddpd %%xmm3 , %%xmm10, %%xmm15 , %%xmm3 \n\t" // temp2 += x * a
+ "vfmaddpd %%xmm11 , %%xmm7, %%xmm15 , %%xmm11 \n\t" // y += temp1 * a
+ "addq $4 , %0 \n\t"
+
+ "vmovups %%xmm9 , -32(%3,%0,8) \n\t"
+ "vmovups %%xmm11 , -16(%3,%0,8) \n\t"
+
+ "cmpq %0 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ "vmovsd (%9), %%xmm4 \n\t"
+ "vmovsd 8(%9), %%xmm5 \n\t"
+ "vmovsd 16(%9), %%xmm6 \n\t"
+ "vmovsd 24(%9), %%xmm7 \n\t"
+
+ "vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t"
+ "vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t"
+ "vhaddpd %%xmm2, %%xmm2, %%xmm2 \n\t"
+ "vhaddpd %%xmm3, %%xmm3, %%xmm3 \n\t"
+
+ "vaddsd %%xmm4, %%xmm0, %%xmm0 \n\t"
+ "vaddsd %%xmm5, %%xmm1, %%xmm1 \n\t"
+ "vaddsd %%xmm6, %%xmm2, %%xmm2 \n\t"
+ "vaddsd %%xmm7, %%xmm3, %%xmm3 \n\t"
+
+ "vmovsd %%xmm0 , (%9) \n\t" // save temp2
+ "vmovsd %%xmm1 , 8(%9) \n\t" // save temp2
+ "vmovsd %%xmm2 ,16(%9) \n\t" // save temp2
+ "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2
+
+ :
+ :
+ "r" (from), // 0
+ "r" (to), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (a[0]), // 4
+ "r" (a[1]), // 5
+ "r" (a[2]), // 6
+          "r" (a[3]),  // 7
+ "r" (temp1), // 8
+ "r" (temp2) // 9
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
diff --git a/kernel/x86_64/dsymv_L_microk_nehalem-2.c b/kernel/x86_64/dsymv_L_microk_nehalem-2.c
new file mode 100644
index 000000000..3ba596c5e
--- /dev/null
+++ b/kernel/x86_64/dsymv_L_microk_nehalem-2.c
@@ -0,0 +1,132 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_4x4 1
+static void dsymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));
+
+static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
+{
+
+
+ __asm__ __volatile__
+ (
+ "xorpd %%xmm0 , %%xmm0 \n\t" // temp2[0]
+ "xorpd %%xmm1 , %%xmm1 \n\t" // temp2[1]
+ "xorpd %%xmm2 , %%xmm2 \n\t" // temp2[2]
+ "xorpd %%xmm3 , %%xmm3 \n\t" // temp2[3]
+ "movsd (%8), %%xmm4 \n\t" // temp1[0]
+ "movsd 8(%8), %%xmm5 \n\t" // temp1[1]
+ "movsd 16(%8), %%xmm6 \n\t" // temp1[2]
+ "movsd 24(%8), %%xmm7 \n\t" // temp1[3]
+ "shufpd $0, %%xmm4, %%xmm4 \n\t"
+ "shufpd $0, %%xmm5, %%xmm5 \n\t"
+ "shufpd $0, %%xmm6, %%xmm6 \n\t"
+ "shufpd $0, %%xmm7, %%xmm7 \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ "movups (%4,%0,8), %%xmm12 \n\t" // 2 * a
+ "movups (%2,%0,8), %%xmm8 \n\t" // 2 * x
+ "movups %%xmm12 , %%xmm11 \n\t"
+ "movups (%3,%0,8), %%xmm9 \n\t" // 2 * y
+ "movups (%5,%0,8), %%xmm13 \n\t" // 2 * a
+
+ "mulpd %%xmm4 , %%xmm11 \n\t" // temp1 * a
+ "addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
+ "mulpd %%xmm8 , %%xmm12 \n\t" // a * x
+ "addpd %%xmm12 , %%xmm0 \n\t" // temp2 += x * a
+
+ "movups (%6,%0,8), %%xmm14 \n\t" // 2 * a
+ "movups (%7,%0,8), %%xmm15 \n\t" // 2 * a
+
+ "movups %%xmm13 , %%xmm11 \n\t"
+ "mulpd %%xmm5 , %%xmm11 \n\t" // temp1 * a
+ "addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
+ "mulpd %%xmm8 , %%xmm13 \n\t" // a * x
+ "addpd %%xmm13 , %%xmm1 \n\t" // temp2 += x * a
+
+ "movups %%xmm14 , %%xmm11 \n\t"
+ "mulpd %%xmm6 , %%xmm11 \n\t" // temp1 * a
+ "addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
+ "mulpd %%xmm8 , %%xmm14 \n\t" // a * x
+ "addpd %%xmm14 , %%xmm2 \n\t" // temp2 += x * a
+
+ "addq $2 , %0 \n\t"
+ "movups %%xmm15 , %%xmm11 \n\t"
+ "mulpd %%xmm7 , %%xmm11 \n\t" // temp1 * a
+ "addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
+ "mulpd %%xmm8 , %%xmm15 \n\t" // a * x
+ "addpd %%xmm15 , %%xmm3 \n\t" // temp2 += x * a
+
+ "movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y
+
+ "cmpq %0 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+	"movsd	       (%9), %%xmm4	      \n\t"	// temp2[0]
+	"movsd	      8(%9), %%xmm5	      \n\t"	// temp2[1]
+	"movsd	     16(%9), %%xmm6	      \n\t"	// temp2[2]
+	"movsd	     24(%9), %%xmm7	      \n\t"	// temp2[3]
+
+ "haddpd %%xmm0, %%xmm0 \n\t"
+ "haddpd %%xmm1, %%xmm1 \n\t"
+ "haddpd %%xmm2, %%xmm2 \n\t"
+ "haddpd %%xmm3, %%xmm3 \n\t"
+
+ "addsd %%xmm4, %%xmm0 \n\t"
+ "addsd %%xmm5, %%xmm1 \n\t"
+ "addsd %%xmm6, %%xmm2 \n\t"
+ "addsd %%xmm7, %%xmm3 \n\t"
+
+ "movsd %%xmm0 , (%9) \n\t" // save temp2
+ "movsd %%xmm1 , 8(%9) \n\t" // save temp2
+ "movsd %%xmm2 , 16(%9) \n\t" // save temp2
+ "movsd %%xmm3 , 24(%9) \n\t" // save temp2
+
+ :
+ :
+ "r" (from), // 0
+ "r" (to), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (a[0]), // 4
+ "r" (a[1]), // 5
+ "r" (a[2]), // 6
+ "r" (a[3]), // 7
+ "r" (temp1), // 8
+ "r" (temp2) // 9
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
diff --git a/kernel/x86_64/dsymv_U.c b/kernel/x86_64/dsymv_U.c
new file mode 100644
index 000000000..267755c2f
--- /dev/null
+++ b/kernel/x86_64/dsymv_U.c
@@ -0,0 +1,273 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+
+#if defined(BULLDOZER)
+#include "dsymv_U_microk_bulldozer-2.c"
+#elif defined(NEHALEM)
+#include "dsymv_U_microk_nehalem-2.c"
+#endif
+
+#ifndef HAVE_KERNEL_4x4
+
+static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2)
+{
+ FLOAT at0,at1,at2,at3;
+ FLOAT x;
+ FLOAT tmp2[4] = { 0.0, 0.0, 0.0, 0.0 };
+ FLOAT tp0;
+ FLOAT tp1;
+ FLOAT tp2;
+ FLOAT tp3;
+ BLASLONG i;
+
+ tp0 = temp1[0];
+ tp1 = temp1[1];
+ tp2 = temp1[2];
+ tp3 = temp1[3];
+
+ for (i=0; i<n; i++)
+ {
+ at0 = a0[i];
+ at1 = a1[i];
+ at2 = a2[i];
+ at3 = a3[i];
+ x = xp[i];
+ yp[i] += tp0 * at0 + tp1 *at1 + tp2 * at2 + tp3 * at3;
+ tmp2[0] += at0 * x;
+ tmp2[1] += at1 * x;
+ tmp2[2] += at2 * x;
+ tmp2[3] += at3 * x;
+
+ }
+
+ temp2[0] += tmp2[0];
+ temp2[1] += tmp2[1];
+ temp2[2] += tmp2[2];
+ temp2[3] += tmp2[3];
+}
+
+#endif
+
+
+#ifndef HAVE_KERNEL_1x4
+
+static void dsymv_kernel_1x4(BLASLONG from, BLASLONG to, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2)
+{
+ FLOAT at0,at1,at2,at3;
+ FLOAT x;
+ FLOAT tmp2[4] = { 0.0, 0.0, 0.0, 0.0 };
+ FLOAT tp0;
+ FLOAT tp1;
+ FLOAT tp2;
+ FLOAT tp3;
+ BLASLONG i;
+
+ tp0 = temp1[0];
+ tp1 = temp1[1];
+ tp2 = temp1[2];
+ tp3 = temp1[3];
+
+ for (i=from; i<to; i++)
+ {
+ at0 = a0[i];
+ at1 = a1[i];
+ at2 = a2[i];
+ at3 = a3[i];
+ x = xp[i];
+ yp[i] += tp0 * at0 + tp1 *at1 + tp2 * at2 + tp3 * at3;
+ tmp2[0] += at0 * x;
+ tmp2[1] += at1 * x;
+ tmp2[2] += at2 * x;
+ tmp2[3] += at3 * x;
+
+ }
+
+ temp2[0] += tmp2[0];
+ temp2[1] += tmp2[1];
+ temp2[2] += tmp2[2];
+ temp2[3] += tmp2[3];
+}
+
+#endif
+
+
+static void dsymv_kernel_8x1(BLASLONG n, FLOAT *a0, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2)
+{
+ FLOAT at0,at1,at2,at3;
+ FLOAT temp = 0.0;
+ FLOAT t1 = *temp1;
+ BLASLONG i;
+
+ for (i=0; i<(n/4)*4; i+=4)
+ {
+ at0 = a0[i];
+ at1 = a0[i+1];
+ at2 = a0[i+2];
+ at3 = a0[i+3];
+
+ yp[i] += t1 * at0;
+ temp += at0 * xp[i];
+ yp[i+1] += t1 * at1;
+ temp += at1 * xp[i+1];
+
+ yp[i+2] += t1 * at2;
+ temp += at2 * xp[i+2];
+ yp[i+3] += t1 * at3;
+ temp += at3 * xp[i+3];
+
+ }
+ *temp2 = temp;
+}
+
+int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+ BLASLONG i;
+ BLASLONG ix,iy;
+ BLASLONG jx,jy;
+ BLASLONG j;
+ BLASLONG j1;
+ BLASLONG j2;
+ BLASLONG m2;
+ FLOAT temp1;
+ FLOAT temp2;
+ FLOAT *xp, *yp;
+ FLOAT *a0,*a1,*a2,*a3;
+ FLOAT at0,at1,at2,at3;
+ FLOAT tmp1[4];
+ FLOAT tmp2[4];
+
+#if 0
+ if( m != offset )
+ printf("Symv_U: m=%d offset=%d\n",m,offset);
+#endif
+
+ BLASLONG m1 = m - offset;
+ BLASLONG mrange = m -m1;
+
+ if ( (inc_x!=1) || (inc_y!=1) || (mrange<16) )
+ {
+
+ jx = m1 * inc_x;
+ jy = m1 * inc_y;
+
+ for (j=m1; j<m; j++)
+ {
+ temp1 = alpha * x[jx];
+ temp2 = 0.0;
+ iy = 0;
+ ix = 0;
+ for (i=0; i<j; i++)
+ {
+ y[iy] += temp1 * a[j*lda+i];
+ temp2 += a[j*lda+i] * x[ix];
+ ix += inc_x;
+ iy += inc_y;
+
+ }
+ y[jy] += temp1 * a[j*lda+j] + alpha * temp2;
+ jx += inc_x;
+ jy += inc_y;
+ }
+ return(0);
+ }
+
+ xp = x;
+ yp = y;
+
+ m2 = m - ( mrange % 4 );
+
+ for (j=m1; j<m2; j+=4)
+ {
+ tmp1[0] = alpha * xp[j];
+ tmp1[1] = alpha * xp[j+1];
+ tmp1[2] = alpha * xp[j+2];
+ tmp1[3] = alpha * xp[j+3];
+ tmp2[0] = 0.0;
+ tmp2[1] = 0.0;
+ tmp2[2] = 0.0;
+ tmp2[3] = 0.0;
+ a0 = &a[j*lda];
+ a1 = a0+lda;
+ a2 = a1+lda;
+ a3 = a2+lda;
+ j1 = (j/8)*8;
+ if ( j1 )
+ dsymv_kernel_4x4(j1, a0, a1, a2, a3, xp, yp, tmp1, tmp2);
+ if ( j1 < j )
+ dsymv_kernel_1x4(j1, j, a0, a1, a2, a3, xp, yp, tmp1, tmp2);
+
+ j2 = 0;
+ for ( j1 = j ; j1 < j+4 ; j1++ )
+ {
+ temp1 = tmp1[j2];
+ temp2 = tmp2[j2];
+ a0 = &a[j1*lda];
+ for ( i=j ; i<j1; i++ )
+ {
+ yp[i] += temp1 * a0[i];
+ temp2 += a0[i] * xp[i];
+
+ }
+ y[j1] += temp1 * a0[j1] + alpha * temp2;
+ j2++;
+
+ }
+
+ }
+
+ for ( ; j<m; j++)
+ {
+ temp1 = alpha * xp[j];
+ temp2 = 0.0;
+ a0 = &a[j*lda];
+ FLOAT at0;
+ j1 = (j/8)*8;
+
+ if ( j1 )
+ dsymv_kernel_8x1(j1, a0, xp, yp, &temp1, &temp2);
+
+ for (i=j1 ; i<j; i++)
+ {
+ at0 = a0[i];
+ yp[i] += temp1 * at0;
+ temp2 += at0 * xp[i];
+
+ }
+
+ yp[j] += temp1 * a0[j] + alpha * temp2;
+ }
+
+ return(0);
+
+
+}
+
+
diff --git a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c
new file mode 100644
index 000000000..492920253
--- /dev/null
+++ b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c
@@ -0,0 +1,130 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_4x4 1
+static void dsymv_kernel_4x4( BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));
+
+static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vxorpd %%xmm0 , %%xmm0 , %%xmm0 \n\t" // temp2[0]
+ "vxorpd %%xmm1 , %%xmm1 , %%xmm1 \n\t" // temp2[1]
+ "vxorpd %%xmm2 , %%xmm2 , %%xmm2 \n\t" // temp2[2]
+ "vxorpd %%xmm3 , %%xmm3 , %%xmm3 \n\t" // temp2[3]
+ "vmovddup (%8), %%xmm4 \n\t" // temp1[0]
+ "vmovddup 8(%8), %%xmm5 \n\t" // temp1[1]
+	"vmovddup	16(%8),	 %%xmm6	\n\t"	// temp1[2]
+	"vmovddup	24(%8),	 %%xmm7	\n\t"	// temp1[3]
+
+ "xorq %0,%0 \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+
+ "vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a
+ "vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x
+ "vmovups (%3,%0,8), %%xmm9 \n\t" // 2 * y
+
+ "vmovups (%5,%0,8), %%xmm13 \n\t" // 2 * a
+
+ "vfmaddpd %%xmm0 , %%xmm8, %%xmm12 , %%xmm0 \n\t" // temp2 += x * a
+ "vfmaddpd %%xmm9 , %%xmm4, %%xmm12 , %%xmm9 \n\t" // y += temp1 * a
+ "vmovups (%6,%0,8), %%xmm14 \n\t" // 2 * a
+
+ "vfmaddpd %%xmm1 , %%xmm8, %%xmm13 , %%xmm1 \n\t" // temp2 += x * a
+ "vfmaddpd %%xmm9 , %%xmm5, %%xmm13 , %%xmm9 \n\t" // y += temp1 * a
+ "vmovups (%7,%0,8), %%xmm15 \n\t" // 2 * a
+
+ "vmovups 16(%3,%0,8), %%xmm11 \n\t" // 2 * y
+ "vfmaddpd %%xmm2 , %%xmm8, %%xmm14 , %%xmm2 \n\t" // temp2 += x * a
+ "vmovups 16(%4,%0,8), %%xmm12 \n\t" // 2 * a
+ "vfmaddpd %%xmm9 , %%xmm6, %%xmm14 , %%xmm9 \n\t" // y += temp1 * a
+ "vmovups 16(%2,%0,8), %%xmm10 \n\t" // 2 * x
+
+ "vfmaddpd %%xmm3 , %%xmm8, %%xmm15 , %%xmm3 \n\t" // temp2 += x * a
+ "vfmaddpd %%xmm9 , %%xmm7, %%xmm15 , %%xmm9 \n\t" // y += temp1 * a
+
+ "vmovups 16(%5,%0,8), %%xmm13 \n\t" // 2 * a
+ "vmovups 16(%6,%0,8), %%xmm14 \n\t" // 2 * a
+
+ "vfmaddpd %%xmm0 , %%xmm10, %%xmm12 , %%xmm0 \n\t" // temp2 += x * a
+ "vfmaddpd %%xmm11 , %%xmm4, %%xmm12 , %%xmm11 \n\t" // y += temp1 * a
+
+ "vmovups 16(%7,%0,8), %%xmm15 \n\t" // 2 * a
+ "vfmaddpd %%xmm1 , %%xmm10, %%xmm13 , %%xmm1 \n\t" // temp2 += x * a
+ "vfmaddpd %%xmm11 , %%xmm5, %%xmm13 , %%xmm11 \n\t" // y += temp1 * a
+
+ "vfmaddpd %%xmm2 , %%xmm10, %%xmm14 , %%xmm2 \n\t" // temp2 += x * a
+ "addq $4 , %0 \n\t"
+ "vfmaddpd %%xmm11 , %%xmm6, %%xmm14 , %%xmm11 \n\t" // y += temp1 * a
+
+ "vfmaddpd %%xmm3 , %%xmm10, %%xmm15 , %%xmm3 \n\t" // temp2 += x * a
+ "vfmaddpd %%xmm11 , %%xmm7, %%xmm15 , %%xmm11 \n\t" // y += temp1 * a
+ "subq $4 , %1 \n\t"
+
+ "vmovups %%xmm9 , -32(%3,%0,8) \n\t"
+ "vmovups %%xmm11 , -16(%3,%0,8) \n\t"
+
+ "jnz .L01LOOP%= \n\t"
+
+ "vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t"
+ "vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t"
+ "vhaddpd %%xmm2, %%xmm2, %%xmm2 \n\t"
+ "vhaddpd %%xmm3, %%xmm3, %%xmm3 \n\t"
+
+ "vmovsd %%xmm0 , (%9) \n\t" // save temp2
+ "vmovsd %%xmm1 , 8(%9) \n\t" // save temp2
+ "vmovsd %%xmm2 ,16(%9) \n\t" // save temp2
+ "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (a0), // 4
+ "r" (a1), // 5
+ "r" (a2), // 6
+ "r" (a3), // 7
+ "r" (temp1), // 8
+ "r" (temp2) // 9
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
diff --git a/kernel/x86_64/dsymv_U_microk_nehalem-2.c b/kernel/x86_64/dsymv_U_microk_nehalem-2.c
new file mode 100644
index 000000000..6aab57500
--- /dev/null
+++ b/kernel/x86_64/dsymv_U_microk_nehalem-2.c
@@ -0,0 +1,125 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_4x4 1
+static void dsymv_kernel_4x4( BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));
+
+static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "xorpd %%xmm0 , %%xmm0 \n\t" // temp2[0]
+ "xorpd %%xmm1 , %%xmm1 \n\t" // temp2[1]
+ "xorpd %%xmm2 , %%xmm2 \n\t" // temp2[2]
+ "xorpd %%xmm3 , %%xmm3 \n\t" // temp2[3]
+ "movsd (%8), %%xmm4 \n\t" // temp1[0]
+ "movsd 8(%8), %%xmm5 \n\t" // temp1[1]
+ "movsd 16(%8), %%xmm6 \n\t" // temp1[2]
+ "movsd 24(%8), %%xmm7 \n\t" // temp1[3]
+ "shufpd $0, %%xmm4, %%xmm4 \n\t"
+ "shufpd $0, %%xmm5, %%xmm5 \n\t"
+ "shufpd $0, %%xmm6, %%xmm6 \n\t"
+ "shufpd $0, %%xmm7, %%xmm7 \n\t"
+
+ "xorq %0,%0 \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ "movups (%4,%0,8), %%xmm12 \n\t" // 2 * a
+ "movups (%2,%0,8), %%xmm8 \n\t" // 2 * x
+ "movups %%xmm12 , %%xmm11 \n\t"
+ "movups (%3,%0,8), %%xmm9 \n\t" // 2 * y
+ "movups (%5,%0,8), %%xmm13 \n\t" // 2 * a
+
+ "mulpd %%xmm4 , %%xmm11 \n\t" // temp1 * a
+ "addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
+ "mulpd %%xmm8 , %%xmm12 \n\t" // a * x
+ "addpd %%xmm12 , %%xmm0 \n\t" // temp2 += x * a
+
+ "movups (%6,%0,8), %%xmm14 \n\t" // 2 * a
+ "movups (%7,%0,8), %%xmm15 \n\t" // 2 * a
+
+ "movups %%xmm13 , %%xmm11 \n\t"
+ "mulpd %%xmm5 , %%xmm11 \n\t" // temp1 * a
+ "addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
+ "mulpd %%xmm8 , %%xmm13 \n\t" // a * x
+ "addpd %%xmm13 , %%xmm1 \n\t" // temp2 += x * a
+
+ "movups %%xmm14 , %%xmm11 \n\t"
+ "mulpd %%xmm6 , %%xmm11 \n\t" // temp1 * a
+ "addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
+ "mulpd %%xmm8 , %%xmm14 \n\t" // a * x
+ "addpd %%xmm14 , %%xmm2 \n\t" // temp2 += x * a
+
+ "addq $2 , %0 \n\t"
+ "movups %%xmm15 , %%xmm11 \n\t"
+ "mulpd %%xmm7 , %%xmm11 \n\t" // temp1 * a
+ "addpd %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
+ "mulpd %%xmm8 , %%xmm15 \n\t" // a * x
+ "addpd %%xmm15 , %%xmm3 \n\t" // temp2 += x * a
+
+ "movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y
+
+ "subq $2 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ "haddpd %%xmm0, %%xmm0 \n\t"
+ "haddpd %%xmm1, %%xmm1 \n\t"
+ "haddpd %%xmm2, %%xmm2 \n\t"
+ "haddpd %%xmm3, %%xmm3 \n\t"
+
+ "movsd %%xmm0 , (%9) \n\t" // save temp2
+ "movsd %%xmm1 , 8(%9) \n\t" // save temp2
+ "movsd %%xmm2 , 16(%9) \n\t" // save temp2
+ "movsd %%xmm3 , 24(%9) \n\t" // save temp2
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (a0), // 4
+ "r" (a1), // 5
+ "r" (a2), // 6
+ "r" (a3), // 7
+ "r" (temp1), // 8
+ "r" (temp2) // 9
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c
new file mode 100644
index 000000000..da81f1354
--- /dev/null
+++ b/kernel/x86_64/saxpy.c
@@ -0,0 +1,103 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+
+#if defined(NEHALEM)
+#include "saxpy_microk_nehalem-2.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_16
+
+/* Generic C fallback, compiled only when no CPU-specific micro kernel
+   above defined HAVE_KERNEL_16.
+   Computes y[i] += (*alpha) * x[i] for i = 0 .. n-1.
+   The caller only passes n as a multiple of 16 (n & -16), so the
+   8-way unrolled loop never overruns. */
+static void saxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+	BLASLONG register i = 0;
+	FLOAT a = *alpha;
+
+	while(i < n)
+	{
+		y[i]   += a * x[i];
+		y[i+1] += a * x[i+1];
+		y[i+2] += a * x[i+2];
+		y[i+3] += a * x[i+3];
+		y[i+4] += a * x[i+4];
+		y[i+5] += a * x[i+5];
+		y[i+6] += a * x[i+6];
+		y[i+7] += a * x[i+7];
+		i+=8 ;
+
+	}
+
+}
+
+#endif
+
+/* SAXPY:  y := da * x + y  (single precision).
+   n          : vector length (<= 0 is a no-op)
+   dummy*     : unused interface padding (kept for the common GEMV/AXPY ABI)
+   da         : scalar multiplier
+   x, inc_x   : source vector and stride
+   y, inc_y   : destination vector and stride
+   Returns 0 always. */
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0,iy=0;
+
+	if ( n <= 0 ) return(0);
+
+	if ( (inc_x == 1) && (inc_y == 1) )
+	{
+
+		/* BLASLONG, not int: on LP64 targets an int here would
+		   truncate vector lengths >= 2^31 and corrupt the result */
+		BLASLONG n1 = n & -16;
+
+		if ( n1 )
+			saxpy_kernel_16(n1, x, y , &da );
+
+		/* scalar tail for the remaining n - n1 (< 16) elements */
+		i = n1;
+		while(i < n)
+		{
+
+			y[i] += da * x[i] ;
+			i++ ;
+
+		}
+		return(0);
+
+
+	}
+
+	/* generic strided case */
+	while(i < n)
+	{
+
+		y[iy] += da * x[ix] ;
+		ix += inc_x ;
+		iy += inc_y ;
+		i++ ;
+
+	}
+	return(0);
+
+}
+
+
diff --git a/kernel/x86_64/saxpy_microk_nehalem-2.c b/kernel/x86_64/saxpy_microk_nehalem-2.c
new file mode 100644
index 000000000..14ff51a0d
--- /dev/null
+++ b/kernel/x86_64/saxpy_microk_nehalem-2.c
@@ -0,0 +1,91 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_16 1
+static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
+
+/* SSE micro kernel (Nehalem): y[0..n) += (*alpha) * x[0..n).
+   n must be a non-zero multiple of 16; each iteration streams 16 floats
+   through four xmm register pairs using unaligned loads/stores.
+   Fix: i (%0) and n (%1) are updated by the asm (addq/subq), so they
+   must be declared as in-out ("+r") operands — modifying an input-only
+   operand is undefined behaviour in GCC extended asm. */
+static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+
+	BLASLONG register i = 0;
+
+	__asm__  __volatile__
+	(
+	"movss		(%4), %%xmm0		    \n\t"  // alpha
+	"shufps	$0, %%xmm0, %%xmm0	            \n\t"
+
+	".align 16			            \n\t"
+	".L01LOOP%=:				    \n\t"
+	// "prefetcht0	 192(%2,%0,4)		    \n\t"
+	// "prefetcht0	 192(%3,%0,4)		    \n\t"
+
+	"movups	          (%2,%0,4), %%xmm12	    \n\t"  // 4 * x
+	"movups	        16(%2,%0,4), %%xmm13	    \n\t"  // 4 * x
+	"movups	        32(%2,%0,4), %%xmm14	    \n\t"  // 4 * x
+	"movups	        48(%2,%0,4), %%xmm15	    \n\t"  // 4 * x
+	"movups	          (%3,%0,4), %%xmm8	    \n\t"  // 4 * y
+	"movups	        16(%3,%0,4), %%xmm9	    \n\t"  // 4 * y
+	"movups	        32(%3,%0,4), %%xmm10	    \n\t"  // 4 * y
+	"movups	        48(%3,%0,4), %%xmm11	    \n\t"  // 4 * y
+
+	"mulps          %%xmm0 , %%xmm12	    \n\t"  // alpha * x
+	"mulps          %%xmm0 , %%xmm13	    \n\t"
+	"mulps          %%xmm0 , %%xmm14	    \n\t"
+	"mulps          %%xmm0 , %%xmm15	    \n\t"
+
+	"addps         %%xmm12, %%xmm8		    \n\t"  // y += alpha *x
+	"addps         %%xmm13, %%xmm9		    \n\t"
+	"addps         %%xmm14, %%xmm10	            \n\t"
+	"addps         %%xmm15, %%xmm11	            \n\t"
+
+	"movups	%%xmm8 ,   (%3,%0,4)		    \n\t"
+	"movups	%%xmm9 , 16(%3,%0,4)		    \n\t"
+	"movups	%%xmm10, 32(%3,%0,4)		    \n\t"
+	"movups	%%xmm11, 48(%3,%0,4)		    \n\t"
+
+	"addq		$16, %0	  	 	    \n\t"
+	"subq	        $16, %1			    \n\t"
+	"jnz		.L01LOOP%=		    \n\t"
+
+	:
+	  "+r" (i),	// 0  loop index, advanced inside the asm
+	  "+r" (n)	// 1  remaining count, decremented to zero
+	:
+	  "r" (x),	// 2
+	  "r" (y),	// 3
+	  "r" (alpha)	// 4
+	: "cc",
+	  "%xmm0",
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+	  "memory"
+	);
+
+}
+
+
diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c
new file mode 100644
index 000000000..632d16810
--- /dev/null
+++ b/kernel/x86_64/sdot.c
@@ -0,0 +1,109 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+#if defined(BULLDOZER) || defined(PILEDRIVER)
+#include "sdot_microk_bulldozer-2.c"
+#elif defined(NEHALEM)
+#include "sdot_microk_nehalem-2.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_16
+
+/* Generic C fallback, compiled only when no CPU-specific micro kernel
+   above defined HAVE_KERNEL_16.
+   Accumulates sum(x[i] * y[i], i = 0 .. n-1) and ADDS it into *d
+   (the caller initialises *d).  n is always a multiple of 16 here,
+   so the 8-way unrolled loop never overruns. */
+static void sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
+{
+	BLASLONG register i = 0;
+	FLOAT dot = 0.0;
+
+	while(i < n)
+	{
+		dot += y[i]   * x[i]
+		    +  y[i+1] * x[i+1]
+		    +  y[i+2] * x[i+2]
+		    +  y[i+3] * x[i+3]
+		    +  y[i+4] * x[i+4]
+		    +  y[i+5] * x[i+5]
+		    +  y[i+6] * x[i+6]
+		    +  y[i+7] * x[i+7] ;
+
+		i+=8 ;
+
+	}
+	*d += dot;
+
+}
+
+#endif
+
+/* SDOT:  returns sum(x[i] * y[i], i = 0 .. n-1)  (single precision).
+   n        : vector length (<= 0 returns 0.0)
+   x, inc_x : first vector and stride
+   y, inc_y : second vector and stride */
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+	BLASLONG i=0;
+	BLASLONG ix=0,iy=0;
+
+	FLOAT  dot = 0.0 ;
+
+	if ( n <= 0 )  return(dot);
+
+	if ( (inc_x == 1) && (inc_y == 1) )
+	{
+
+		/* BLASLONG, not int: on LP64 targets an int here would
+		   truncate vector lengths >= 2^31 and corrupt the result */
+		BLASLONG n1 = n & -16;
+
+		if ( n1 )
+			sdot_kernel_16(n1, x, y , &dot );
+
+
+		/* scalar tail for the remaining n - n1 (< 16) elements */
+		i = n1;
+		while(i < n)
+		{
+
+			dot += y[i] * x[i] ;
+			i++ ;
+
+		}
+		return(dot);
+
+
+	}
+
+	/* generic strided case */
+	while(i < n)
+	{
+
+		dot += y[iy] * x[ix] ;
+		ix  += inc_x ;
+		iy  += inc_y ;
+		i++ ;
+
+	}
+	return(dot);
+
+}
+
+
diff --git a/kernel/x86_64/dgemv_n_microk_haswell-2.c b/kernel/x86_64/sdot_microk_bulldozer-2.c
index 28e2fe4f6..024b2ce6d 100644
--- a/kernel/x86_64/dgemv_n_microk_haswell-2.c
+++ b/kernel/x86_64/sdot_microk_bulldozer-2.c
@@ -25,48 +25,46 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#define HAVE_KERNEL_16x4 1
-static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+#define HAVE_KERNEL_16 1
+static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
-static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
+static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
{
+
BLASLONG register i = 0;
__asm__ __volatile__
(
- "vzeroupper \n\t"
- "vbroadcastsd (%2), %%ymm12 \n\t" // x0
- "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1
- "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2
- "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3
+ "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t"
+ "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t"
+ "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t"
+ "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
+ "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
+ "vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x
+ "vmovups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x
+
+ "vfmaddps %%xmm4, (%3,%0,4), %%xmm12, %%xmm4 \n\t" // 4 * y
+ "vfmaddps %%xmm5, 16(%3,%0,4), %%xmm13, %%xmm5 \n\t" // 4 * y
+ "vfmaddps %%xmm6, 32(%3,%0,4), %%xmm14, %%xmm6 \n\t" // 4 * y
+ "vfmaddps %%xmm7, 48(%3,%0,4), %%xmm15, %%xmm7 \n\t" // 4 * y
- ".align 16 \n\t"
- ".L01LOOP%=: \n\t"
- "prefetcht0 192(%3,%0,8) \n\t"
- "vmovups (%3,%0,8), %%ymm4 \n\t" // 4 * y
- "vmovups 32(%3,%0,8), %%ymm5 \n\t" // 4 * y
+ "addq $16, %0 \n\t"
+ "subq $16, %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
- "prefetcht0 192(%4,%0,8) \n\t"
- "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
- "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t"
- "prefetcht0 192(%5,%0,8) \n\t"
- "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t"
- "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
- "prefetcht0 192(%6,%0,8) \n\t"
- "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
- "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t"
- "prefetcht0 192(%7,%0,8) \n\t"
- "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t"
- "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t"
+ "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t"
+ "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t"
+ "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t"
- "vmovups %%ymm4, (%3,%0,8) \n\t" // 4 * y
- "vmovups %%ymm5, 32(%3,%0,8) \n\t" // 4 * y
+ "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t"
+ "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t"
- "addq $8 , %0 \n\t"
- "subq $8 , %1 \n\t"
- "jnz .L01LOOP%= \n\t"
- "vzeroupper \n\t"
+ "vmovss %%xmm4, (%4) \n\t"
:
:
@@ -74,12 +72,10 @@ static void dgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
- "r" (ap[0]), // 4
- "r" (ap[1]), // 5
- "r" (ap[2]), // 6
- "r" (ap[3]) // 7
+ "r" (dot) // 4
: "cc",
"%xmm4", "%xmm5",
+ "%xmm6", "%xmm7",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
diff --git a/kernel/x86_64/sdot_microk_nehalem-2.c b/kernel/x86_64/sdot_microk_nehalem-2.c
new file mode 100644
index 000000000..2a918b5ea
--- /dev/null
+++ b/kernel/x86_64/sdot_microk_nehalem-2.c
@@ -0,0 +1,94 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_16 1
+static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
+
+/* SSE micro kernel (Nehalem): *dot = sum(x[i]*y[i], i = 0 .. n-1).
+   n must be a non-zero multiple of 16; four independent xmm accumulators
+   are summed and horizontally reduced after the loop (note: *dot is
+   OVERWRITTEN here, unlike the C fallback which adds into it — the
+   caller initialises dot to 0, so the observable result is the same).
+   Fix: i (%0) and n (%1) are updated by the asm, so they are declared
+   as in-out ("+r") operands instead of input-only "r". */
+static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
+{
+
+
+	BLASLONG register i = 0;
+
+	__asm__  __volatile__
+	(
+	"xorps %%xmm4, %%xmm4		 \n\t"
+	"xorps %%xmm5, %%xmm5		 \n\t"
+	"xorps %%xmm6, %%xmm6		 \n\t"
+	"xorps %%xmm7, %%xmm7		 \n\t"
+
+	".align 16		         \n\t"
+	".L01LOOP%=:                     \n\t"
+	"movups	   (%2,%0,4), %%xmm12	 \n\t"	// 4 * x
+	"movups	   (%3,%0,4), %%xmm8	 \n\t"	// 4 * y
+	"movups	 16(%2,%0,4), %%xmm13	 \n\t"	// 4 * x
+	"movups	 16(%3,%0,4), %%xmm9	 \n\t"	// 4 * y
+	"movups	 32(%2,%0,4), %%xmm14	 \n\t"	// 4 * x
+	"movups	 32(%3,%0,4), %%xmm10	 \n\t"	// 4 * y
+	"movups	 48(%2,%0,4), %%xmm15	 \n\t"	// 4 * x
+	"movups	 48(%3,%0,4), %%xmm11	 \n\t"	// 4 * y
+
+	"mulps	 %%xmm8 , %%xmm12	 \n\t"
+	"mulps	 %%xmm9 , %%xmm13	 \n\t"
+	"mulps	 %%xmm10, %%xmm14	 \n\t"
+	"mulps	 %%xmm11, %%xmm15	 \n\t"
+
+	"addps	 %%xmm12, %%xmm4	 \n\t"
+	"addps	 %%xmm13, %%xmm5	 \n\t"
+	"addps	 %%xmm14, %%xmm6	 \n\t"
+	"addps	 %%xmm15, %%xmm7	 \n\t"
+
+	"addq	 $16, %0		 \n\t"
+	"subq	 $16, %1		 \n\t"
+	"jnz	 .L01LOOP%=		 \n\t"
+
+	"addps	 %%xmm5, %%xmm4		 \n\t"	// pairwise reduce 4 accumulators
+	"addps	 %%xmm7, %%xmm6		 \n\t"
+	"addps	 %%xmm6, %%xmm4		 \n\t"
+
+	"haddps  %%xmm4, %%xmm4		 \n\t"	// horizontal add -> lane 0
+	"haddps  %%xmm4, %%xmm4		 \n\t"
+
+	"movss	 %%xmm4, (%4)		 \n\t"
+
+	:
+	  "+r" (i),	// 0  loop index, advanced inside the asm
+	  "+r" (n)	// 1  remaining count, decremented to zero
+	:
+	  "r" (x),	// 2
+	  "r" (y),	// 3
+	  "r" (dot)	// 4
+	: "cc",
+	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+	  "memory"
+	);
+
+}
+
+
diff --git a/kernel/x86_64/sgemm_kernel_16x4_haswell.S b/kernel/x86_64/sgemm_kernel_16x4_haswell.S
index d88add02b..ef156fd27 100644
--- a/kernel/x86_64/sgemm_kernel_16x4_haswell.S
+++ b/kernel/x86_64/sgemm_kernel_16x4_haswell.S
@@ -181,8 +181,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
VFMADD231PS_( %ymm14,%ymm3,%ymm0 )
VFMADD231PS_( %ymm15,%ymm3,%ymm1 )
- addq $6*SIZE, BO
- addq $16*SIZE, AO
+ addq $ 6*SIZE, BO
+ addq $ 16*SIZE, AO
decq %rax
.endm
@@ -268,8 +268,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
VFMADD231PS_( %ymm12,%ymm2,%ymm0 )
VFMADD231PS_( %ymm14,%ymm3,%ymm0 )
- addq $6*SIZE, BO
- addq $8*SIZE, AO
+ addq $ 6*SIZE, BO
+ addq $ 8*SIZE, AO
decq %rax
.endm
@@ -327,8 +327,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
VFMADD231PS_( %xmm12,%xmm2,%xmm0 )
VFMADD231PS_( %xmm14,%xmm3,%xmm0 )
- addq $6*SIZE, BO
- addq $4*SIZE, AO
+ addq $ 6*SIZE, BO
+ addq $ 4*SIZE, AO
decq %rax
.endm
@@ -392,8 +392,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
VFMADD231SS_( %xmm14,%xmm3,%xmm0 )
VFMADD231SS_( %xmm15,%xmm3,%xmm1 )
- addq $6*SIZE, BO
- addq $2*SIZE, AO
+ addq $ 6*SIZE, BO
+ addq $ 2*SIZE, AO
decq %rax
.endm
@@ -478,8 +478,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
VFMADD231SS_( %xmm12,%xmm2,%xmm0 )
VFMADD231SS_( %xmm14,%xmm3,%xmm0 )
- addq $6*SIZE, BO
- addq $1*SIZE, AO
+ addq $ 6*SIZE, BO
+ addq $ 1*SIZE, AO
decq %rax
.endm
diff --git a/kernel/x86_64/sgemv_n.c b/kernel/x86_64/sgemv_n.c
index faa8e1f8c..36a64219a 100644
--- a/kernel/x86_64/sgemv_n.c
+++ b/kernel/x86_64/sgemv_n.c
@@ -29,17 +29,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
-#if defined(BULLDOZER) || defined(PILEDRIVER)
-#include "sgemv_n_microk_bulldozer-2.c"
-#elif defined(HASWELL)
-#include "sgemv_n_microk_haswell-2.c"
-#elif defined(SANDYBRIDGE)
-#include "sgemv_n_microk_sandy-2.c"
-#elif defined(NEHALEM)
-#include "sgemv_n_microk_nehalem-2.c"
-#endif
-
-
#define NBMAX 4096
#ifndef HAVE_KERNEL_16x4
diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c
new file mode 100644
index 000000000..0135306af
--- /dev/null
+++ b/kernel/x86_64/sgemv_n_4.c
@@ -0,0 +1,591 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+
+#if defined(BULLDOZER) || defined(PILEDRIVER)
+#include "sgemv_n_microk_bulldozer-4.c"
+#elif defined(NEHALEM)
+#include "sgemv_n_microk_nehalem-4.c"
+#elif defined(SANDYBRIDGE)
+#include "sgemv_n_microk_sandy-4.c"
+#elif defined(HASWELL)
+#include "sgemv_n_microk_haswell-4.c"
+#endif
+
+
+#define NBMAX 4096
+
+#ifndef HAVE_KERNEL_4x8
+
+/* Generic C fallback for the 8-column GEMV-N kernel:
+   y[0..n) += alpha * ( A(:,0..3) * xo[0..3] + A(:,4..7) * xo[4..7] ),
+   where ap[] holds the first four column pointers and the next four
+   columns sit lda4 elements further on.  n is a multiple of 4. */
+static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
+{
+	BLASLONG i;
+	FLOAT *a0,*a1,*a2,*a3;
+	FLOAT *b0,*b1,*b2,*b3;
+	FLOAT *x4;
+	FLOAT x[8];
+	a0 = ap[0];
+	a1 = ap[1];
+	a2 = ap[2];
+	a3 = ap[3];
+	b0 = a0 + lda4 ;      /* columns 4..7 */
+	b1 = a1 + lda4 ;
+	b2 = a2 + lda4 ;
+	b3 = a3 + lda4 ;
+	x4 = x + 4;
+
+	/* pre-scale the 8 x values by alpha once, outside the row loop */
+	for ( i=0; i<8; i++)
+		x[i] = xo[i] * *alpha;
+
+	for ( i=0; i< n; i+=4 )
+	{
+
+		y[i]   += a0[i]*x[0]   + a1[i]*x[1]   + a2[i]*x[2]   + a3[i]*x[3];
+		y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3];
+		y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3];
+		y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3];
+
+		y[i]   += b0[i]*x4[0]   + b1[i]*x4[1]   + b2[i]*x4[2]   + b3[i]*x4[3];
+		y[i+1] += b0[i+1]*x4[0] + b1[i+1]*x4[1] + b2[i+1]*x4[2] + b3[i+1]*x4[3];
+		y[i+2] += b0[i+2]*x4[0] + b1[i+2]*x4[1] + b2[i+2]*x4[2] + b3[i+2]*x4[3];
+		y[i+3] += b0[i+3]*x4[0] + b1[i+3]*x4[1] + b2[i+3]*x4[2] + b3[i+3]*x4[3];
+
+	}
+}
+
+#endif
+
+
+#ifndef HAVE_KERNEL_4x4
+
+/* Generic C fallback for the 4-column GEMV-N kernel:
+   y[0..n) += alpha * A(:,0..3) * xo[0..3], column pointers in ap[].
+   n is a multiple of 4. */
+static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
+{
+	BLASLONG i;
+	FLOAT *a0,*a1,*a2,*a3;
+	FLOAT x[4];
+	a0 = ap[0];
+	a1 = ap[1];
+	a2 = ap[2];
+	a3 = ap[3];
+
+	/* pre-scale the 4 x values by alpha once, outside the row loop */
+	for ( i=0; i<4; i++)
+		x[i] = xo[i] * *alpha;
+
+	for ( i=0; i< n; i+=4 )
+	{
+		y[i]   += a0[i]*x[0]   + a1[i]*x[1]   + a2[i]*x[2]   + a3[i]*x[3];
+		y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3];
+		y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3];
+		y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3];
+	}
+}
+
+#endif
+
+#ifndef HAVE_KERNEL_4x2
+
+static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
+
+/* SSE 2-column GEMV-N kernel: y[0..n) += alpha*(ap[0][i]*x[0] + ap[1][i]*x[1]).
+   n must be a non-zero multiple of 4.
+   Fix: i (%0) and n (%1) are updated by the asm (addq/subq), so they are
+   declared as in-out ("+r") operands instead of input-only "r". */
+static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+	BLASLONG register i = 0;
+
+	__asm__  __volatile__
+	(
+	"movss          (%2)  , %%xmm12	        \n\t"	// x0
+	"movss          (%6)  , %%xmm4 	        \n\t"	// alpha
+	"movss         4(%2)  , %%xmm13	        \n\t"	// x1
+	"mulss          %%xmm4, %%xmm12         \n\t"	// alpha * x0
+	"mulss          %%xmm4, %%xmm13         \n\t"	// alpha * x1
+	"shufps $0,     %%xmm12, %%xmm12        \n\t"	// broadcast to all 4 lanes
+	"shufps $0,     %%xmm13, %%xmm13        \n\t"
+
+	".align 16				 \n\t"
+	".L01LOOP%=:				 \n\t"
+	"movups	       (%3,%0,4), %%xmm4	 \n\t"	// 4 * y
+
+	"movups             (%4,%0,4), %%xmm8    \n\t"
+	"movups             (%5,%0,4), %%xmm9    \n\t"
+	"mulps		%%xmm12, %%xmm8          \n\t"
+	"mulps		%%xmm13, %%xmm9          \n\t"
+	"addps		%%xmm8 , %%xmm4          \n\t"
+	"addq		$4     , %0	  	 \n\t"
+	"addps		%%xmm9 , %%xmm4          \n\t"
+
+	"movups  %%xmm4 , -16(%3,%0,4)		 \n\t"	// 4 * y
+
+	"subq	        $4 , %1			 \n\t"
+	"jnz		.L01LOOP%=		 \n\t"
+
+	:
+	  "+r" (i),	// 0  loop index, advanced inside the asm
+	  "+r" (n)	// 1  remaining count, decremented to zero
+	:
+	  "r" (x),	// 2
+	  "r" (y),	// 3
+	  "r" (ap[0]),	// 4
+	  "r" (ap[1]),	// 5
+	  "r" (alpha)	// 6
+	: "cc",
+	  "%xmm4", "%xmm5",
+	  "%xmm6", "%xmm7",
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+	  "memory"
+	);
+
+}
+
+#endif
+
+/* Fix: this guard previously tested HAVE_KERNEL_4x2 (copy-paste from the
+   block above).  A micro kernel file defining HAVE_KERNEL_4x2 but not
+   providing a 4x1 kernel would then remove the only sgemv_kernel_4x1
+   implementation and break the link.  Guard on the macro that matches
+   this kernel. */
+#ifndef HAVE_KERNEL_4x1
+
+static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
+
+/* SSE 1-column GEMV-N kernel: y[0..n) += alpha * x[0] * ap[0..n).
+   n is a multiple of 4; the main loop handles 8 rows, the trailing
+   block of 4 rows is handled after .L16END.
+   Fix: i (%0) and n1 (%1) are updated by the asm, so they are declared
+   as in-out ("+r") operands instead of input-only "r". */
+static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+	BLASLONG register i = 0;
+	BLASLONG register n1 = n & -8 ;
+	BLASLONG register n2 = n & 4  ;
+
+	__asm__  __volatile__
+	(
+	"movss		(%2), %%xmm12	         \n\t"	// x0
+	"mulss		(%6), %%xmm12	         \n\t"	// alpha * x0
+	"shufps $0,	%%xmm12, %%xmm12         \n\t"	// broadcast to all 4 lanes
+
+	"cmpq		$0, %1			 \n\t"
+	"je		.L16END%=		 \n\t"
+
+	".align 16				 \n\t"
+	".L01LOOP%=:				 \n\t"
+	"movups	       (%3,%0,4), %%xmm4	 \n\t"	// 4 * y
+	"movups      16(%3,%0,4), %%xmm5	 \n\t"	// 4 * y
+	"movups	       (%4,%0,4), %%xmm8	 \n\t"	// 4 * a
+	"movups      16(%4,%0,4), %%xmm9	 \n\t"	// 4 * a
+	"mulps          %%xmm12, %%xmm8          \n\t"
+	"mulps          %%xmm12, %%xmm9          \n\t"
+	"addps		%%xmm4 , %%xmm8          \n\t"
+	"addps		%%xmm5 , %%xmm9          \n\t"
+
+	"addq		$8 , %0	  	 	 \n\t"
+	"movups  %%xmm8 , -32(%3,%0,4)		 \n\t"	// 4 * y
+	"movups  %%xmm9 , -16(%3,%0,4)		 \n\t"	// 4 * y
+
+	"subq	        $8 , %1			 \n\t"
+
+	"jnz		.L01LOOP%=		 \n\t"
+
+	".L16END%=:				 \n\t"
+
+	"testq	$0x04, %5			 \n\t"	// remaining block of 4 rows?
+	"jz	.L08LABEL%=			 \n\t"
+
+	"movups	       (%3,%0,4), %%xmm4	 \n\t"	// 4 * y
+	"movups	       (%4,%0,4), %%xmm8	 \n\t"	// 4 * a
+	"mulps          %%xmm12, %%xmm8          \n\t"
+	"addps		%%xmm8 , %%xmm4          \n\t"
+	"movups  %%xmm4 ,   (%3,%0,4)		 \n\t"	// 4 * y
+	"addq		$4 , %0	  	 	 \n\t"
+	"subq	        $4 , %1			 \n\t"
+
+	".L08LABEL%=:				 \n\t"
+	:
+	  "+r" (i),	// 0  loop index, advanced inside the asm
+	  "+r" (n1)	// 1  8-row trip count, decremented to zero
+	:
+	  "r" (x),	// 2
+	  "r" (y),	// 3
+	  "r" (ap),	// 4
+	  "r" (n2),	// 5
+	  "r" (alpha)	// 6
+	: "cc",
+	  "%xmm4", "%xmm5",
+	  "%xmm6", "%xmm7",
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+	  "memory"
+	);
+
+}
+
+#endif
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline));
+
+/* Adds the contiguous buffer src[0..n) into dest with stride inc_dest.
+   For inc_dest == 1 a 4-wide SSE loop is used (n is a multiple of 4
+   in that path, guaranteed by the caller's NB blocking).
+   Fix: i (%0) and n (%1) are updated by the asm, so they are declared
+   as in-out ("+r") operands instead of input-only "r". */
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
+{
+	BLASLONG i;
+	if ( inc_dest != 1 )
+	{
+		/* strided destination: plain scalar loop */
+		for ( i=0; i<n; i++ )
+		{
+			*dest += *src;
+			src++;
+			dest += inc_dest;
+		}
+		return;
+	}
+
+	i=0;
+
+	__asm__  __volatile__
+	(
+
+	".align 16				 \n\t"
+	".L01LOOP%=:				 \n\t"
+
+	"movups	(%2,%0,4) , %%xmm12		 \n\t"
+	"movups	(%3,%0,4) , %%xmm11		 \n\t"
+	"addps	%%xmm12   , %%xmm11		 \n\t"
+	"addq	$4        , %0	  	 	 \n\t"
+	"movups	%%xmm11, -16(%3,%0,4)		 \n\t"
+
+	"subq	$4        , %1			 \n\t"
+	"jnz	.L01LOOP%=			 \n\t"
+
+	:
+	  "+r" (i),	// 0  loop index, advanced inside the asm
+	  "+r" (n)	// 1  remaining count, decremented to zero
+	:
+	  "r" (src),	// 2
+	  "r" (dest)	// 3
+	: "cc",
+	  "%xmm10", "%xmm11", "%xmm12",
+	  "memory"
+	);
+
+}
+
+/* SGEMV (no-transpose) driver:  y := alpha * A * x + y,
+   A is m x n, column major with leading dimension lda.
+   Rows are processed in chunks of at most NBMAX; within a chunk the
+   columns are consumed in groups of 8/4/2/1 by the kernels above.
+   The final m % 4 rows are handled by scalar tail code at the bottom.
+   When inc_y != 1 the chunk result is accumulated in 'buffer' and
+   scattered into y by add_y(). */
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+	BLASLONG i;
+	BLASLONG j;
+	FLOAT *a_ptr;
+	FLOAT *x_ptr;
+	FLOAT *y_ptr;
+	FLOAT *ap[4];
+	BLASLONG n1;
+	BLASLONG m1;
+	BLASLONG m2;
+	BLASLONG m3;
+	BLASLONG n2;
+	BLASLONG lda4 =  lda << 2;	// 4 columns worth of elements
+	BLASLONG lda8 =  lda << 3;	// 8 columns
+	FLOAT xbuffer[8],*ybuffer;
+
+	if ( m < 1 ) return(0);
+	if ( n < 1 ) return(0);
+
+	ybuffer = buffer;
+
+	/* column-group counts: 8-wide groups need contiguous x (inc_x==1) */
+	if ( inc_x == 1 )
+	{
+		n1 = n >> 3 ;
+		n2 = n &  7 ;
+	}
+	else
+	{
+		n1 = n >> 2 ;
+		n2 = n &  3 ;
+
+	}
+
+	m3 = m & 3  ;			// trailing rows (scalar tails below)
+	m1 = m & -4 ;			// rows handled in full 4-blocks
+	m2 = (m & (NBMAX-1)) - m3 ;	// size of the last partial chunk
+
+
+	y_ptr = y;
+
+	BLASLONG NB = NBMAX;
+
+	while ( NB == NBMAX )
+	{
+
+		m1 -= NB;
+		if ( m1 < 0)
+		{
+			if ( m2 == 0 ) break;	// no partial chunk left
+			NB = m2;		// last, smaller chunk
+		}
+
+		a_ptr = a;
+		x_ptr = x;
+
+		ap[0] = a_ptr;
+		ap[1] = a_ptr + lda;
+		ap[2] = ap[1] + lda;
+		ap[3] = ap[2] + lda;
+
+		if ( inc_y != 1 )
+			memset(ybuffer,0,NB*4);	// 4 == sizeof(FLOAT)
+		else
+			ybuffer = y_ptr;	// write straight into y
+
+		if ( inc_x == 1 )
+		{
+
+
+			for( i = 0; i < n1 ; i++)
+			{
+				sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha);
+				ap[0] += lda8;
+				ap[1] += lda8;
+				ap[2] += lda8;
+				ap[3] += lda8;
+				a_ptr += lda8;
+				x_ptr += 8;
+			}
+
+
+			if ( n2 & 4 )
+			{
+				sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha);
+				/* only ap[0]/ap[1] advance: the remaining
+				   branches use at most two column pointers */
+				ap[0] += lda4;
+				ap[1] += lda4;
+				a_ptr += lda4;
+				x_ptr += 4;
+			}
+
+			if ( n2 & 2 )
+			{
+				sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha);
+				a_ptr += lda*2;
+				x_ptr += 2;
+			}
+
+
+			if ( n2 & 1 )
+			{
+				sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha);
+				a_ptr += lda;
+				x_ptr += 1;
+
+			}
+
+
+		}
+		else
+		{
+			/* strided x: gather 4 values at a time into xbuffer */
+			for( i = 0; i < n1 ; i++)
+			{
+				xbuffer[0] = x_ptr[0];
+				x_ptr += inc_x;
+				xbuffer[1] = x_ptr[0];
+				x_ptr += inc_x;
+				xbuffer[2] = x_ptr[0];
+				x_ptr += inc_x;
+				xbuffer[3] = x_ptr[0];
+				x_ptr += inc_x;
+				sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha);
+				ap[0] += lda4;
+				ap[1] += lda4;
+				ap[2] += lda4;
+				ap[3] += lda4;
+				a_ptr += lda4;
+			}
+
+			for( i = 0; i < n2 ; i++)
+			{
+				xbuffer[0] = x_ptr[0];
+				x_ptr += inc_x;
+				sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha);
+				a_ptr += lda;
+
+			}
+
+		}
+
+		a += NB;			// advance to next row chunk
+		if ( inc_y != 1 )
+		{
+			add_y(NB,ybuffer,y_ptr,inc_y);
+			y_ptr += NB * inc_y;
+		}
+		else
+			y_ptr += NB ;
+
+	}
+
+	/* scalar tails for the last m % 4 rows: computed as dot products
+	   of a row of A with x (with a 4-way unrolled fast path when the
+	   row stride collapses to lda == m3 and inc_x == 1) */
+	if ( m3 == 0 ) return(0);
+
+	if ( m3 == 3 )
+	{
+		a_ptr = a;
+		x_ptr = x;
+		FLOAT temp0 = 0.0;
+		FLOAT temp1 = 0.0;
+		FLOAT temp2 = 0.0;
+		if ( lda == 3 && inc_x ==1 )
+		{
+
+			for( i = 0; i < ( n & -4 ); i+=4 )
+			{
+
+				temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+				temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
+				temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];
+
+				temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9]  * x_ptr[3];
+				temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
+				temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];
+
+				a_ptr += 12;
+				x_ptr += 4;
+			}
+
+			for( ; i < n; i++ )
+			{
+				temp0 += a_ptr[0] * x_ptr[0];
+				temp1 += a_ptr[1] * x_ptr[0];
+				temp2 += a_ptr[2] * x_ptr[0];
+				a_ptr += 3;
+				x_ptr ++;
+			}
+
+		}
+		else
+		{
+
+			for( i = 0; i < n; i++ )
+			{
+				temp0 += a_ptr[0] * x_ptr[0];
+				temp1 += a_ptr[1] * x_ptr[0];
+				temp2 += a_ptr[2] * x_ptr[0];
+				a_ptr += lda;
+				x_ptr += inc_x;
+
+
+			}
+
+		}
+		y_ptr[0] += alpha * temp0;
+		y_ptr += inc_y;
+		y_ptr[0] += alpha * temp1;
+		y_ptr += inc_y;
+		y_ptr[0] += alpha * temp2;
+		return(0);
+	}
+
+
+	if ( m3 == 2 )
+	{
+		a_ptr = a;
+		x_ptr = x;
+		FLOAT temp0 = 0.0;
+		FLOAT temp1 = 0.0;
+		if ( lda == 2 && inc_x ==1 )
+		{
+
+			for( i = 0; i < (n & -4) ; i+=4 )
+			{
+				temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
+				temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+				temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
+				temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
+				a_ptr += 8;
+				x_ptr += 4;
+
+			}
+
+
+			for( ; i < n; i++ )
+			{
+				temp0 += a_ptr[0]   * x_ptr[0];
+				temp1 += a_ptr[1]   * x_ptr[0];
+				a_ptr += 2;
+				x_ptr ++;
+			}
+
+		}
+		else
+		{
+
+			for( i = 0; i < n; i++ )
+			{
+				temp0 += a_ptr[0] * x_ptr[0];
+				temp1 += a_ptr[1] * x_ptr[0];
+				a_ptr += lda;
+				x_ptr += inc_x;
+
+
+			}
+
+		}
+		y_ptr[0] += alpha * temp0;
+		y_ptr += inc_y;
+		y_ptr[0] += alpha * temp1;
+		return(0);
+	}
+
+	if ( m3 == 1 )
+	{
+		a_ptr = a;
+		x_ptr = x;
+		FLOAT temp = 0.0;
+		if ( lda == 1 && inc_x ==1 )
+		{
+
+			for( i = 0; i < (n & -4); i+=4 )
+			{
+				temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
+
+			}
+
+			for( ; i < n; i++ )
+			{
+				temp += a_ptr[i] * x_ptr[i];
+			}
+
+		}
+		else
+		{
+
+			for( i = 0; i < n; i++ )
+			{
+				temp += a_ptr[0] * x_ptr[0];
+				a_ptr += lda;
+				x_ptr += inc_x;
+			}
+
+		}
+		y_ptr[0] += alpha * temp;
+		return(0);
+	}
+
+
+	return(0);
+}
+
+
diff --git a/kernel/x86_64/sgemv_n_avx.c b/kernel/x86_64/sgemv_n_avx.c
deleted file mode 100644
index 57aaad4b4..000000000
--- a/kernel/x86_64/sgemv_n_avx.c
+++ /dev/null
@@ -1,218 +0,0 @@
-/***************************************************************************
-Copyright (c) 2014, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-
-#include "common.h"
-
-#if defined(BULLDOZER) || defined(PILEDRIVER)
-#include "sgemv_n_microk_bulldozer.c"
-#elif defined(HASWELL)
-#include "sgemv_n_microk_haswell.c"
-#else
-#include "sgemv_n_microk_sandy.c"
-#endif
-
-static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
-{
- BLASLONG i;
- for ( i=0; i<n; i++ )
- {
- *dest = *src;
- dest++;
- src += inc_src;
- }
-}
-
-static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
-{
- BLASLONG i;
- for ( i=0; i<n; i++ )
- {
- *dest += *src;
- src++;
- dest += inc_dest;
- }
-}
-
-int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
-{
- BLASLONG i;
- BLASLONG j;
- FLOAT *a_ptr;
- FLOAT *x_ptr;
- FLOAT *y_ptr;
- BLASLONG n1;
- BLASLONG m1;
- BLASLONG register m2;
- BLASLONG register n2;
- FLOAT *xbuffer,*ybuffer;
- xbuffer = buffer;
- ybuffer = xbuffer + 2048 + 256;
-
- n1 = n / 512 ;
- n2 = n % 512 ;
-
- m1 = m / 64;
- m2 = m % 64;
-
- y_ptr = y;
- x_ptr = x;
-
- for (j=0; j<n1; j++)
- {
-
- if ( inc_x == 1 )
- xbuffer = x_ptr;
- else
- copy_x(512,x_ptr,xbuffer,inc_x);
-
- a_ptr = a + j * 512 * lda;
- y_ptr = y;
-
- for(i = 0; i<m1; i++ )
- {
- sgemv_kernel_64(512,alpha,a_ptr,lda,xbuffer,ybuffer);
- add_y(64,ybuffer,y_ptr,inc_y);
- y_ptr += 64 * inc_y;
- a_ptr += 64;
-
- }
-
- if ( m2 & 32 )
- {
- sgemv_kernel_32(512,alpha,a_ptr,lda,xbuffer,ybuffer);
- add_y(32,ybuffer,y_ptr,inc_y);
- y_ptr += 32 * inc_y;
- a_ptr += 32;
-
- }
-
- if ( m2 & 16 )
- {
- sgemv_kernel_16(512,alpha,a_ptr,lda,xbuffer,ybuffer);
- add_y(16,ybuffer,y_ptr,inc_y);
- y_ptr += 16 * inc_y;
- a_ptr += 16;
- }
- if ( m2 & 8 )
- {
- sgemv_kernel_8(512,alpha,a_ptr,lda,xbuffer,ybuffer);
- add_y(8,ybuffer,y_ptr,inc_y);
- y_ptr += 8 * inc_y;
- a_ptr += 8;
- }
- if ( m2 & 4 )
- {
- sgemv_kernel_4(512,alpha,a_ptr,lda,xbuffer,ybuffer);
- add_y(4,ybuffer,y_ptr,inc_y);
- y_ptr += 4 * inc_y;
- a_ptr += 4;
- }
- if ( m2 & 2 )
- {
- sgemv_kernel_2(512,alpha,a_ptr,lda,xbuffer,ybuffer);
- add_y(2,ybuffer,y_ptr,inc_y);
- y_ptr += 2 * inc_y;
- a_ptr += 2;
- }
- if ( m2 & 1 )
- {
- sgemv_kernel_1(512,alpha,a_ptr,lda,xbuffer,ybuffer);
- add_y(1,ybuffer,y_ptr,inc_y);
- }
- x_ptr += 512 * inc_x;
-
- }
-
- if ( n2 > 0 )
- {
-
- if ( inc_x == 1 )
- xbuffer = x_ptr;
- else
- copy_x(n2,x_ptr,xbuffer,inc_x);
-
- a_ptr = a + n1 * 512 * lda;
- y_ptr = y;
-
- for(i = 0; i<m1; i++ )
- {
- sgemv_kernel_64(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
- add_y(64,ybuffer,y_ptr,inc_y);
- y_ptr += 64 * inc_y;
- a_ptr += 64;
-
- }
-
- if ( m2 & 32 )
- {
- sgemv_kernel_32(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
- add_y(32,ybuffer,y_ptr,inc_y);
- y_ptr += 32 * inc_y;
- a_ptr += 32;
-
- }
- if ( m2 & 16 )
- {
- sgemv_kernel_16(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
- add_y(16,ybuffer,y_ptr,inc_y);
- y_ptr += 16 * inc_y;
- a_ptr += 16;
- }
- if ( m2 & 8 )
- {
- sgemv_kernel_8(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
- add_y(8,ybuffer,y_ptr,inc_y);
- y_ptr += 8 * inc_y;
- a_ptr += 8;
- }
- if ( m2 & 4 )
- {
- sgemv_kernel_4(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
- add_y(4,ybuffer,y_ptr,inc_y);
- y_ptr += 4 * inc_y;
- a_ptr += 4;
- }
- if ( m2 & 2 )
- {
- sgemv_kernel_2(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
- add_y(2,ybuffer,y_ptr,inc_y);
- y_ptr += 2 * inc_y;
- a_ptr += 2;
- }
- if ( m2 & 1 )
- {
- sgemv_kernel_1(n2,alpha,a_ptr,lda,xbuffer,ybuffer);
- add_y(1,ybuffer,y_ptr,inc_y);
- }
-
-
- }
- return(0);
-}
-
-
diff --git a/kernel/x86_64/sgemv_n_microk_bulldozer-2.c b/kernel/x86_64/sgemv_n_microk_bulldozer-2.c
deleted file mode 100644
index c4a490587..000000000
--- a/kernel/x86_64/sgemv_n_microk_bulldozer-2.c
+++ /dev/null
@@ -1,99 +0,0 @@
-/***************************************************************************
-Copyright (c) 2014, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-#define HAVE_KERNEL_16x4 1
-static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-
-static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
-{
-
- BLASLONG register i = 0;
-
- __asm__ __volatile__
- (
- "vbroadcastss (%2), %%xmm12 \n\t" // x0
- "vbroadcastss 4(%2), %%xmm13 \n\t" // x1
- "vbroadcastss 8(%2), %%xmm14 \n\t" // x2
- "vbroadcastss 12(%2), %%xmm15 \n\t" // x3
-
- ".align 16 \n\t"
- ".L01LOOP%=: \n\t"
- "vmovups (%3,%0,4), %%xmm4 \n\t" // 4 * y
- "vmovups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y
- "vmovups 32(%3,%0,4), %%xmm6 \n\t" // 4 * y
- "vmovups 48(%3,%0,4), %%xmm7 \n\t" // 4 * y
-
- "prefetcht0 192(%4,%0,4) \n\t"
- "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
- "vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t"
- "vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t"
- "prefetcht0 192(%5,%0,4) \n\t"
- "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
- "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t"
- "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t"
- "prefetcht0 192(%6,%0,4) \n\t"
- "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
- "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t"
- "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t"
- "prefetcht0 192(%7,%0,4) \n\t"
- "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"
- "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t"
- "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t"
-
- "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y
- "vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y
- "vmovups %%xmm6, 32(%3,%0,4) \n\t" // 4 * y
- "vmovups %%xmm7, 48(%3,%0,4) \n\t" // 4 * y
-
- "addq $16, %0 \n\t"
- "subq $16, %1 \n\t"
- "jnz .L01LOOP%= \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (n), // 1
- "r" (x), // 2
- "r" (y), // 3
- "r" (ap[0]), // 4
- "r" (ap[1]), // 5
- "r" (ap[2]), // 6
- "r" (ap[3]) // 7
- : "cc",
- "%xmm4", "%xmm5",
- "%xmm6", "%xmm7",
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
- "memory"
- );
-
-}
-
-
diff --git a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c
new file mode 100644
index 000000000..40238be49
--- /dev/null
+++ b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c
@@ -0,0 +1,269 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+
+#define HAVE_KERNEL_4x8 1
+static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
+
+static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vbroadcastss (%2), %%xmm12 \n\t" // x0
+ "vbroadcastss 4(%2), %%xmm13 \n\t" // x1
+ "vbroadcastss 8(%2), %%xmm14 \n\t" // x2
+ "vbroadcastss 12(%2), %%xmm15 \n\t" // x3
+ "vbroadcastss 16(%2), %%xmm0 \n\t" // x4
+ "vbroadcastss 20(%2), %%xmm1 \n\t" // x5
+ "vbroadcastss 24(%2), %%xmm2 \n\t" // x6
+ "vbroadcastss 28(%2), %%xmm3 \n\t" // x7
+
+ "vbroadcastss (%9), %%xmm8 \n\t" // alpha
+
+ "testq $0x04, %1 \n\t"
+ "jz .L08LABEL%= \n\t"
+
+ "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
+ "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
+
+ "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t"
+ "addq $4 , %0 \n\t"
+
+ "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t"
+ "addq $4 , %8 \n\t"
+
+ "vaddps %%xmm5 , %%xmm4, %%xmm4 \n\t"
+ "vfmaddps -16(%3,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t"
+ "subq $4 , %1 \n\t"
+ "vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y
+
+ ".L08LABEL%=: \n\t"
+
+ "testq $0x08, %1 \n\t"
+ "jz .L16LABEL%= \n\t"
+
+ "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
+ "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
+
+ "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"
+
+ "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t"
+
+ "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
+ "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
+ "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y
+ "vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y
+
+ "addq $8 , %0 \n\t"
+ "addq $8 , %8 \n\t"
+ "subq $8 , %1 \n\t"
+
+
+ ".L16LABEL%=: \n\t"
+
+ "cmpq $0, %1 \n\t"
+ "je .L16END%= \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+
+ "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
+ "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
+ "vxorps %%xmm6, %%xmm6 , %%xmm6 \n\t"
+ "vxorps %%xmm7, %%xmm7 , %%xmm7 \n\t"
+
+ "prefetcht0 192(%4,%0,4) \n\t"
+ "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
+ "prefetcht0 192(%5,%0,4) \n\t"
+ "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
+ "prefetcht0 192(%6,%0,4) \n\t"
+ "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
+ "prefetcht0 192(%7,%0,4) \n\t"
+ "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t"
+ ".align 2 \n\t"
+ "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"
+
+ "vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t"
+ "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t"
+ "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t"
+ "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t"
+
+ "prefetcht0 192(%4,%8,4) \n\t"
+ "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t"
+ "prefetcht0 192(%5,%8,4) \n\t"
+ "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t"
+ "prefetcht0 192(%6,%8,4) \n\t"
+ "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t"
+ "prefetcht0 192(%7,%8,4) \n\t"
+ "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t"
+
+ "vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t"
+ "vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t"
+ "vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t"
+ "vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t"
+
+ "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
+ "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
+ "vfmaddps 32(%3,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t"
+ "vfmaddps 48(%3,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t"
+
+ "addq $16, %0 \n\t"
+ "vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y
+ "vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y
+ "addq $16, %8 \n\t"
+ "vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y
+ "vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y
+
+ "subq $16, %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ ".L16END%=: \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (ap[2]), // 6
+ "r" (ap[3]), // 7
+ "r" (lda4), // 8
+ "r" (alpha) // 9
+ : "cc",
+ "%xmm0", "%xmm1",
+ "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5",
+ "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
+
+
+#define HAVE_KERNEL_4x4 1
+static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
+
+static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vbroadcastss (%2), %%xmm12 \n\t" // x0
+ "vbroadcastss 4(%2), %%xmm13 \n\t" // x1
+ "vbroadcastss 8(%2), %%xmm14 \n\t" // x2
+ "vbroadcastss 12(%2), %%xmm15 \n\t" // x3
+
+ "vbroadcastss (%8), %%xmm8 \n\t" // alpha
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
+ "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
+
+ "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t"
+
+ "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t"
+
+ "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm6 \n\t"
+ "vmovups %%xmm6, (%3,%0,4) \n\t" // 4 * y
+
+ "addq $4 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (ap[2]), // 6
+ "r" (ap[3]), // 7
+ "r" (alpha) // 8
+ : "cc",
+ "%xmm4", "%xmm5",
+ "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
diff --git a/kernel/x86_64/sgemv_n_microk_bulldozer.c b/kernel/x86_64/sgemv_n_microk_bulldozer.c
deleted file mode 100644
index 1b07f0291..000000000
--- a/kernel/x86_64/sgemv_n_microk_bulldozer.c
+++ /dev/null
@@ -1,451 +0,0 @@
-/***************************************************************************
-Copyright (c) 2014, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y)
-{
-
-
- float *pre = a + lda*3;
-
- __asm__ __volatile__
- (
- "movq %0, %%rax\n\t" // n -> rax
- "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
- "movq %2, %%rsi\n\t" // adress of a -> rsi
- "movq %3, %%rcx\n\t" // value of lda > rcx
- "movq %4, %%rdi\n\t" // adress of x -> rdi
- "movq %5, %%rdx\n\t" // adress of y -> rdx
- "movq %6, %%r8\n\t" // address for prefetch
- "prefetcht0 (%%r8)\n\t" // Prefetch
- "prefetcht0 64(%%r8)\n\t" // Prefetch
-
- "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
- "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
- "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero
- "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero
- "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero
- "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero
- "vxorps %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero
- "vxorps %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero
- ".align 16 \n\t"
- ".L01LOOP%=: \n\t"
- "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
- "nop \n\t"
- "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch
-
- "prefetcht0 (%%r8)\n\t" // Prefetch
- "vfmaddps %%ymm8 , 0*4(%%rsi), %%ymm0, %%ymm8 \n\t" // multiply a and c and add to temp
- "prefetcht0 64(%%r8)\n\t" // Prefetch
- "vfmaddps %%ymm9 , 8*4(%%rsi), %%ymm0, %%ymm9 \n\t" // multiply a and c and add to temp
- "prefetcht0 128(%%r8)\n\t" // Prefetch
- "vfmaddps %%ymm10, 16*4(%%rsi), %%ymm0, %%ymm10\n\t" // multiply a and c and add to temp
- "vfmaddps %%ymm11, 24*4(%%rsi), %%ymm0, %%ymm11\n\t" // multiply a and c and add to temp
- "prefetcht0 192(%%r8)\n\t" // Prefetch
- "vfmaddps %%ymm12, 32*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp
- "vfmaddps %%ymm13, 40*4(%%rsi), %%ymm0, %%ymm13\n\t" // multiply a and c and add to temp
- "vfmaddps %%ymm14, 48*4(%%rsi), %%ymm0, %%ymm14\n\t" // multiply a and c and add to temp
- "vfmaddps %%ymm15, 56*4(%%rsi), %%ymm0, %%ymm15\n\t" // multiply a and c and add to temp
-
- "addq $4 , %%rdi \n\t" // increment pointer of c
- "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
-
- "dec %%rax \n\t" // n = n -1
- "jnz .L01LOOP%= \n\t"
-
- "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
- "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha
- "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha
- "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha
- "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha
- "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha
- "vmulps %%ymm14, %%ymm1, %%ymm14\n\t" // scale by alpha
- "vmulps %%ymm15, %%ymm1, %%ymm15\n\t" // scale by alpha
-
- "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
- "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y
- "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y
- "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y
- "vmovups %%ymm12, 32*4(%%rdx) \n\t" // store temp -> y
- "vmovups %%ymm13, 40*4(%%rdx) \n\t" // store temp -> y
- "vmovups %%ymm14, 48*4(%%rdx) \n\t" // store temp -> y
- "vmovups %%ymm15, 56*4(%%rdx) \n\t" // store temp -> y
-
- :
- :
- "m" (n), // 0
- "m" (alpha), // 1
- "m" (a), // 2
- "m" (lda), // 3
- "m" (x), // 4
- "m" (y), // 5
- "m" (pre) // 6
- : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
- "%xmm0", "%xmm1",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
- "memory"
- );
-
-}
-
-
-
-static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y)
-{
-
-
- float *pre = a + lda*3;
-
- __asm__ __volatile__
- (
- "movq %0, %%rax\n\t" // n -> rax
- "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1
- "movq %2, %%rsi\n\t" // adress of a -> rsi
- "movq %3, %%rcx\n\t" // value of lda > rcx
- "movq %4, %%rdi\n\t" // adress of x -> rdi
- "movq %5, %%rdx\n\t" // adress of y -> rdx
- "movq %6, %%r8\n\t" // address for prefetch
- "prefetcht0 (%%r8)\n\t" // Prefetch
- "prefetcht0 64(%%r8)\n\t" // Prefetch
-
- "vxorps %%xmm8 , %%xmm8 , %%xmm8 \n\t" // set to zero
- "vxorps %%xmm9 , %%xmm9 , %%xmm9 \n\t" // set to zero
- "vxorps %%xmm10, %%xmm10, %%xmm10\n\t" // set to zero
- "vxorps %%xmm11, %%xmm11, %%xmm11\n\t" // set to zero
- "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
- "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero
- "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero
- "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero
- ".align 16 \n\t"
- ".L01LOOP%=: \n\t"
- "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c
- "nop \n\t"
- "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch
-
- "prefetcht0 (%%r8)\n\t" // Prefetch
- "vfmaddps %%xmm8 , 0*4(%%rsi), %%xmm0, %%xmm8 \n\t" // multiply a and c and add to temp
- "prefetcht0 64(%%r8)\n\t" // Prefetch
- "vfmaddps %%xmm9 , 4*4(%%rsi), %%xmm0, %%xmm9 \n\t" // multiply a and c and add to temp
- "vfmaddps %%xmm10, 8*4(%%rsi), %%xmm0, %%xmm10\n\t" // multiply a and c and add to temp
- "vfmaddps %%xmm11, 12*4(%%rsi), %%xmm0, %%xmm11\n\t" // multiply a and c and add to temp
- "vfmaddps %%xmm12, 16*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp
- "vfmaddps %%xmm13, 20*4(%%rsi), %%xmm0, %%xmm13\n\t" // multiply a and c and add to temp
- "vfmaddps %%xmm14, 24*4(%%rsi), %%xmm0, %%xmm14\n\t" // multiply a and c and add to temp
- "vfmaddps %%xmm15, 28*4(%%rsi), %%xmm0, %%xmm15\n\t" // multiply a and c and add to temp
-
- "addq $4 , %%rdi \n\t" // increment pointer of c
- "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
-
- "dec %%rax \n\t" // n = n -1
- "jnz .L01LOOP%= \n\t"
-
- "vmulps %%xmm8 , %%xmm1, %%xmm8 \n\t" // scale by alpha
- "vmulps %%xmm9 , %%xmm1, %%xmm9 \n\t" // scale by alpha
- "vmulps %%xmm10, %%xmm1, %%xmm10\n\t" // scale by alpha
- "vmulps %%xmm11, %%xmm1, %%xmm11\n\t" // scale by alpha
- "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
- "vmulps %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha
- "vmulps %%xmm14, %%xmm1, %%xmm14\n\t" // scale by alpha
- "vmulps %%xmm15, %%xmm1, %%xmm15\n\t" // scale by alpha
-
- "vmovups %%xmm8 , (%%rdx) \n\t" // store temp -> y
- "vmovups %%xmm9 , 4*4(%%rdx) \n\t" // store temp -> y
- "vmovups %%xmm10, 8*4(%%rdx) \n\t" // store temp -> y
- "vmovups %%xmm11, 12*4(%%rdx) \n\t" // store temp -> y
- "vmovups %%xmm12, 16*4(%%rdx) \n\t" // store temp -> y
- "vmovups %%xmm13, 20*4(%%rdx) \n\t" // store temp -> y
- "vmovups %%xmm14, 24*4(%%rdx) \n\t" // store temp -> y
- "vmovups %%xmm15, 28*4(%%rdx) \n\t" // store temp -> y
-
- :
- :
- "m" (n), // 0
- "m" (alpha), // 1
- "m" (a), // 2
- "m" (lda), // 3
- "m" (x), // 4
- "m" (y), // 5
- "m" (pre) // 6
- );
-
-}
-
-static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y)
-{
-
- float *pre = a + lda*3;
-
- __asm__ __volatile__
- (
- "movq %0, %%rax\n\t" // n -> rax
- "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
- "movq %2, %%rsi\n\t" // adress of a -> rsi
- "movq %3, %%rcx\n\t" // value of lda > rcx
- "movq %4, %%rdi\n\t" // adress of x -> rdi
- "movq %5, %%rdx\n\t" // adress of y -> rdx
- "movq %6, %%r8\n\t" // address for prefetch
- "prefetcht0 (%%r8)\n\t" // Prefetch
-
- "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero
- "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero
-
- ".L01LOOP%=: \n\t"
- "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
- "addq $4 , %%rdi \n\t" // increment pointer of c
-
- "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch
- "prefetcht0 (%%r8)\n\t" // Prefetch
-
- "vfmaddps %%ymm12, 0*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp
- "vfmaddps %%ymm13, 8*4(%%rsi), %%ymm0, %%ymm13\n\t" // multiply a and c and add to temp
-
- "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
-
- "dec %%rax \n\t" // n = n -1
- "jnz .L01LOOP%= \n\t"
-
- "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha
- "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha
-
- "vmovups %%ymm12, (%%rdx) \n\t" // store temp -> y
- "vmovups %%ymm13, 8*4(%%rdx) \n\t" // store temp -> y
-
- :
- :
- "m" (n), // 0
- "m" (alpha), // 1
- "m" (a), // 2
- "m" (lda), // 3
- "m" (x), // 4
- "m" (y), // 5
- "m" (pre) // 6
- : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
- "%xmm0", "%xmm1",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
- "memory"
- );
-
-}
-
-
-static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y)
-{
-
-
- __asm__ __volatile__
- (
- "movq %0, %%rax\n\t" // n -> rax
- "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
- "movq %2, %%rsi\n\t" // adress of a -> rsi
- "movq %3, %%rcx\n\t" // value of lda > rcx
- "movq %4, %%rdi\n\t" // adress of x -> rdi
- "movq %5, %%rdx\n\t" // adress of y -> rdx
-
- "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero
-
- ".L01LOOP%=: \n\t"
- "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
- "addq $4 , %%rdi \n\t" // increment pointer of c
-
- "vfmaddps %%ymm12, 0*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp
-
- "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
-
- "dec %%rax \n\t" // n = n -1
- "jnz .L01LOOP%= \n\t"
-
- "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha
-
- "vmovups %%ymm12, (%%rdx) \n\t" // store temp -> y
-
- :
- :
- "m" (n), // 0
- "m" (alpha), // 1
- "m" (a), // 2
- "m" (lda), // 3
- "m" (x), // 4
- "m" (y) // 5
- : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
- "%xmm0", "%xmm1",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
- "memory"
- );
-
-}
-
-
-static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y)
-{
-
-
- __asm__ __volatile__
- (
- "movq %0, %%rax\n\t" // n -> rax
- "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1
- "movq %2, %%rsi\n\t" // adress of a -> rsi
- "movq %3, %%rcx\n\t" // value of lda > rcx
- "movq %4, %%rdi\n\t" // adress of x -> rdi
- "movq %5, %%rdx\n\t" // adress of y -> rdx
-
- "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
-
- ".L01LOOP%=: \n\t"
- "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c
- "addq $4 , %%rdi \n\t" // increment pointer of c
-
- "vfmaddps %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp
-
- "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
-
- "dec %%rax \n\t" // n = n -1
- "jnz .L01LOOP%= \n\t"
-
- "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
-
- "vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y
-
- :
- :
- "m" (n), // 0
- "m" (alpha), // 1
- "m" (a), // 2
- "m" (lda), // 3
- "m" (x), // 4
- "m" (y) // 5
- : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
- "%xmm0", "%xmm1",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
- "memory"
- );
-
-}
-
-static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y)
-{
-
-
- __asm__ __volatile__
- (
- "movq %0, %%rax\n\t" // n -> rax
- "vmovss %1, %%xmm1\n\t" // alpha -> xmm1
- "movq %2, %%rsi\n\t" // adress of a -> rsi
- "movq %3, %%rcx\n\t" // value of lda > rcx
- "movq %4, %%rdi\n\t" // adress of x -> rdi
- "movq %5, %%rdx\n\t" // adress of y -> rdx
-
- "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
- "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero
-
- ".L01LOOP%=: \n\t"
- "vmovss (%%rdi), %%xmm0 \n\t" // load values of c
- "addq $4 , %%rdi \n\t" // increment pointer of c
-
- "vfmaddss %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp
- "vfmaddss %%xmm13, 1*4(%%rsi), %%xmm0, %%xmm13\n\t" // multiply a and c and add to temp
-
- "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
-
- "dec %%rax \n\t" // n = n -1
- "jnz .L01LOOP%= \n\t"
-
- "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
- "vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha
-
- "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y
- "vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y
-
- :
- :
- "m" (n), // 0
- "m" (alpha), // 1
- "m" (a), // 2
- "m" (lda), // 3
- "m" (x), // 4
- "m" (y) // 5
- : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
- "%xmm0", "%xmm1",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
- "memory"
- );
-
-}
-
-
-
-static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y)
-{
-
-
- __asm__ __volatile__
- (
- "movq %0, %%rax\n\t" // n -> rax
- "vmovss %1, %%xmm1\n\t" // alpha -> xmm1
- "movq %2, %%rsi\n\t" // adress of a -> rsi
- "movq %3, %%rcx\n\t" // value of lda > rcx
- "movq %4, %%rdi\n\t" // adress of x -> rdi
- "movq %5, %%rdx\n\t" // adress of y -> rdx
-
- "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
-
- ".L01LOOP%=: \n\t"
- "vmovss (%%rdi), %%xmm0 \n\t" // load values of c
- "addq $4 , %%rdi \n\t" // increment pointer of c
-
- "vfmaddss %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp
-
- "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
-
- "dec %%rax \n\t" // n = n -1
- "jnz .L01LOOP%= \n\t"
-
- "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
-
- "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y
-
- :
- :
- "m" (n), // 0
- "m" (alpha), // 1
- "m" (a), // 2
- "m" (lda), // 3
- "m" (x), // 4
- "m" (y) // 5
- : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
- "%xmm0", "%xmm1",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
- "memory"
- );
-
-}
-
-
diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c
new file mode 100644
index 000000000..8f56655a9
--- /dev/null
+++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c
@@ -0,0 +1,299 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+
+#define HAVE_KERNEL_4x8 1
+static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
+
+static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+ "vbroadcastss (%2), %%ymm12 \n\t" // x0
+ "vbroadcastss 4(%2), %%ymm13 \n\t" // x1
+ "vbroadcastss 8(%2), %%ymm14 \n\t" // x2
+ "vbroadcastss 12(%2), %%ymm15 \n\t" // x3
+ "vbroadcastss 16(%2), %%ymm0 \n\t" // x4
+ "vbroadcastss 20(%2), %%ymm1 \n\t" // x5
+ "vbroadcastss 24(%2), %%ymm2 \n\t" // x6
+ "vbroadcastss 28(%2), %%ymm3 \n\t" // x7
+
+ "vbroadcastss (%9), %%ymm6 \n\t" // alpha
+
+ "testq $0x04, %1 \n\t"
+ "jz .L08LABEL%= \n\t"
+
+ "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
+ "vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t"
+ "vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t"
+
+ "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t"
+ "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t"
+ "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t"
+ "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t"
+
+ "vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
+ "vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
+ "vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
+ "vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t"
+
+ "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t"
+ "vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t"
+ "vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t"
+
+ "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
+
+ "addq $4 , %8 \n\t"
+ "addq $4 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+
+ ".L08LABEL%=: \n\t"
+
+ "testq $0x08, %1 \n\t"
+ "jz .L16LABEL%= \n\t"
+
+ "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
+ "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
+ "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
+
+ "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
+ "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t"
+ "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
+ "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t"
+
+ "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
+ "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t"
+ "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
+ "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t"
+
+ "vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t"
+ "vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t"
+ "vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t"
+
+
+ "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
+
+ "addq $8 , %8 \n\t"
+ "addq $8 , %0 \n\t"
+ "subq $8 , %1 \n\t"
+
+ ".L16LABEL%=: \n\t"
+
+ "cmpq $0, %1 \n\t"
+ "je .L16END%= \n\t"
+
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+
+ "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
+ "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
+ "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y
+ "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y
+
+ "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
+ "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t"
+ "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t"
+ "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t"
+ "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
+ "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t"
+ "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t"
+ "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t"
+
+ "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
+ "addq $16, %0 \n\t"
+ "vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t"
+ "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t"
+ "vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t"
+ "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
+ "vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t"
+ "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t"
+ "vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t"
+
+ "vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t"
+ "vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t"
+
+ "addq $16, %8 \n\t"
+ "vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y
+ "subq $16, %1 \n\t"
+ "vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y
+
+ "jnz .L01LOOP%= \n\t"
+
+ ".L16END%=: \n\t"
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (ap[2]), // 6
+ "r" (ap[3]), // 7
+ "r" (lda4), // 8
+ "r" (alpha) // 9
+ : "cc",
+ "%xmm0", "%xmm1",
+ "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5",
+ "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
+
+#define HAVE_KERNEL_4x4 1
+static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
+
+static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+ "vbroadcastss (%2), %%ymm12 \n\t" // x0
+ "vbroadcastss 4(%2), %%ymm13 \n\t" // x1
+ "vbroadcastss 8(%2), %%ymm14 \n\t" // x2
+ "vbroadcastss 12(%2), %%ymm15 \n\t" // x3
+
+ "vbroadcastss (%8), %%ymm6 \n\t" // alpha
+
+ "testq $0x04, %1 \n\t"
+ "jz .L08LABEL%= \n\t"
+
+ "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
+ "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
+ "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
+
+ "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t"
+ "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t"
+ "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t"
+ "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t"
+
+ "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t"
+ "vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t"
+ "vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t"
+
+ "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
+
+ "addq $4 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+
+ ".L08LABEL%=: \n\t"
+
+ "testq $0x08, %1 \n\t"
+ "jz .L16LABEL%= \n\t"
+
+ "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
+ "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
+ "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
+
+ "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
+ "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t"
+ "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
+ "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t"
+
+ "vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t"
+ "vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t"
+ "vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t"
+
+ "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
+
+ "addq $8 , %0 \n\t"
+ "subq $8 , %1 \n\t"
+
+ ".L16LABEL%=: \n\t"
+
+ "cmpq $0, %1 \n\t"
+ "je .L16END%= \n\t"
+
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
+ "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
+ "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y
+ "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y
+
+ "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
+ "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t"
+ "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t"
+ "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t"
+ "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
+ "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t"
+ "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t"
+ "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t"
+
+ "vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t"
+ "vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t"
+
+ "vmovups %%ymm8, (%3,%0,4) \n\t" // 8 * y
+ "vmovups %%ymm9, 32(%3,%0,4) \n\t" // 8 * y
+
+ "addq $16, %0 \n\t"
+ "subq $16, %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ ".L16END%=: \n\t"
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (ap[2]), // 6
+ "r" (ap[3]), // 7
+ "r" (alpha) // 8
+ : "cc",
+ "%xmm4", "%xmm5",
+ "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
diff --git a/kernel/x86_64/sgemv_n_microk_haswell.c b/kernel/x86_64/sgemv_n_microk_haswell.c
deleted file mode 100644
index 9db3869d2..000000000
--- a/kernel/x86_64/sgemv_n_microk_haswell.c
+++ /dev/null
@@ -1,461 +0,0 @@
-/***************************************************************************
-Copyright (c) 2014, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y)
-{
-
-
- float *pre = a + lda*2;
-
- __asm__ __volatile__
- (
- "movq %0, %%rax\n\t" // n -> rax
- "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
- "movq %2, %%rsi\n\t" // adress of a -> rsi
- "movq %3, %%rcx\n\t" // value of lda > rcx
- "movq %4, %%rdi\n\t" // adress of x -> rdi
- "movq %5, %%rdx\n\t" // adress of y -> rdx
- "movq %6, %%r8\n\t" // address for prefetch
- "prefetcht0 (%%r8)\n\t" // Prefetch
- "prefetcht0 64(%%r8)\n\t" // Prefetch
-
- "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
- "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
- "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero
- "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero
- "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero
- "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero
- "vxorps %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero
- "vxorps %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero
- ".align 16 \n\t"
- ".L01LOOP%=: \n\t"
- "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
- "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch
-
- "prefetcht0 (%%r8)\n\t" // Prefetch
- "vfmadd231ps 0*4(%%rsi), %%ymm0, %%ymm8 \n\t" // multiply a and c and add to temp
- "vfmadd231ps 8*4(%%rsi), %%ymm0, %%ymm9 \n\t" // multiply a and c and add to temp
- "prefetcht0 64(%%r8)\n\t" // Prefetch
- "vfmadd231ps 16*4(%%rsi), %%ymm0, %%ymm10\n\t" // multiply a and c and add to temp
- "vfmadd231ps 24*4(%%rsi), %%ymm0, %%ymm11\n\t" // multiply a and c and add to temp
- "prefetcht0 128(%%r8)\n\t" // Prefetch
- "vfmadd231ps 32*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp
- "vfmadd231ps 40*4(%%rsi), %%ymm0, %%ymm13\n\t" // multiply a and c and add to temp
- "prefetcht0 192(%%r8)\n\t" // Prefetch
- "vfmadd231ps 48*4(%%rsi), %%ymm0, %%ymm14\n\t" // multiply a and c and add to temp
- "vfmadd231ps 56*4(%%rsi), %%ymm0, %%ymm15\n\t" // multiply a and c and add to temp
-
- "addq $4 , %%rdi \n\t" // increment pointer of c
- "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
-
- "dec %%rax \n\t" // n = n -1
- "jnz .L01LOOP%= \n\t"
-
- "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
- "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha
- "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha
- "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha
- "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha
- "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha
- "vmulps %%ymm14, %%ymm1, %%ymm14\n\t" // scale by alpha
- "vmulps %%ymm15, %%ymm1, %%ymm15\n\t" // scale by alpha
-
- "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
- "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y
- "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y
- "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y
- "vmovups %%ymm12, 32*4(%%rdx) \n\t" // store temp -> y
- "vmovups %%ymm13, 40*4(%%rdx) \n\t" // store temp -> y
- "vmovups %%ymm14, 48*4(%%rdx) \n\t" // store temp -> y
- "vmovups %%ymm15, 56*4(%%rdx) \n\t" // store temp -> y
-
- :
- :
- "m" (n), // 0
- "m" (alpha), // 1
- "m" (a), // 2
- "m" (lda), // 3
- "m" (x), // 4
- "m" (y), // 5
- "m" (pre) // 6
- : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
- "%xmm0", "%xmm1",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
- "memory"
- );
-
-}
-
-
-
-static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y)
-{
-
-
- float *pre = a + lda*3;
-
- __asm__ __volatile__
- (
- "movq %0, %%rax\n\t" // n -> rax
- "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
- "movq %2, %%rsi\n\t" // adress of a -> rsi
- "movq %3, %%rcx\n\t" // value of lda > rcx
- "movq %4, %%rdi\n\t" // adress of x -> rdi
- "movq %5, %%rdx\n\t" // adress of y -> rdx
- "movq %6, %%r8\n\t" // address for prefetch
- "prefetcht0 (%%r8)\n\t" // Prefetch
- "prefetcht0 64(%%r8)\n\t" // Prefetch
-
- "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
- "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
- "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero
- "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero
- ".align 16 \n\t"
- ".L01LOOP%=: \n\t"
- "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
- "nop \n\t"
- "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch
-
- "prefetcht0 (%%r8)\n\t" // Prefetch
- "prefetcht0 64(%%r8)\n\t" // Prefetch
-
- "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp
- "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp
- "vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp
- "vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp
-
- "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp
- "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp
- "vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp
- "vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp
-
-
-
- "addq $4 , %%rdi \n\t" // increment pointer of c
- "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
-
- "dec %%rax \n\t" // n = n -1
- "jnz .L01LOOP%= \n\t"
-
- "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
- "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha
- "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha
- "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha
-
- "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
- "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y
- "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y
- "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y
-
- :
- :
- "m" (n), // 0
- "m" (alpha), // 1
- "m" (a), // 2
- "m" (lda), // 3
- "m" (x), // 4
- "m" (y), // 5
- "m" (pre) // 6
- : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
- "%xmm0", "%xmm1",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "memory"
- );
-
-
-
-}
-
-static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y)
-{
-
- float *pre = a + lda*3;
-
- __asm__ __volatile__
- (
- "movq %0, %%rax\n\t" // n -> rax
- "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
- "movq %2, %%rsi\n\t" // adress of a -> rsi
- "movq %3, %%rcx\n\t" // value of lda > rcx
- "movq %4, %%rdi\n\t" // adress of x -> rdi
- "movq %5, %%rdx\n\t" // adress of y -> rdx
- "movq %6, %%r8\n\t" // address for prefetch
- "prefetcht0 (%%r8)\n\t" // Prefetch
- "prefetcht0 64(%%r8)\n\t" // Prefetch
-
- "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
- "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
- ".align 16 \n\t"
- ".L01LOOP%=: \n\t"
- "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
- "nop \n\t"
- "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch
-
- "prefetcht0 (%%r8)\n\t" // Prefetch
-
- "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp
- "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp
-
- "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp
- "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp
-
- "addq $4 , %%rdi \n\t" // increment pointer of c
- "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
-
- "dec %%rax \n\t" // n = n -1
- "jnz .L01LOOP%= \n\t"
-
- "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
- "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha
-
- "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
- "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y
-
- :
- :
- "m" (n), // 0
- "m" (alpha), // 1
- "m" (a), // 2
- "m" (lda), // 3
- "m" (x), // 4
- "m" (y), // 5
- "m" (pre) // 6
- : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
- "%xmm0", "%xmm1",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "memory"
- );
-
-
-}
-
-
-static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y)
-{
-
- __asm__ __volatile__
- (
- "movq %0, %%rax\n\t" // n -> rax
- "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
- "movq %2, %%rsi\n\t" // adress of a -> rsi
- "movq %3, %%rcx\n\t" // value of lda > rcx
- "movq %4, %%rdi\n\t" // adress of x -> rdi
- "movq %5, %%rdx\n\t" // adress of y -> rdx
-
- "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
- ".align 16 \n\t"
- ".L01LOOP%=: \n\t"
- "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
-
- "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp
- "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp
-
- "addq $4 , %%rdi \n\t" // increment pointer of c
- "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
-
- "dec %%rax \n\t" // n = n -1
- "jnz .L01LOOP%= \n\t"
-
- "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
- "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
-
- :
- :
- "m" (n), // 0
- "m" (alpha), // 1
- "m" (a), // 2
- "m" (lda), // 3
- "m" (x), // 4
- "m" (y) // 5
- : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
- "%xmm0", "%xmm1",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "memory"
- );
-
-
-}
-
-
-static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y)
-{
-
-
- __asm__ __volatile__
- (
- "movq %0, %%rax\n\t" // n -> rax
- "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1
- "movq %2, %%rsi\n\t" // adress of a -> rsi
- "movq %3, %%rcx\n\t" // value of lda > rcx
- "movq %4, %%rdi\n\t" // adress of x -> rdi
- "movq %5, %%rdx\n\t" // adress of y -> rdx
-
- "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
-
- ".L01LOOP%=: \n\t"
- "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c
-
- "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp
- "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp
-
- "addq $4 , %%rdi \n\t" // increment pointer of c
- "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
-
- "dec %%rax \n\t" // n = n -1
- "jnz .L01LOOP%= \n\t"
-
- "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
-
- "vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y
-
- :
- :
- "m" (n), // 0
- "m" (alpha), // 1
- "m" (a), // 2
- "m" (lda), // 3
- "m" (x), // 4
- "m" (y) // 5
- : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
- "%xmm0", "%xmm1",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
- "memory"
- );
-
-}
-
-static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y)
-{
-
-
- __asm__ __volatile__
- (
- "movq %0, %%rax\n\t" // n -> rax
- "vmovss %1, %%xmm1\n\t" // alpha -> xmm1
- "movq %2, %%rsi\n\t" // adress of a -> rsi
- "movq %3, %%rcx\n\t" // value of lda > rcx
- "movq %4, %%rdi\n\t" // adress of x -> rdi
- "movq %5, %%rdx\n\t" // adress of y -> rdx
-
- "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
- "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero
-
- ".L01LOOP%=: \n\t"
- "vmovss (%%rdi), %%xmm0 \n\t" // load values of c
-
- "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp
- "vmulps 1*4(%%rsi), %%xmm0, %%xmm5 \n\t" // multiply a and c and add to temp
-
- "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp
- "vaddps %%xmm13, %%xmm5, %%xmm13 \n\t" // multiply a and c and add to temp
-
- "addq $4 , %%rdi \n\t" // increment pointer of c
- "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
-
- "dec %%rax \n\t" // n = n -1
- "jnz .L01LOOP%= \n\t"
-
- "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
- "vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha
-
- "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y
- "vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y
-
- :
- :
- "m" (n), // 0
- "m" (alpha), // 1
- "m" (a), // 2
- "m" (lda), // 3
- "m" (x), // 4
- "m" (y) // 5
- : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
- "%xmm0", "%xmm1",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
- "memory"
- );
-
-}
-
-
-
-static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y)
-{
-
-
- __asm__ __volatile__
- (
- "movq %0, %%rax\n\t" // n -> rax
- "vmovss %1, %%xmm1\n\t" // alpha -> xmm1
- "movq %2, %%rsi\n\t" // adress of a -> rsi
- "movq %3, %%rcx\n\t" // value of lda > rcx
- "movq %4, %%rdi\n\t" // adress of x -> rdi
- "movq %5, %%rdx\n\t" // adress of y -> rdx
-
- "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
-
- ".L01LOOP%=: \n\t"
- "vmovss (%%rdi), %%xmm0 \n\t" // load values of c
- "addq $4 , %%rdi \n\t" // increment pointer of c
-
- "vmulss 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp
- "vaddss %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp
-
- "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
-
- "dec %%rax \n\t" // n = n -1
- "jnz .L01LOOP%= \n\t"
-
- "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
-
- "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y
-
- :
- :
- "m" (n), // 0
- "m" (alpha), // 1
- "m" (a), // 2
- "m" (lda), // 3
- "m" (x), // 4
- "m" (y) // 5
- : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
- "%xmm0", "%xmm1",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
- "memory"
- );
-
-}
-
-
diff --git a/kernel/x86_64/sgemv_n_microk_nehalem-2.c b/kernel/x86_64/sgemv_n_microk_nehalem-2.c
deleted file mode 100644
index 40ccbb78f..000000000
--- a/kernel/x86_64/sgemv_n_microk_nehalem-2.c
+++ /dev/null
@@ -1,144 +0,0 @@
-/***************************************************************************
-Copyright (c) 2014, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-#define HAVE_KERNEL_16x4 1
-static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-
-static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
-{
-
- BLASLONG register i = 0;
-
- __asm__ __volatile__
- (
- "movss (%2), %%xmm12 \n\t" // x0
- "movss 4(%2), %%xmm13 \n\t" // x1
- "movss 8(%2), %%xmm14 \n\t" // x2
- "movss 12(%2), %%xmm15 \n\t" // x3
- "shufps $0, %%xmm12, %%xmm12\n\t"
- "shufps $0, %%xmm13, %%xmm13\n\t"
- "shufps $0, %%xmm14, %%xmm14\n\t"
- "shufps $0, %%xmm15, %%xmm15\n\t"
-
- ".align 16 \n\t"
- ".L01LOOP%=: \n\t"
- "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
- "movups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y
- "movups 32(%3,%0,4), %%xmm6 \n\t" // 4 * y
- "movups 48(%3,%0,4), %%xmm7 \n\t" // 4 * y
-
- "prefetcht0 192(%4,%0,4) \n\t"
-
- "movups (%4,%0,4), %%xmm8 \n\t"
- "movups 16(%4,%0,4), %%xmm9 \n\t"
- "movups 32(%4,%0,4), %%xmm10 \n\t"
- "movups 48(%4,%0,4), %%xmm11 \n\t"
- "mulps %%xmm12, %%xmm8 \n\t"
- "addps %%xmm8 , %%xmm4 \n\t"
- "mulps %%xmm12, %%xmm9 \n\t"
- "addps %%xmm9 , %%xmm5 \n\t"
- "mulps %%xmm12, %%xmm10 \n\t"
- "addps %%xmm10, %%xmm6 \n\t"
- "mulps %%xmm12, %%xmm11 \n\t"
- "addps %%xmm11, %%xmm7 \n\t"
-
- "prefetcht0 192(%5,%0,4) \n\t"
-
- "movups (%5,%0,4), %%xmm8 \n\t"
- "movups 16(%5,%0,4), %%xmm9 \n\t"
- "movups 32(%5,%0,4), %%xmm10 \n\t"
- "movups 48(%5,%0,4), %%xmm11 \n\t"
- "mulps %%xmm13, %%xmm8 \n\t"
- "addps %%xmm8 , %%xmm4 \n\t"
- "mulps %%xmm13, %%xmm9 \n\t"
- "addps %%xmm9 , %%xmm5 \n\t"
- "mulps %%xmm13, %%xmm10 \n\t"
- "addps %%xmm10, %%xmm6 \n\t"
- "mulps %%xmm13, %%xmm11 \n\t"
- "addps %%xmm11, %%xmm7 \n\t"
-
- "prefetcht0 192(%6,%0,4) \n\t"
-
- "movups (%6,%0,4), %%xmm8 \n\t"
- "movups 16(%6,%0,4), %%xmm9 \n\t"
- "movups 32(%6,%0,4), %%xmm10 \n\t"
- "movups 48(%6,%0,4), %%xmm11 \n\t"
- "mulps %%xmm14, %%xmm8 \n\t"
- "addps %%xmm8 , %%xmm4 \n\t"
- "mulps %%xmm14, %%xmm9 \n\t"
- "addps %%xmm9 , %%xmm5 \n\t"
- "mulps %%xmm14, %%xmm10 \n\t"
- "addps %%xmm10, %%xmm6 \n\t"
- "mulps %%xmm14, %%xmm11 \n\t"
- "addps %%xmm11, %%xmm7 \n\t"
-
- "prefetcht0 192(%7,%0,4) \n\t"
-
- "movups (%7,%0,4), %%xmm8 \n\t"
- "movups 16(%7,%0,4), %%xmm9 \n\t"
- "movups 32(%7,%0,4), %%xmm10 \n\t"
- "movups 48(%7,%0,4), %%xmm11 \n\t"
- "mulps %%xmm15, %%xmm8 \n\t"
- "addps %%xmm8 , %%xmm4 \n\t"
- "mulps %%xmm15, %%xmm9 \n\t"
- "addps %%xmm9 , %%xmm5 \n\t"
- "mulps %%xmm15, %%xmm10 \n\t"
- "addps %%xmm10, %%xmm6 \n\t"
- "mulps %%xmm15, %%xmm11 \n\t"
- "addps %%xmm11, %%xmm7 \n\t"
-
-
- "movups %%xmm4, (%3,%0,4) \n\t" // 4 * y
- "movups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y
- "movups %%xmm6, 32(%3,%0,4) \n\t" // 4 * y
- "movups %%xmm7, 48(%3,%0,4) \n\t" // 4 * y
-
- "addq $16, %0 \n\t"
- "subq $16, %1 \n\t"
- "jnz .L01LOOP%= \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (n), // 1
- "r" (x), // 2
- "r" (y), // 3
- "r" (ap[0]), // 4
- "r" (ap[1]), // 5
- "r" (ap[2]), // 6
- "r" (ap[3]) // 7
- : "cc",
- "%xmm4", "%xmm5",
- "%xmm6", "%xmm7",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
- "memory"
- );
-
-}
-
-
diff --git a/kernel/x86_64/sgemv_n_microk_nehalem-4.c b/kernel/x86_64/sgemv_n_microk_nehalem-4.c
new file mode 100644
index 000000000..77a1b11aa
--- /dev/null
+++ b/kernel/x86_64/sgemv_n_microk_nehalem-4.c
@@ -0,0 +1,204 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+
+#define HAVE_KERNEL_4x8 1
+static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
+
+static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "movss (%2), %%xmm12 \n\t" // x0
+ "movss 4(%2), %%xmm13 \n\t" // x1
+ "movss 8(%2), %%xmm14 \n\t" // x2
+ "movss 12(%2), %%xmm15 \n\t" // x3
+ "shufps $0, %%xmm12, %%xmm12\n\t"
+ "shufps $0, %%xmm13, %%xmm13\n\t"
+ "shufps $0, %%xmm14, %%xmm14\n\t"
+ "shufps $0, %%xmm15, %%xmm15\n\t"
+
+ "movss 16(%2), %%xmm0 \n\t" // x4
+ "movss 20(%2), %%xmm1 \n\t" // x5
+ "movss 24(%2), %%xmm2 \n\t" // x6
+ "movss 28(%2), %%xmm3 \n\t" // x7
+ "shufps $0, %%xmm0 , %%xmm0 \n\t"
+ "shufps $0, %%xmm1 , %%xmm1 \n\t"
+ "shufps $0, %%xmm2 , %%xmm2 \n\t"
+ "shufps $0, %%xmm3 , %%xmm3 \n\t"
+
+ "movss (%9), %%xmm6 \n\t" // alpha
+ "shufps $0, %%xmm6 , %%xmm6 \n\t"
+
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ "xorps %%xmm4 , %%xmm4 \n\t"
+ "xorps %%xmm5 , %%xmm5 \n\t"
+ "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y
+
+ ".align 2 \n\t"
+ "movups (%4,%0,4), %%xmm8 \n\t"
+ "movups (%5,%0,4), %%xmm9 \n\t"
+ "movups (%6,%0,4), %%xmm10 \n\t"
+ "movups (%7,%0,4), %%xmm11 \n\t"
+ ".align 2 \n\t"
+ "mulps %%xmm12, %%xmm8 \n\t"
+ "mulps %%xmm13, %%xmm9 \n\t"
+ "mulps %%xmm14, %%xmm10 \n\t"
+ "mulps %%xmm15, %%xmm11 \n\t"
+ "addps %%xmm8 , %%xmm4 \n\t"
+ "addps %%xmm9 , %%xmm5 \n\t"
+ "addps %%xmm10, %%xmm4 \n\t"
+ "addps %%xmm11, %%xmm5 \n\t"
+
+ "movups (%4,%8,4), %%xmm8 \n\t"
+ "movups (%5,%8,4), %%xmm9 \n\t"
+ "movups (%6,%8,4), %%xmm10 \n\t"
+ "movups (%7,%8,4), %%xmm11 \n\t"
+ ".align 2 \n\t"
+ "mulps %%xmm0 , %%xmm8 \n\t"
+ "mulps %%xmm1 , %%xmm9 \n\t"
+ "mulps %%xmm2 , %%xmm10 \n\t"
+ "mulps %%xmm3 , %%xmm11 \n\t"
+ "addps %%xmm8 , %%xmm4 \n\t"
+ "addps %%xmm9 , %%xmm5 \n\t"
+ "addps %%xmm10, %%xmm4 \n\t"
+ "addps %%xmm11, %%xmm5 \n\t"
+
+ "addq $4 , %8 \n\t"
+ "addps %%xmm5 , %%xmm4 \n\t"
+ "addq $4 , %0 \n\t"
+ "mulps %%xmm6 , %%xmm4 \n\t"
+ "subq $4 , %1 \n\t"
+ "addps %%xmm4 , %%xmm7 \n\t"
+
+ "movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y
+
+ "jnz .L01LOOP%= \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (ap[2]), // 6
+ "r" (ap[3]), // 7
+ "r" (lda4), // 8
+ "r" (alpha) // 9
+ : "cc",
+ "%xmm0", "%xmm1",
+ "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5",
+ "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
+
+
+#define HAVE_KERNEL_4x4 1
+static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
+
+static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "movss (%2), %%xmm12 \n\t" // x0
+ "movss 4(%2), %%xmm13 \n\t" // x1
+ "movss 8(%2), %%xmm14 \n\t" // x2
+ "movss 12(%2), %%xmm15 \n\t" // x3
+ "shufps $0, %%xmm12, %%xmm12\n\t"
+ "shufps $0, %%xmm13, %%xmm13\n\t"
+ "shufps $0, %%xmm14, %%xmm14\n\t"
+ "shufps $0, %%xmm15, %%xmm15\n\t"
+
+ "movss (%8), %%xmm6 \n\t" // alpha
+ "shufps $0, %%xmm6 , %%xmm6 \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ "xorps %%xmm4 , %%xmm4 \n\t"
+ "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y
+
+ "movups (%4,%0,4), %%xmm8 \n\t"
+ "movups (%5,%0,4), %%xmm9 \n\t"
+ "movups (%6,%0,4), %%xmm10 \n\t"
+ "movups (%7,%0,4), %%xmm11 \n\t"
+ "mulps %%xmm12, %%xmm8 \n\t"
+ "mulps %%xmm13, %%xmm9 \n\t"
+ "mulps %%xmm14, %%xmm10 \n\t"
+ "mulps %%xmm15, %%xmm11 \n\t"
+ "addps %%xmm8 , %%xmm4 \n\t"
+ "addq $4 , %0 \n\t"
+ "addps %%xmm9 , %%xmm4 \n\t"
+ "subq $4 , %1 \n\t"
+ "addps %%xmm10 , %%xmm4 \n\t"
+ "addps %%xmm4 , %%xmm11 \n\t"
+
+ "mulps %%xmm6 , %%xmm11 \n\t"
+ "addps %%xmm7 , %%xmm11 \n\t"
+ "movups %%xmm11, -16(%3,%0,4) \n\t" // 4 * y
+
+ "jnz .L01LOOP%= \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (ap[2]), // 6
+ "r" (ap[3]), // 7
+ "r" (alpha) // 8
+ : "cc",
+ "%xmm4", "%xmm5",
+ "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
diff --git a/kernel/x86_64/sgemv_n_microk_sandy-4.c b/kernel/x86_64/sgemv_n_microk_sandy-4.c
new file mode 100644
index 000000000..c162eeeb6
--- /dev/null
+++ b/kernel/x86_64/sgemv_n_microk_sandy-4.c
@@ -0,0 +1,370 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+
+
+#define HAVE_KERNEL_4x8 1
+static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
+
+static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+ "vbroadcastss (%2), %%ymm12 \n\t" // x0
+ "vbroadcastss 4(%2), %%ymm13 \n\t" // x1
+ "vbroadcastss 8(%2), %%ymm14 \n\t" // x2
+ "vbroadcastss 12(%2), %%ymm15 \n\t" // x3
+ "vbroadcastss 16(%2), %%ymm0 \n\t" // x4
+ "vbroadcastss 20(%2), %%ymm1 \n\t" // x5
+ "vbroadcastss 24(%2), %%ymm2 \n\t" // x6
+ "vbroadcastss 28(%2), %%ymm3 \n\t" // x7
+
+ "vbroadcastss (%9), %%ymm6 \n\t" // alpha
+
+ "testq $0x04, %1 \n\t"
+ "jz .L08LABEL%= \n\t"
+
+ "vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t"
+ "vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t"
+ "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
+
+ "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t"
+ "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t"
+ "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t"
+ "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t"
+ "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
+ "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
+ "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
+ "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t"
+
+ "vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t"
+ "vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t"
+ "vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t"
+ "vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t"
+ "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
+ "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
+ "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
+ "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t"
+
+ "vaddps %%xmm5, %%xmm4 , %%xmm4 \n\t"
+ "vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t"
+ "vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t"
+
+ "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
+
+ "addq $4, %8 \n\t"
+ "addq $4, %0 \n\t"
+ "subq $4, %1 \n\t"
+
+ ".L08LABEL%=: \n\t"
+
+ "testq $0x08, %1 \n\t"
+ "jz .L16LABEL%= \n\t"
+
+ "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
+ "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
+ "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
+
+ "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
+ "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
+ "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t"
+ "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t"
+ "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
+ "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
+ "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
+ "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
+
+ "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t"
+ "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t"
+ "vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t"
+ "vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t"
+ "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
+ "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
+ "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
+ "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
+
+ "vaddps %%ymm5, %%ymm4 , %%ymm4 \n\t"
+ "vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t"
+ "vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t"
+
+ "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
+
+ "addq $8, %8 \n\t"
+ "addq $8, %0 \n\t"
+ "subq $8, %1 \n\t"
+
+
+ ".L16LABEL%=: \n\t"
+
+ "cmpq $0, %1 \n\t"
+ "je .L16END%= \n\t"
+
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
+ "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
+
+ "prefetcht0 192(%4,%0,4) \n\t"
+ "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
+ "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t"
+ "prefetcht0 192(%5,%0,4) \n\t"
+ "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
+ "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t"
+ "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
+ "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
+ "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
+ "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
+
+ "prefetcht0 192(%6,%0,4) \n\t"
+ "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t"
+ "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t"
+ "prefetcht0 192(%7,%0,4) \n\t"
+ "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t"
+ "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t"
+ "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
+ "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
+ "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
+ "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
+
+ "prefetcht0 192(%4,%8,4) \n\t"
+ "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t"
+ "vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t"
+ "prefetcht0 192(%5,%8,4) \n\t"
+ "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t"
+ "vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t"
+ "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
+ "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
+ "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
+ "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
+
+ "prefetcht0 192(%6,%8,4) \n\t"
+ "vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t"
+ "vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t"
+ "prefetcht0 192(%7,%8,4) \n\t"
+ "vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t"
+ "vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t"
+ "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
+ "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
+ "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
+ "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
+
+ "vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t"
+ "vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t"
+
+ "vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y
+ "vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y
+
+ "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y
+ "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y
+
+ "addq $16, %8 \n\t"
+ "addq $16, %0 \n\t"
+ "subq $16, %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ ".L16END%=: \n\t"
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (ap[2]), // 6
+ "r" (ap[3]), // 7
+ "r" (lda4), // 8
+ "r" (alpha) // 9
+ : "cc",
+ "%xmm0", "%xmm1",
+ "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5",
+ "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
+
+
+#define HAVE_KERNEL_4x4 1
+static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
+
+static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+ "vbroadcastss (%2), %%ymm12 \n\t" // x0
+ "vbroadcastss 4(%2), %%ymm13 \n\t" // x1
+ "vbroadcastss 8(%2), %%ymm14 \n\t" // x2
+ "vbroadcastss 12(%2), %%ymm15 \n\t" // x3
+
+ "vbroadcastss (%8), %%ymm6 \n\t" // alpha
+
+ "testq $0x04, %1 \n\t"
+ "jz .L08LABEL%= \n\t"
+
+ "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
+ "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
+ "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
+
+ "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t"
+ "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t"
+ "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t"
+ "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t"
+ "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
+ "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
+ "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
+ "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t"
+
+ "vaddps %%xmm5, %%xmm4 , %%xmm4 \n\t"
+ "vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t"
+ "vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t"
+
+ "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
+
+ "addq $4, %0 \n\t"
+ "subq $4, %1 \n\t"
+
+ ".L08LABEL%=: \n\t"
+
+ "testq $0x08, %1 \n\t"
+ "jz .L16LABEL%= \n\t"
+
+ "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
+ "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
+ "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
+
+ "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
+ "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
+ "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t"
+ "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t"
+ "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
+ "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
+ "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
+ "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
+
+ "vaddps %%ymm5, %%ymm4 , %%ymm4 \n\t"
+ "vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t"
+ "vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t"
+
+ "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
+
+ "addq $8, %0 \n\t"
+ "subq $8, %1 \n\t"
+
+
+ ".L16LABEL%=: \n\t"
+
+ "cmpq $0, %1 \n\t"
+ "je .L16END%= \n\t"
+
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
+ "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
+ "vmovups (%3,%0,4), %%ymm0 \n\t" // 8 * y
+ "vmovups 32(%3,%0,4), %%ymm1 \n\t" // 8 * y
+
+ "prefetcht0 192(%4,%0,4) \n\t"
+ "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
+ "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t"
+ "prefetcht0 192(%5,%0,4) \n\t"
+ "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
+ "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t"
+ "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
+ "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
+ "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
+ "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
+
+ "prefetcht0 192(%6,%0,4) \n\t"
+ "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t"
+ "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t"
+ "prefetcht0 192(%7,%0,4) \n\t"
+ "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t"
+ "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t"
+ "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
+ "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
+ "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
+ "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
+
+ "vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t"
+ "vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t"
+
+ "vaddps %%ymm4, %%ymm0 , %%ymm0 \n\t"
+ "vaddps %%ymm5, %%ymm1 , %%ymm1 \n\t"
+
+ "vmovups %%ymm0, (%3,%0,4) \n\t" // 8 * y
+ "vmovups %%ymm1, 32(%3,%0,4) \n\t" // 8 * y
+
+ "addq $16, %0 \n\t"
+ "subq $16, %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ ".L16END%=: \n\t"
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (ap[2]), // 6
+ "r" (ap[3]), // 7
+ "r" (alpha) // 8
+ : "cc",
+ "%xmm0", "%xmm1",
+ "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5",
+ "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
diff --git a/kernel/x86_64/sgemv_n_microk_sandy.c b/kernel/x86_64/sgemv_n_microk_sandy.c
deleted file mode 100644
index 9bdb06600..000000000
--- a/kernel/x86_64/sgemv_n_microk_sandy.c
+++ /dev/null
@@ -1,473 +0,0 @@
-/***************************************************************************
-Copyright (c) 2014, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y)
-{
-
-
- float *pre = a + lda*2;
-
- __asm__ __volatile__
- (
- "movq %0, %%rax\n\t" // n -> rax
- "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
- "movq %2, %%rsi\n\t" // adress of a -> rsi
- "movq %3, %%rcx\n\t" // value of lda > rcx
- "movq %4, %%rdi\n\t" // adress of x -> rdi
- "movq %5, %%rdx\n\t" // adress of y -> rdx
- "movq %6, %%r8\n\t" // address for prefetch
- "prefetcht0 (%%r8)\n\t" // Prefetch
- "prefetcht0 64(%%r8)\n\t" // Prefetch
-
- "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
- "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
- "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero
- "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero
- "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero
- "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero
- "vxorps %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero
- "vxorps %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero
- ".align 16 \n\t"
- ".L01LOOP%=: \n\t"
- "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
- "nop \n\t"
- "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch
-
- "prefetcht0 (%%r8)\n\t" // Prefetch
- "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp
- "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp
- "prefetcht0 64(%%r8)\n\t" // Prefetch
- "vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp
- "vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp
-
- "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp
- "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp
- "prefetcht0 128(%%r8)\n\t" // Prefetch
- "vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp
- "vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp
-
- "prefetcht0 192(%%r8)\n\t" // Prefetch
- "vmulps 32*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp
- "vmulps 40*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp
- "vmulps 48*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp
- "vmulps 56*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp
-
- "vaddps %%ymm12, %%ymm4, %%ymm12\n\t" // multiply a and c and add to temp
- "vaddps %%ymm13, %%ymm5, %%ymm13\n\t" // multiply a and c and add to temp
- "vaddps %%ymm14, %%ymm6, %%ymm14\n\t" // multiply a and c and add to temp
- "vaddps %%ymm15, %%ymm7, %%ymm15\n\t" // multiply a and c and add to temp
-
- "addq $4 , %%rdi \n\t" // increment pointer of c
- "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
-
- "dec %%rax \n\t" // n = n -1
- "jnz .L01LOOP%= \n\t"
-
- "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
- "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha
- "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha
- "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha
- "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha
- "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha
- "vmulps %%ymm14, %%ymm1, %%ymm14\n\t" // scale by alpha
- "vmulps %%ymm15, %%ymm1, %%ymm15\n\t" // scale by alpha
-
- "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
- "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y
- "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y
- "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y
- "vmovups %%ymm12, 32*4(%%rdx) \n\t" // store temp -> y
- "vmovups %%ymm13, 40*4(%%rdx) \n\t" // store temp -> y
- "vmovups %%ymm14, 48*4(%%rdx) \n\t" // store temp -> y
- "vmovups %%ymm15, 56*4(%%rdx) \n\t" // store temp -> y
-
- :
- :
- "m" (n), // 0
- "m" (alpha), // 1
- "m" (a), // 2
- "m" (lda), // 3
- "m" (x), // 4
- "m" (y), // 5
- "m" (pre) // 6
- : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
- "%xmm0", "%xmm1",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
- "memory"
- );
-
-}
-
-
-
-static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y)
-{
-
-
- float *pre = a + lda*3;
-
- __asm__ __volatile__
- (
- "movq %0, %%rax\n\t" // n -> rax
- "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
- "movq %2, %%rsi\n\t" // adress of a -> rsi
- "movq %3, %%rcx\n\t" // value of lda > rcx
- "movq %4, %%rdi\n\t" // adress of x -> rdi
- "movq %5, %%rdx\n\t" // adress of y -> rdx
- "movq %6, %%r8\n\t" // address for prefetch
- "prefetcht0 (%%r8)\n\t" // Prefetch
- "prefetcht0 64(%%r8)\n\t" // Prefetch
-
- "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
- "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
- "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero
- "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero
- ".align 16 \n\t"
- ".L01LOOP%=: \n\t"
- "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
- "nop \n\t"
- "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch
-
- "prefetcht0 (%%r8)\n\t" // Prefetch
- "prefetcht0 64(%%r8)\n\t" // Prefetch
-
- "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp
- "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp
- "vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp
- "vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp
-
- "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp
- "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp
- "vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp
- "vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp
-
-
-
- "addq $4 , %%rdi \n\t" // increment pointer of c
- "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
-
- "dec %%rax \n\t" // n = n -1
- "jnz .L01LOOP%= \n\t"
-
- "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
- "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha
- "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha
- "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha
-
- "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
- "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y
- "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y
- "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y
-
- :
- :
- "m" (n), // 0
- "m" (alpha), // 1
- "m" (a), // 2
- "m" (lda), // 3
- "m" (x), // 4
- "m" (y), // 5
- "m" (pre) // 6
- : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
- "%xmm0", "%xmm1",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "memory"
- );
-
-
-
-}
-
-static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y)
-{
-
- float *pre = a + lda*3;
-
- __asm__ __volatile__
- (
- "movq %0, %%rax\n\t" // n -> rax
- "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
- "movq %2, %%rsi\n\t" // adress of a -> rsi
- "movq %3, %%rcx\n\t" // value of lda > rcx
- "movq %4, %%rdi\n\t" // adress of x -> rdi
- "movq %5, %%rdx\n\t" // adress of y -> rdx
- "movq %6, %%r8\n\t" // address for prefetch
- "prefetcht0 (%%r8)\n\t" // Prefetch
- "prefetcht0 64(%%r8)\n\t" // Prefetch
-
- "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
- "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero
- ".align 16 \n\t"
- ".L01LOOP%=: \n\t"
- "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
- "nop \n\t"
- "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch
-
- "prefetcht0 (%%r8)\n\t" // Prefetch
-
- "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp
- "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp
-
- "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp
- "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp
-
- "addq $4 , %%rdi \n\t" // increment pointer of c
- "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
-
- "dec %%rax \n\t" // n = n -1
- "jnz .L01LOOP%= \n\t"
-
- "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
- "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha
-
- "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
- "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y
-
- :
- :
- "m" (n), // 0
- "m" (alpha), // 1
- "m" (a), // 2
- "m" (lda), // 3
- "m" (x), // 4
- "m" (y), // 5
- "m" (pre) // 6
- : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
- "%xmm0", "%xmm1",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "memory"
- );
-
-
-}
-
-
-static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y)
-{
-
- __asm__ __volatile__
- (
- "movq %0, %%rax\n\t" // n -> rax
- "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1
- "movq %2, %%rsi\n\t" // adress of a -> rsi
- "movq %3, %%rcx\n\t" // value of lda > rcx
- "movq %4, %%rdi\n\t" // adress of x -> rdi
- "movq %5, %%rdx\n\t" // adress of y -> rdx
-
- "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero
- ".align 16 \n\t"
- ".L01LOOP%=: \n\t"
- "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c
-
- "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp
- "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp
-
- "addq $4 , %%rdi \n\t" // increment pointer of c
- "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
-
- "dec %%rax \n\t" // n = n -1
- "jnz .L01LOOP%= \n\t"
-
- "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha
- "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y
-
- :
- :
- "m" (n), // 0
- "m" (alpha), // 1
- "m" (a), // 2
- "m" (lda), // 3
- "m" (x), // 4
- "m" (y) // 5
- : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
- "%xmm0", "%xmm1",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "memory"
- );
-
-
-}
-
-
-static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y)
-{
-
-
- __asm__ __volatile__
- (
- "movq %0, %%rax\n\t" // n -> rax
- "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1
- "movq %2, %%rsi\n\t" // adress of a -> rsi
- "movq %3, %%rcx\n\t" // value of lda > rcx
- "movq %4, %%rdi\n\t" // adress of x -> rdi
- "movq %5, %%rdx\n\t" // adress of y -> rdx
-
- "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
-
- ".L01LOOP%=: \n\t"
- "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c
-
- "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp
- "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp
-
- "addq $4 , %%rdi \n\t" // increment pointer of c
- "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
-
- "dec %%rax \n\t" // n = n -1
- "jnz .L01LOOP%= \n\t"
-
- "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
-
- "vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y
-
- :
- :
- "m" (n), // 0
- "m" (alpha), // 1
- "m" (a), // 2
- "m" (lda), // 3
- "m" (x), // 4
- "m" (y) // 5
- : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
- "%xmm0", "%xmm1",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
- "memory"
- );
-
-}
-
-static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y)
-{
-
-
- __asm__ __volatile__
- (
- "movq %0, %%rax\n\t" // n -> rax
- "vmovss %1, %%xmm1\n\t" // alpha -> xmm1
- "movq %2, %%rsi\n\t" // adress of a -> rsi
- "movq %3, %%rcx\n\t" // value of lda > rcx
- "movq %4, %%rdi\n\t" // adress of x -> rdi
- "movq %5, %%rdx\n\t" // adress of y -> rdx
-
- "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
- "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero
-
- ".L01LOOP%=: \n\t"
- "vmovss (%%rdi), %%xmm0 \n\t" // load values of c
-
- "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp
- "vmulps 1*4(%%rsi), %%xmm0, %%xmm5 \n\t" // multiply a and c and add to temp
-
- "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp
- "vaddps %%xmm13, %%xmm5, %%xmm13 \n\t" // multiply a and c and add to temp
-
- "addq $4 , %%rdi \n\t" // increment pointer of c
- "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
-
- "dec %%rax \n\t" // n = n -1
- "jnz .L01LOOP%= \n\t"
-
- "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
- "vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha
-
- "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y
- "vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y
-
- :
- :
- "m" (n), // 0
- "m" (alpha), // 1
- "m" (a), // 2
- "m" (lda), // 3
- "m" (x), // 4
- "m" (y) // 5
- : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
- "%xmm0", "%xmm1",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
- "memory"
- );
-
-}
-
-
-
-static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y)
-{
-
-
- __asm__ __volatile__
- (
- "movq %0, %%rax\n\t" // n -> rax
- "vmovss %1, %%xmm1\n\t" // alpha -> xmm1
- "movq %2, %%rsi\n\t" // adress of a -> rsi
- "movq %3, %%rcx\n\t" // value of lda > rcx
- "movq %4, %%rdi\n\t" // adress of x -> rdi
- "movq %5, %%rdx\n\t" // adress of y -> rdx
-
- "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
-
- ".L01LOOP%=: \n\t"
- "vmovss (%%rdi), %%xmm0 \n\t" // load values of c
- "addq $4 , %%rdi \n\t" // increment pointer of c
-
- "vmulss 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp
- "vaddss %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp
-
- "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a
-
- "dec %%rax \n\t" // n = n -1
- "jnz .L01LOOP%= \n\t"
-
- "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha
-
- "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y
-
- :
- :
- "m" (n), // 0
- "m" (alpha), // 1
- "m" (a), // 2
- "m" (lda), // 3
- "m" (x), // 4
- "m" (y) // 5
- : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
- "%xmm0", "%xmm1",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
- "memory"
- );
-
-}
-
-
diff --git a/kernel/x86_64/sgemv_t.c b/kernel/x86_64/sgemv_t.c
index 532afee5d..f7643101e 100644
--- a/kernel/x86_64/sgemv_t.c
+++ b/kernel/x86_64/sgemv_t.c
@@ -28,16 +28,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
-#if defined(BULLDOZER) || defined(PILEDRIVER)
-#include "sgemv_t_microk_bulldozer-2.c"
-#elif defined(HASWELL)
-#include "sgemv_t_microk_haswell-2.c"
-#elif defined(SANDYBRIDGE)
-#include "sgemv_t_microk_sandy-2.c"
-#elif defined(NEHALEM)
-#include "sgemv_t_microk_nehalem-2.c"
-#endif
-
#define NBMAX 4096
#ifndef HAVE_KERNEL_16x4
diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c
new file mode 100644
index 000000000..b0e883252
--- /dev/null
+++ b/kernel/x86_64/sgemv_t_4.c
@@ -0,0 +1,624 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+#if defined(NEHALEM)
+#include "sgemv_t_microk_nehalem-4.c"
+#elif defined(BULLDOZER) || defined(PILEDRIVER)
+#include "sgemv_t_microk_bulldozer-4.c"
+#elif defined(SANDYBRIDGE)
+#include "sgemv_t_microk_sandy-4.c"
+#elif defined(HASWELL)
+#include "sgemv_t_microk_haswell-4.c"
+#endif
+
+#define NBMAX 4096
+
+#ifndef HAVE_KERNEL_4x4
+
+static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
+{
+ BLASLONG i;
+ FLOAT *a0,*a1,*a2,*a3;
+ a0 = ap[0];
+ a1 = ap[1];
+ a2 = ap[2];
+ a3 = ap[3];
+ FLOAT temp0 = 0.0;
+ FLOAT temp1 = 0.0;
+ FLOAT temp2 = 0.0;
+ FLOAT temp3 = 0.0;
+
+ for ( i=0; i< n; i+=4 )
+ {
+ temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3];
+ temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3];
+ temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3];
+ temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3];
+ }
+ y[0] = temp0;
+ y[1] = temp1;
+ y[2] = temp2;
+ y[3] = temp3;
+}
+
+#endif
+
+static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y)
+{
+ BLASLONG i;
+
+ i=0;
+
+ __asm__ __volatile__
+ (
+ "xorps %%xmm10 , %%xmm10 \n\t"
+ "xorps %%xmm11 , %%xmm11 \n\t"
+
+ "testq $4 , %1 \n\t"
+ "jz .L01LABEL%= \n\t"
+
+ "movups (%5,%0,4) , %%xmm14 \n\t" // x
+ "movups (%3,%0,4) , %%xmm12 \n\t" // ap0
+ "movups (%4,%0,4) , %%xmm13 \n\t" // ap1
+ "mulps %%xmm14 , %%xmm12 \n\t"
+ "mulps %%xmm14 , %%xmm13 \n\t"
+ "addq $4 , %0 \n\t"
+ "addps %%xmm12 , %%xmm10 \n\t"
+ "subq $4 , %1 \n\t"
+ "addps %%xmm13 , %%xmm11 \n\t"
+
+ ".L01LABEL%=: \n\t"
+
+ "cmpq $0, %1 \n\t"
+ "je .L01END%= \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+
+ "movups (%5,%0,4) , %%xmm14 \n\t" // x
+ "movups (%3,%0,4) , %%xmm12 \n\t" // ap0
+ "movups (%4,%0,4) , %%xmm13 \n\t" // ap1
+ "mulps %%xmm14 , %%xmm12 \n\t"
+ "mulps %%xmm14 , %%xmm13 \n\t"
+ "addps %%xmm12 , %%xmm10 \n\t"
+ "addps %%xmm13 , %%xmm11 \n\t"
+
+ "movups 16(%5,%0,4) , %%xmm14 \n\t" // x
+ "movups 16(%3,%0,4) , %%xmm12 \n\t" // ap0
+ "movups 16(%4,%0,4) , %%xmm13 \n\t" // ap1
+ "mulps %%xmm14 , %%xmm12 \n\t"
+ "mulps %%xmm14 , %%xmm13 \n\t"
+ "addps %%xmm12 , %%xmm10 \n\t"
+ "addps %%xmm13 , %%xmm11 \n\t"
+
+ "addq $8 , %0 \n\t"
+ "subq $8 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ ".L01END%=: \n\t"
+
+ "haddps %%xmm10, %%xmm10 \n\t"
+ "haddps %%xmm11, %%xmm11 \n\t"
+ "haddps %%xmm10, %%xmm10 \n\t"
+ "haddps %%xmm11, %%xmm11 \n\t"
+
+ "movss %%xmm10, (%2) \n\t"
+ "movss %%xmm11,4(%2) \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (y), // 2
+ "r" (ap0), // 3
+ "r" (ap1), // 4
+ "r" (x) // 5
+ : "cc",
+ "%xmm4", "%xmm5", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+
+}
+
+static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
+{
+ BLASLONG i;
+
+ i=0;
+
+ __asm__ __volatile__
+ (
+ "xorps %%xmm9 , %%xmm9 \n\t"
+ "xorps %%xmm10 , %%xmm10 \n\t"
+
+ "testq $4 , %1 \n\t"
+ "jz .L01LABEL%= \n\t"
+
+ "movups (%3,%0,4) , %%xmm12 \n\t"
+ "movups (%4,%0,4) , %%xmm11 \n\t"
+ "mulps %%xmm11 , %%xmm12 \n\t"
+ "addq $4 , %0 \n\t"
+ "addps %%xmm12 , %%xmm10 \n\t"
+ "subq $4 , %1 \n\t"
+
+ ".L01LABEL%=: \n\t"
+
+ "cmpq $0, %1 \n\t"
+ "je .L01END%= \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+
+ "movups (%3,%0,4) , %%xmm12 \n\t"
+ "movups 16(%3,%0,4) , %%xmm14 \n\t"
+ "movups (%4,%0,4) , %%xmm11 \n\t"
+ "movups 16(%4,%0,4) , %%xmm13 \n\t"
+ "mulps %%xmm11 , %%xmm12 \n\t"
+ "mulps %%xmm13 , %%xmm14 \n\t"
+ "addq $8 , %0 \n\t"
+ "addps %%xmm12 , %%xmm10 \n\t"
+ "subq $8 , %1 \n\t"
+ "addps %%xmm14 , %%xmm9 \n\t"
+
+ "jnz .L01LOOP%= \n\t"
+
+ ".L01END%=: \n\t"
+
+ "addps %%xmm9 , %%xmm10 \n\t"
+ "haddps %%xmm10, %%xmm10 \n\t"
+ "haddps %%xmm10, %%xmm10 \n\t"
+
+ "movss %%xmm10, (%2) \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (y), // 2
+ "r" (ap), // 3
+ "r" (x) // 4
+ : "cc",
+ "%xmm9", "%xmm10" ,
+ "%xmm11", "%xmm12", "%xmm13", "%xmm14",
+ "memory"
+ );
+
+
+}
+
+static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
+{
+ BLASLONG i;
+ for ( i=0; i<n; i++ )
+ {
+ *dest = *src;
+ dest++;
+ src += inc_src;
+ }
+}
+
+static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline));
+
+static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
+{
+
+ BLASLONG i;
+
+ if ( inc_dest != 1 )
+ {
+ for ( i=0; i<n; i++ )
+ {
+ *dest += src[i] * da;
+ dest += inc_dest;
+ }
+ return;
+ }
+
+ i=0;
+
+ __asm__ __volatile__
+ (
+ "movss (%2) , %%xmm10 \n\t"
+ "shufps $0 , %%xmm10 , %%xmm10 \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+
+ "movups (%3,%0,4) , %%xmm12 \n\t"
+ "movups (%4,%0,4) , %%xmm11 \n\t"
+ "mulps %%xmm10 , %%xmm12 \n\t"
+ "addq $4 , %0 \n\t"
+ "addps %%xmm12 , %%xmm11 \n\t"
+ "subq $4 , %1 \n\t"
+ "movups %%xmm11, -16(%4,%0,4) \n\t"
+
+ "jnz .L01LOOP%= \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (&da), // 2
+ "r" (src), // 3
+ "r" (dest) // 4
+ : "cc",
+ "%xmm10", "%xmm11", "%xmm12",
+ "memory"
+ );
+
+
+}
+
/*
 * Transposed single-precision GEMV:  y := y + alpha * A**T * x
 * (each element of y accumulates the dot product of one column of A
 * with x).
 *
 *  m       rows of A / length of x contribution
 *  n       columns of A / length of y
 *  dummy1  unused
 *  alpha   scalar multiplier
 *  a, lda  column-major matrix and leading dimension
 *  x, inc_x / y, inc_y  strided vectors
 *  buffer  work area: the first NBMAX floats hold a contiguous copy of
 *          x when inc_x != 1, the floats after that are used as a
 *          temporary y accumulator (assumes the caller-provided buffer
 *          is at least 2*NBMAX floats — TODO confirm against caller).
 *
 * Strategy: process the m dimension in blocks of at most NBMAX rows;
 * within a block handle 4 columns at a time with sgemv_kernel_4x4,
 * then a 2-column and a 1-column kernel for the n remainder.  The
 * final m%4 rows are handled by scalar tail code with fast paths for
 * packed matrices (lda == m3) and unit-stride y.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG register i;
	BLASLONG register j;
	FLOAT *a_ptr;
	FLOAT *x_ptr;
	FLOAT *y_ptr;
	BLASLONG n0;
	BLASLONG n1;
	BLASLONG m1;
	BLASLONG m2;
	BLASLONG m3;
	BLASLONG n2;
	FLOAT ybuffer[4],*xbuffer;
	FLOAT *ytemp;

	if ( m < 1 ) return(0);
	if ( n < 1 ) return(0);

	xbuffer = buffer;
	/* Temporary y accumulator lives right after the x copy. */
	ytemp = buffer + NBMAX;

	/* Column-count decomposition: n0 groups of NBMAX columns (each
	 * group's results fit into ytemp), then n1 groups of 4 columns,
	 * then n2 = n%4 leftover columns. */
	n0 = n / NBMAX;
	n1 = (n % NBMAX) >> 2 ;
	n2 = n & 3 ;

	/* Row-count decomposition: m3 = m%4 tail rows handled by scalar
	 * code; m1 = rows rounded down to a multiple of 4; m2 = rows of
	 * the last partial NBMAX block (excluding the m3 tail). */
	m3 = m & 3 ;
	m1 = m & -4 ;
	m2 = (m & (NBMAX-1)) - m3 ;


	BLASLONG NB = NBMAX;

	/* Block loop over the m dimension: full NBMAX blocks, then one
	 * final block of NB = m2 rows (loop exits after it since the
	 * condition checks NB == NBMAX). */
	while ( NB == NBMAX )
	{

		m1 -= NB;
		if ( m1 < 0)
		{
			if ( m2 == 0 ) break;
			NB = m2;
		}

		y_ptr = y;
		a_ptr = a;
		x_ptr = x;

		/* Unit-stride x is used in place; otherwise gather the
		 * current NB elements into the contiguous buffer. */
		if ( inc_x == 1 )
			xbuffer = x_ptr;
		else
			copy_x(NB,x_ptr,xbuffer,inc_x);


		/* Four consecutive column pointers for the 4x4 kernel. */
		FLOAT *ap[4];
		FLOAT *yp;
		BLASLONG register lda4 = 4 * lda;
		ap[0] = a_ptr;
		ap[1] = a_ptr + lda;
		ap[2] = ap[1] + lda;
		ap[3] = ap[2] + lda;

		if ( n0 > 0 )
		{
			/* Process NBMAX columns per outer iteration: nb1
			 * kernel calls fill ytemp with nb1*4 dot products,
			 * then add_y scales by alpha and accumulates into y. */
			BLASLONG nb1 = NBMAX / 4;
			for( j=0; j<n0; j++)
			{

				yp = ytemp;
				for( i = 0; i < nb1 ; i++)
				{
					sgemv_kernel_4x4(NB,ap,xbuffer,yp);
					ap[0] += lda4 ;
					ap[1] += lda4 ;
					ap[2] += lda4 ;
					ap[3] += lda4 ;
					yp += 4;
				}
				add_y(nb1*4, alpha, ytemp, y_ptr, inc_y );
				y_ptr += nb1 * inc_y * 4;
				/* keep a_ptr in sync with ap[0] */
				a_ptr += nb1 * lda4 ;

			}

		}


		/* Remaining groups of 4 columns (n1 of them). */
		yp = ytemp;

		for( i = 0; i < n1 ; i++)
		{
			sgemv_kernel_4x4(NB,ap,xbuffer,yp);
			ap[0] += lda4 ;
			ap[1] += lda4 ;
			ap[2] += lda4 ;
			ap[3] += lda4 ;
			yp += 4;
		}
		if ( n1 > 0 )
		{
			add_y(n1*4, alpha, ytemp, y_ptr, inc_y );
			y_ptr += n1 * inc_y * 4;
			a_ptr += n1 * lda4 ;
		}

		/* Leftover columns: 2-wide then 1-wide kernel; these store
		 * raw dot products into ybuffer, alpha is applied here. */
		if ( n2 & 2 )
		{

			sgemv_kernel_4x2(NB,ap[0],ap[1],xbuffer,ybuffer);
			a_ptr += lda * 2;
			*y_ptr += ybuffer[0] * alpha;
			y_ptr  += inc_y;
			*y_ptr += ybuffer[1] * alpha;
			y_ptr  += inc_y;

		}

		if ( n2 & 1 )
		{

			sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer);
			a_ptr += lda;
			*y_ptr += ybuffer[0] * alpha;
			y_ptr += inc_y;

		}
		/* Advance to the next row block. */
		a += NB;
		x += NB * inc_x;
	}

	if ( m3 == 0 ) return(0);

	/* Scalar tail for the last m3 (= m%4) rows; a and x already point
	 * past the blocked rows. */
	x_ptr = x;
	a_ptr = a;
	if ( m3 == 3 )
	{
		/* Pre-scale the three x values by alpha once. */
		FLOAT xtemp0 = *x_ptr * alpha;
		x_ptr += inc_x;
		FLOAT xtemp1 = *x_ptr * alpha;
		x_ptr += inc_x;
		FLOAT xtemp2 = *x_ptr * alpha;

		FLOAT *aj = a_ptr;
		y_ptr = y;

		/* Fast path: densely packed 3-row matrix, unit-stride y. */
		if ( lda == 3 && inc_y == 1 )
		{

			for ( j=0; j< ( n & -4) ; j+=4 )
			{

				y_ptr[j]   += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2]  * xtemp2;
				y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5]  * xtemp2;
				y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8]  * xtemp2;
				y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;
				aj += 12;
			}

			for ( ; j<n; j++ )
			{
				y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
				aj += 3;
			}

		}
		else
		{

			/* Unit-stride y, general lda: 4-way unrolled columns. */
			if ( inc_y == 1 )
			{

				BLASLONG register lda2 = lda << 1;
				BLASLONG register lda4 = lda << 2;
				BLASLONG register lda3 = lda2 + lda;

				for ( j=0; j< ( n & -4 ); j+=4 )
				{

					y_ptr[j]   += *aj        * xtemp0 + *(aj+1)      * xtemp1 + *(aj+2)      * xtemp2;
					y_ptr[j+1] += *(aj+lda)  * xtemp0 + *(aj+lda+1)  * xtemp1 + *(aj+lda+2)  * xtemp2;
					y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 + *(aj+lda2+2) * xtemp2;
					y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 + *(aj+lda3+2) * xtemp2;
					aj += lda4;
				}

				for ( ; j< n ; j++ )
				{

					y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2 ;
					aj += lda;
				}

			}
			else
			{

				/* Fully general strided case. */
				for ( j=0; j<n; j++ )
				{
					*y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2;
					y_ptr += inc_y;
					aj += lda;
				}


			}

		}
		return(0);
	}

	if ( m3 == 2 )
	{
		/* Two tail rows; same structure as the m3 == 3 case. */
		FLOAT xtemp0 = *x_ptr * alpha;
		x_ptr += inc_x;
		FLOAT xtemp1 = *x_ptr * alpha;

		FLOAT *aj = a_ptr;
		y_ptr = y;

		if ( lda == 2 && inc_y == 1 )
		{

			for ( j=0; j< ( n & -4) ; j+=4 )
			{
				y_ptr[j]   += aj[0] * xtemp0 + aj[1] * xtemp1 ;
				y_ptr[j+1] += aj[2] * xtemp0 + aj[3] * xtemp1 ;
				y_ptr[j+2] += aj[4] * xtemp0 + aj[5] * xtemp1 ;
				y_ptr[j+3] += aj[6] * xtemp0 + aj[7] * xtemp1 ;
				aj += 8;

			}

			for ( ; j<n; j++ )
			{
				y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ;
				aj += 2;
			}

		}
		else
		{
			if ( inc_y == 1 )
			{

				BLASLONG register lda2 = lda << 1;
				BLASLONG register lda4 = lda << 2;
				BLASLONG register lda3 = lda2 + lda;

				for ( j=0; j< ( n & -4 ); j+=4 )
				{

					y_ptr[j]   += *aj        * xtemp0 + *(aj+1)      * xtemp1 ;
					y_ptr[j+1] += *(aj+lda)  * xtemp0 + *(aj+lda+1)  * xtemp1 ;
					y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 ;
					y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 ;
					aj += lda4;
				}

				for ( ; j< n ; j++ )
				{

					y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ;
					aj += lda;
				}

			}
			else
			{
				for ( j=0; j<n; j++ )
				{
					*y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 ;
					y_ptr += inc_y;
					aj += lda;
				}
			}

		}
		return(0);

	}

	/* Single tail row (m3 == 1). */
	FLOAT xtemp = *x_ptr * alpha;
	FLOAT *aj = a_ptr;
	y_ptr = y;
	if ( lda == 1 && inc_y == 1 )
	{
		/* A row is contiguous: simple axpy-like loop. */
		for ( j=0; j< ( n & -4) ; j+=4 )
		{
			y_ptr[j]   += aj[j]   * xtemp;
			y_ptr[j+1] += aj[j+1] * xtemp;
			y_ptr[j+2] += aj[j+2] * xtemp;
			y_ptr[j+3] += aj[j+3] * xtemp;
		}
		for ( ; j<n ; j++ )
		{
			y_ptr[j] += aj[j] * xtemp;
		}



	}
	else
	{
		if ( inc_y == 1 )
		{

			BLASLONG register lda2 = lda << 1;
			BLASLONG register lda4 = lda << 2;
			BLASLONG register lda3 = lda2 + lda;
			for ( j=0; j< ( n & -4 ); j+=4 )
			{
				y_ptr[j]   += *aj        * xtemp;
				y_ptr[j+1] += *(aj+lda)  * xtemp;
				y_ptr[j+2] += *(aj+lda2) * xtemp;
				y_ptr[j+3] += *(aj+lda3) * xtemp;
				aj += lda4 ;
			}

			for ( ; j<n; j++ )
			{
				y_ptr[j] += *aj * xtemp;
				aj += lda;
			}

		}
		else
		{
			for ( j=0; j<n; j++ )
			{
				*y_ptr += *aj * xtemp;
				y_ptr += inc_y;
				aj += lda;
			}

		}
	}

	return(0);
}
+
+
diff --git a/kernel/x86_64/sgemv_t_avx.c b/kernel/x86_64/sgemv_t_avx.c
deleted file mode 100644
index 55fb3d623..000000000
--- a/kernel/x86_64/sgemv_t_avx.c
+++ /dev/null
@@ -1,232 +0,0 @@
-/***************************************************************************
-Copyright (c) 2014, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-
-#include "common.h"
-
-#if defined(BULLDOZER) || defined(PILEDRIVER)
-#include "sgemv_t_microk_bulldozer.c"
-#elif defined(HASWELL)
-#include "sgemv_t_microk_haswell.c"
-#else
-#include "sgemv_t_microk_sandy.c"
-#endif
-
-static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
-{
- BLASLONG i;
- for ( i=0; i<n; i++ )
- {
- *dest = *src;
- dest++;
- src += inc_src;
- }
-}
-
-static void sgemv_kernel_1( BLASLONG n, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, FLOAT *y)
-{
-
- FLOAT register temp0 = 0.0;
- BLASLONG i;
- for ( i=0; i<n ; i++)
- {
- temp0 += a[i] * x[i];
- }
- temp0 *= alpha ;
- *y += temp0;
-}
-
-
-
-
-int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
-{
- BLASLONG i;
- BLASLONG j;
- FLOAT *a_ptr;
- FLOAT *x_ptr;
- FLOAT *y_ptr;
- FLOAT *a_ptrl;
- BLASLONG m1;
- BLASLONG register m2;
- FLOAT *xbuffer;
- xbuffer = buffer;
- BLASLONG register Mblock;
-
- m1 = m / 1024 ;
- m2 = m % 1024 ;
-
- x_ptr = x;
- a_ptr = a;
-
- for (j=0; j<m1; j++)
- {
-
- if ( inc_x == 1 )
- xbuffer = x_ptr;
- else
- copy_x(1024,x_ptr,xbuffer,inc_x);
-
- y_ptr = y;
- a_ptrl = a_ptr;
-
- for(i = 0; i<n; i++ )
- {
- sgemv_kernel_16(1024,alpha,a_ptrl,lda,xbuffer,y_ptr);
- y_ptr += inc_y;
- a_ptrl += lda;
- }
- a_ptr += 1024;
- x_ptr += 1024 * inc_x;
- }
-
- if ( m2 == 0 ) return(0);
-
- Mblock = 512;
- while ( Mblock >= 16 )
- {
- if ( m2 & Mblock)
- {
-
- if ( inc_x == 1 )
- xbuffer = x_ptr;
- else
- copy_x(Mblock,x_ptr,xbuffer,inc_x);
-
- y_ptr = y;
- a_ptrl = a_ptr;
-
- for(i = 0; i<n; i++ )
- {
- sgemv_kernel_16(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr);
- y_ptr += inc_y;
- a_ptrl += lda;
- }
- a_ptr += Mblock;
- x_ptr += Mblock * inc_x;
-
-
- }
- Mblock /= 2;
-
- }
-
- if ( m2 & Mblock)
- {
-
- if ( inc_x == 1 )
- xbuffer = x_ptr;
- else
- copy_x(Mblock,x_ptr,xbuffer,inc_x);
-
- y_ptr = y;
- a_ptrl = a_ptr;
-
- for(i = 0; i<n; i++ )
- {
- sgemv_kernel_1(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr);
- y_ptr += inc_y;
- a_ptrl += lda;
- }
- a_ptr += Mblock;
- x_ptr += Mblock * inc_x;
-
-
- }
- Mblock /= 2;
-
-
- if ( m2 & Mblock)
- {
-
- if ( inc_x == 1 )
- xbuffer = x_ptr;
- else
- copy_x(Mblock,x_ptr,xbuffer,inc_x);
-
- y_ptr = y;
- a_ptrl = a_ptr;
-
- for(i = 0; i<n; i++ )
- {
- sgemv_kernel_1(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr);
- y_ptr += inc_y;
- a_ptrl += lda;
- }
- a_ptr += Mblock;
- x_ptr += Mblock * inc_x;
-
-
- }
- Mblock /= 2;
-
- if ( m2 & Mblock)
- {
-
- if ( inc_x == 1 )
- xbuffer = x_ptr;
- else
- copy_x(Mblock,x_ptr,xbuffer,inc_x);
-
- y_ptr = y;
- a_ptrl = a_ptr;
-
- for(i = 0; i<n; i++ )
- {
- sgemv_kernel_1(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr);
- y_ptr += inc_y;
- a_ptrl += lda;
- }
- a_ptr += Mblock;
- x_ptr += Mblock * inc_x;
-
-
- }
- Mblock /= 2;
-
- if ( m2 & Mblock)
- {
-
- xbuffer = x_ptr;
-
- y_ptr = y;
- a_ptrl = a_ptr;
-
- for(i = 0; i<n; i++ )
- {
- sgemv_kernel_1(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr);
- y_ptr += inc_y;
- a_ptrl += lda;
- }
-
-
- }
-
- return(0);
-}
-
-
diff --git a/kernel/x86_64/sgemv_t_microk_bulldozer-2.c b/kernel/x86_64/sgemv_t_microk_bulldozer-4.c
index e4498afa3..40e318de3 100644
--- a/kernel/x86_64/sgemv_t_microk_bulldozer-2.c
+++ b/kernel/x86_64/sgemv_t_microk_bulldozer-4.c
@@ -25,10 +25,10 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#define HAVE_KERNEL_16x4 1
-static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+#define HAVE_KERNEL_4x4 1
+static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
+static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
BLASLONG register i = 0;
@@ -40,38 +40,76 @@ static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"vxorps %%xmm6, %%xmm6, %%xmm6 \n\t"
"vxorps %%xmm7, %%xmm7, %%xmm7 \n\t"
- ".align 16 \n\t"
- ".L01LOOP%=: \n\t"
- "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
+ "testq $0x04, %1 \n\t"
+ "jz .L08LABEL%= \n\t"
+
+ "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
+ "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, (%5,%0,4), %%xmm12, %%xmm5 \n\t"
+ "vfmaddps %%xmm6, (%6,%0,4), %%xmm12, %%xmm6 \n\t"
+ "vfmaddps %%xmm7, (%7,%0,4), %%xmm12, %%xmm7 \n\t"
+ "addq $4 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+
+ ".L08LABEL%=: \n\t"
+
+ "testq $0x08, %1 \n\t"
+ "jz .L16LABEL%= \n\t"
+
+ "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
+ "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
+ "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, (%5,%0,4), %%xmm12, %%xmm5 \n\t"
+ "vfmaddps %%xmm6, (%6,%0,4), %%xmm12, %%xmm6 \n\t"
+ "vfmaddps %%xmm7, (%7,%0,4), %%xmm12, %%xmm7 \n\t"
+ "vfmaddps %%xmm4, 16(%4,%0,4), %%xmm13, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
+ "vfmaddps %%xmm6, 16(%6,%0,4), %%xmm13, %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 16(%7,%0,4), %%xmm13, %%xmm7 \n\t"
+
+ "addq $8 , %0 \n\t"
+ "subq $8 , %1 \n\t"
+
+ ".L16LABEL%=: \n\t"
+
+ "cmpq $0, %1 \n\t"
+ "je .L16END%= \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
"prefetcht0 384(%4,%0,4) \n\t"
"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, (%5,%0,4), %%xmm12, %%xmm5 \n\t"
- "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
+ "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
"vfmaddps %%xmm6, (%6,%0,4), %%xmm12, %%xmm6 \n\t"
"vfmaddps %%xmm7, (%7,%0,4), %%xmm12, %%xmm7 \n\t"
"prefetcht0 384(%5,%0,4) \n\t"
+ ".align 2 \n\t"
"vfmaddps %%xmm4, 16(%4,%0,4), %%xmm13, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
- "vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x
+ "vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x
"vfmaddps %%xmm6, 16(%6,%0,4), %%xmm13, %%xmm6 \n\t"
"vfmaddps %%xmm7, 16(%7,%0,4), %%xmm13, %%xmm7 \n\t"
"prefetcht0 384(%6,%0,4) \n\t"
+ ".align 2 \n\t"
"vfmaddps %%xmm4, 32(%4,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, 32(%5,%0,4), %%xmm14, %%xmm5 \n\t"
- "vmovups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x
+ "vmovups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x
"vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t"
"vfmaddps %%xmm7, 32(%7,%0,4), %%xmm14, %%xmm7 \n\t"
"prefetcht0 384(%7,%0,4) \n\t"
"vfmaddps %%xmm4, 48(%4,%0,4), %%xmm15, %%xmm4 \n\t"
- "vfmaddps %%xmm5, 48(%5,%0,4), %%xmm15, %%xmm5 \n\t"
- "vfmaddps %%xmm6, 48(%6,%0,4), %%xmm15, %%xmm6 \n\t"
- "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t"
+ "addq $16, %0 \n\t"
+ "vfmaddps %%xmm5,-16(%5,%0,4), %%xmm15, %%xmm5 \n\t"
+ "vfmaddps %%xmm6,-16(%6,%0,4), %%xmm15, %%xmm6 \n\t"
+ "subq $16, %1 \n\t"
+ "vfmaddps %%xmm7,-16(%7,%0,4), %%xmm15, %%xmm7 \n\t"
- "addq $16, %0 \n\t"
- "subq $16, %1 \n\t"
- "jnz .L01LOOP%= \n\t"
+ "jnz .L01LOOP%= \n\t"
+ ".L16END%=: \n\t"
"vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t"
"vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t"
"vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t"
diff --git a/kernel/x86_64/sgemv_t_microk_bulldozer.c b/kernel/x86_64/sgemv_t_microk_bulldozer.c
deleted file mode 100644
index 56b12a1e8..000000000
--- a/kernel/x86_64/sgemv_t_microk_bulldozer.c
+++ /dev/null
@@ -1,99 +0,0 @@
-/***************************************************************************
-Copyright (c) 2014, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y)
-{
-
- //n = n / 16;
-
- __asm__ __volatile__
- (
- "movq %0, %%rax\n\t" // n -> rax
- "vmovss %1, %%xmm1\n\t" // alpha -> xmm1
- "movq %2, %%rsi\n\t" // adress of a -> rsi
- "movq %3, %%rcx\n\t" // value of lda > rcx
- "movq %4, %%rdi\n\t" // adress of x -> rdi
- "movq %5, %%rdx\n\t" // adress of y -> rdx
-
- "leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float
- "leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line
-
- "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
- "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero
- "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero
- "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero
-
- "sarq $4, %%rax \n\t" // n = n / 16
-
- ".align 16 \n\t"
- ".L01LOOP%=: \n\t"
- // "prefetcht0 512(%%rsi) \n\t"
- "prefetcht0 (%%r8) \n\t" //prefetch next line of a
- "vmovups (%%rsi), %%xmm4 \n\t"
- "vmovups 4*4(%%rsi), %%xmm5 \n\t"
- "vmovups 8*4(%%rsi), %%xmm6 \n\t"
- "vmovups 12*4(%%rsi), %%xmm7 \n\t"
-
- "vfmaddps %%xmm12, 0*4(%%rdi), %%xmm4, %%xmm12\n\t" // multiply a and c and add to temp
- "vfmaddps %%xmm13, 4*4(%%rdi), %%xmm5, %%xmm13\n\t" // multiply a and c and add to temp
- "vfmaddps %%xmm14, 8*4(%%rdi), %%xmm6, %%xmm14\n\t" // multiply a and c and add to temp
- "vfmaddps %%xmm15, 12*4(%%rdi), %%xmm7, %%xmm15\n\t" // multiply a and c and add to temp
-
- "addq $16*4 , %%r8 \n\t" // increment prefetch pointer
- "addq $16*4 , %%rsi \n\t" // increment pointer of a
- "addq $16*4 , %%rdi \n\t" // increment pointer of c
- "dec %%rax \n\t" // n = n -1
- "jnz .L01LOOP%= \n\t"
-
- "vaddps %%xmm12, %%xmm14, %%xmm12\n\t"
- "vaddps %%xmm13, %%xmm15, %%xmm13\n\t"
- "vaddps %%xmm12, %%xmm13, %%xmm12\n\t"
- "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t"
- "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t"
-
- "vfmaddss (%%rdx), %%xmm12, %%xmm1, %%xmm12\n\t"
- "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y
-
- :
- :
- "m" (n), // 0
- "m" (alpha), // 1
- "m" (a), // 2
- "m" (lda), // 3
- "m" (x), // 4
- "m" (y) // 5
- : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
- "%xmm0", "%xmm1",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7",
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
- "memory"
- );
-
-}
-
-
-
diff --git a/kernel/x86_64/sgemv_t_microk_haswell-2.c b/kernel/x86_64/sgemv_t_microk_haswell-4.c
index e6d47270d..016cb35e7 100644
--- a/kernel/x86_64/sgemv_t_microk_haswell-2.c
+++ b/kernel/x86_64/sgemv_t_microk_haswell-4.c
@@ -25,10 +25,10 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#define HAVE_KERNEL_16x4 1
-static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+#define HAVE_KERNEL_4x4 1
+static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
+static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
BLASLONG register i = 0;
@@ -41,6 +41,40 @@ static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"vxorps %%ymm6 , %%ymm6, %%ymm6 \n\t"
"vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t"
+ "testq $0x04, %1 \n\t"
+ "jz .L08LABEL%= \n\t"
+
+ "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
+
+ "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t"
+ "vfmadd231ps (%5,%0,4), %%xmm12, %%xmm5 \n\t"
+ "vfmadd231ps (%6,%0,4), %%xmm12, %%xmm6 \n\t"
+ "vfmadd231ps (%7,%0,4), %%xmm12, %%xmm7 \n\t"
+
+ "addq $4 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+
+ ".L08LABEL%=: \n\t"
+
+ "testq $0x08, %1 \n\t"
+ "jz .L16LABEL%= \n\t"
+
+ "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x
+
+ "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
+ "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm5 \n\t"
+ "vfmadd231ps (%6,%0,4), %%ymm12, %%ymm6 \n\t"
+ "vfmadd231ps (%7,%0,4), %%ymm12, %%ymm7 \n\t"
+
+ "addq $8 , %0 \n\t"
+ "subq $8 , %1 \n\t"
+
+ ".L16LABEL%=: \n\t"
+
+ "cmpq $0, %1 \n\t"
+ "je .L16END%= \n\t"
+
+
".align 16 \n\t"
".L01LOOP%=: \n\t"
"prefetcht0 384(%2,%0,4) \n\t"
@@ -64,6 +98,8 @@ static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"subq $16, %1 \n\t"
"jnz .L01LOOP%= \n\t"
+ ".L16END%=: \n\t"
+
"vextractf128 $1 , %%ymm4, %%xmm12 \n\t"
"vextractf128 $1 , %%ymm5, %%xmm13 \n\t"
"vextractf128 $1 , %%ymm6, %%xmm14 \n\t"
diff --git a/kernel/x86_64/sgemv_t_microk_haswell.c b/kernel/x86_64/sgemv_t_microk_haswell.c
deleted file mode 100644
index ecb9845bb..000000000
--- a/kernel/x86_64/sgemv_t_microk_haswell.c
+++ /dev/null
@@ -1,100 +0,0 @@
-/***************************************************************************
-Copyright (c) 2014, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y)
-{
-
- //n = n / 16;
-
- __asm__ __volatile__
- (
- "movq %0, %%rax\n\t" // n -> rax
- "vmovss %1, %%xmm1\n\t" // alpha -> xmm1
- "movq %2, %%rsi\n\t" // adress of a -> rsi
- "movq %3, %%rcx\n\t" // value of lda > rcx
- "movq %4, %%rdi\n\t" // adress of x -> rdi
- "movq %5, %%rdx\n\t" // adress of y -> rdx
-
- "leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float
- "leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line
-
- "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
- "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero
- "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero
- "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero
-
- "sarq $4, %%rax \n\t" // n = n / 16
-
- ".align 16 \n\t"
- ".L01LOOP%=: \n\t"
- // "prefetcht0 512(%%rsi) \n\t"
- "prefetcht0 (%%r8) \n\t" //prefetch next line of a
- "vmovups (%%rsi), %%xmm4 \n\t"
- "vmovups 4*4(%%rsi), %%xmm5 \n\t"
- "vmovups 8*4(%%rsi), %%xmm6 \n\t"
- "vmovups 12*4(%%rsi), %%xmm7 \n\t"
-
- "vfmadd231ps 0*4(%%rdi), %%xmm4, %%xmm12\n\t" // multiply a and c and add to temp
- "vfmadd231ps 4*4(%%rdi), %%xmm5, %%xmm13\n\t" // multiply a and c and add to temp
- "vfmadd231ps 8*4(%%rdi), %%xmm6, %%xmm14\n\t" // multiply a and c and add to temp
- "vfmadd231ps 12*4(%%rdi), %%xmm7, %%xmm15\n\t" // multiply a and c and add to temp
-
- "addq $16*4 , %%r8 \n\t" // increment prefetch pointer
- "addq $16*4 , %%rsi \n\t" // increment pointer of a
- "addq $16*4 , %%rdi \n\t" // increment pointer of c
- "dec %%rax \n\t" // n = n -1
- "jnz .L01LOOP%= \n\t"
-
- "vaddps %%xmm12, %%xmm14, %%xmm12\n\t"
- "vaddps %%xmm13, %%xmm15, %%xmm13\n\t"
- "vaddps %%xmm12, %%xmm13, %%xmm12\n\t"
- "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t"
- "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t"
-
- "vmulss %%xmm12, %%xmm1, %%xmm12\n\t"
- "vaddss (%%rdx), %%xmm12,%%xmm12\n\t"
- "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y
-
- :
- :
- "m" (n), // 0
- "m" (alpha), // 1
- "m" (a), // 2
- "m" (lda), // 3
- "m" (x), // 4
- "m" (y) // 5
- : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8",
- "%xmm0", "%xmm1",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7",
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
- "memory"
- );
-
-}
-
-
-
diff --git a/kernel/x86_64/sgemv_t_microk_nehalem-2.c b/kernel/x86_64/sgemv_t_microk_nehalem-2.c
deleted file mode 100644
index db5a1448b..000000000
--- a/kernel/x86_64/sgemv_t_microk_nehalem-2.c
+++ /dev/null
@@ -1,159 +0,0 @@
-/***************************************************************************
-Copyright (c) 2014, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-#define HAVE_KERNEL_16x4 1
-static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-
-static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
-{
-
- BLASLONG register i = 0;
-
- __asm__ __volatile__
- (
- "xorps %%xmm0 , %%xmm0 \n\t"
- "xorps %%xmm1 , %%xmm1 \n\t"
- "xorps %%xmm2 , %%xmm2 \n\t"
- "xorps %%xmm3 , %%xmm3 \n\t"
- "xorps %%xmm4 , %%xmm4 \n\t"
- "xorps %%xmm5 , %%xmm5 \n\t"
- "xorps %%xmm6 , %%xmm6 \n\t"
- "xorps %%xmm7 , %%xmm7 \n\t"
-
- ".align 16 \n\t"
- ".L01LOOP%=: \n\t"
- "prefetcht0 384(%2,%0,4) \n\t"
- "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x
- "movups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
- "movups (%4,%0,4), %%xmm8 \n\t"
- "movups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x
- "movups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x
-
- "prefetcht0 384(%4,%0,4) \n\t"
-
- "movups 16(%4,%0,4), %%xmm9 \n\t"
- "movups 32(%4,%0,4), %%xmm10 \n\t"
- "movups 48(%4,%0,4), %%xmm11 \n\t"
- "mulps %%xmm12, %%xmm8 \n\t"
- "addps %%xmm8 , %%xmm0 \n\t"
- "mulps %%xmm13, %%xmm9 \n\t"
- "addps %%xmm9 , %%xmm4 \n\t"
- "movups (%5,%0,4), %%xmm8 \n\t"
- "mulps %%xmm14, %%xmm10 \n\t"
- "addps %%xmm10, %%xmm0 \n\t"
- "mulps %%xmm15, %%xmm11 \n\t"
- "addps %%xmm11, %%xmm4 \n\t"
-
- "prefetcht0 384(%5,%0,4) \n\t"
-
- "movups 16(%5,%0,4), %%xmm9 \n\t"
- "movups 32(%5,%0,4), %%xmm10 \n\t"
- "movups 48(%5,%0,4), %%xmm11 \n\t"
- "mulps %%xmm12, %%xmm8 \n\t"
- "addps %%xmm8 , %%xmm1 \n\t"
- "mulps %%xmm13, %%xmm9 \n\t"
- "addps %%xmm9 , %%xmm5 \n\t"
- "movups (%6,%0,4), %%xmm8 \n\t"
- "mulps %%xmm14, %%xmm10 \n\t"
- "addps %%xmm10, %%xmm1 \n\t"
- "mulps %%xmm15, %%xmm11 \n\t"
- "addps %%xmm11, %%xmm5 \n\t"
-
- "prefetcht0 384(%6,%0,4) \n\t"
-
- "movups 16(%6,%0,4), %%xmm9 \n\t"
- "movups 32(%6,%0,4), %%xmm10 \n\t"
- "movups 48(%6,%0,4), %%xmm11 \n\t"
- "mulps %%xmm12, %%xmm8 \n\t"
- "addps %%xmm8 , %%xmm2 \n\t"
- "mulps %%xmm13, %%xmm9 \n\t"
- "addps %%xmm9 , %%xmm6 \n\t"
- "movups (%7,%0,4), %%xmm8 \n\t"
- "mulps %%xmm14, %%xmm10 \n\t"
- "addps %%xmm10, %%xmm2 \n\t"
- "mulps %%xmm15, %%xmm11 \n\t"
- "addps %%xmm11, %%xmm6 \n\t"
-
- "prefetcht0 384(%7,%0,4) \n\t"
-
- "movups 16(%7,%0,4), %%xmm9 \n\t"
- "movups 32(%7,%0,4), %%xmm10 \n\t"
- "movups 48(%7,%0,4), %%xmm11 \n\t"
- "mulps %%xmm12, %%xmm8 \n\t"
- "addps %%xmm8 , %%xmm3 \n\t"
- "mulps %%xmm13, %%xmm9 \n\t"
- "addps %%xmm9 , %%xmm7 \n\t"
- "mulps %%xmm14, %%xmm10 \n\t"
- "addps %%xmm10, %%xmm3 \n\t"
- "mulps %%xmm15, %%xmm11 \n\t"
- "addps %%xmm11, %%xmm7 \n\t"
-
- "addq $16, %0 \n\t"
- "subq $16, %1 \n\t"
- "jnz .L01LOOP%= \n\t"
-
- "addps %%xmm0, %%xmm4 \n\t"
- "addps %%xmm1, %%xmm5 \n\t"
- "addps %%xmm2, %%xmm6 \n\t"
- "addps %%xmm3, %%xmm7 \n\t"
-
- "haddps %%xmm4, %%xmm4 \n\t"
- "haddps %%xmm5, %%xmm5 \n\t"
- "haddps %%xmm6, %%xmm6 \n\t"
- "haddps %%xmm7, %%xmm7 \n\t"
-
- "haddps %%xmm4, %%xmm4 \n\t"
- "haddps %%xmm5, %%xmm5 \n\t"
- "haddps %%xmm6, %%xmm6 \n\t"
- "haddps %%xmm7, %%xmm7 \n\t"
-
- "movss %%xmm4, (%3) \n\t"
- "movss %%xmm5, 4(%3) \n\t"
- "movss %%xmm6, 8(%3) \n\t"
- "movss %%xmm7, 12(%3) \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (n), // 1
- "r" (x), // 2
- "r" (y), // 3
- "r" (ap[0]), // 4
- "r" (ap[1]), // 5
- "r" (ap[2]), // 6
- "r" (ap[3]) // 7
- : "cc",
- "%xmm0", "%xmm1", "%xmm2", "%xmm3",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
- "memory"
- );
-
-}
-
-
diff --git a/kernel/x86_64/sgemv_n_microk_sandy-2.c b/kernel/x86_64/sgemv_t_microk_nehalem-4.c
index b255ddbcb..4a167900e 100644
--- a/kernel/x86_64/sgemv_n_microk_sandy-2.c
+++ b/kernel/x86_64/sgemv_t_microk_nehalem-4.c
@@ -25,55 +25,57 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#define HAVE_KERNEL_16x4 1
-static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+#define HAVE_KERNEL_4x4 1
+static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
+static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
BLASLONG register i = 0;
__asm__ __volatile__
(
- "vzeroupper \n\t"
- "vbroadcastss (%2), %%ymm12 \n\t" // x0
- "vbroadcastss 4(%2), %%ymm13 \n\t" // x1
- "vbroadcastss 8(%2), %%ymm14 \n\t" // x2
- "vbroadcastss 12(%2), %%ymm15 \n\t" // x3
+ "xorps %%xmm4 , %%xmm4 \n\t"
+ "xorps %%xmm5 , %%xmm5 \n\t"
+ "xorps %%xmm6 , %%xmm6 \n\t"
+ "xorps %%xmm7 , %%xmm7 \n\t"
".align 16 \n\t"
".L01LOOP%=: \n\t"
- "vmovups (%3,%0,4), %%ymm4 \n\t" // 8 * y
- "vmovups 32(%3,%0,4), %%ymm5 \n\t" // 8 * y
- "prefetcht0 192(%4,%0,4) \n\t"
- "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
- "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
- "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t"
- "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
- "prefetcht0 192(%5,%0,4) \n\t"
- "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
- "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
- "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t"
- "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
- "prefetcht0 192(%6,%0,4) \n\t"
- "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t"
- "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
- "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t"
- "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
- "prefetcht0 192(%7,%0,4) \n\t"
- "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t"
- "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
- "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t"
- "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
+ "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x
+ "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a0
+ "movups (%5,%0,4), %%xmm9 \n\t" // 4 * a1
+ "movups (%6,%0,4), %%xmm10 \n\t" // 4 * a2
+ "movups (%7,%0,4), %%xmm11 \n\t" // 4 * a3
- "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y
- "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y
+ "mulps %%xmm12, %%xmm8 \n\t"
+ "mulps %%xmm12, %%xmm9 \n\t"
+ "mulps %%xmm12, %%xmm10 \n\t"
+ "mulps %%xmm12, %%xmm11 \n\t"
+ "addps %%xmm8 , %%xmm4 \n\t"
+ "addq $4 , %0 \n\t"
+ "addps %%xmm9 , %%xmm5 \n\t"
+ "subq $4 , %1 \n\t"
+ "addps %%xmm10, %%xmm6 \n\t"
+ "addps %%xmm11, %%xmm7 \n\t"
- "addq $16, %0 \n\t"
- "subq $16, %1 \n\t"
- "jnz .L01LOOP%= \n\t"
- "vzeroupper \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ "haddps %%xmm4, %%xmm4 \n\t"
+ "haddps %%xmm5, %%xmm5 \n\t"
+ "haddps %%xmm6, %%xmm6 \n\t"
+ "haddps %%xmm7, %%xmm7 \n\t"
+
+ "haddps %%xmm4, %%xmm4 \n\t"
+ "haddps %%xmm5, %%xmm5 \n\t"
+ "haddps %%xmm6, %%xmm6 \n\t"
+ "haddps %%xmm7, %%xmm7 \n\t"
+
+ "movss %%xmm4, (%3) \n\t"
+ "movss %%xmm5, 4(%3) \n\t"
+ "movss %%xmm6, 8(%3) \n\t"
+ "movss %%xmm7, 12(%3) \n\t"
:
:
@@ -86,9 +88,9 @@ static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"r" (ap[2]), // 6
"r" (ap[3]) // 7
: "cc",
- "%xmm4", "%xmm5",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "%xmm12",
"memory"
);
diff --git a/kernel/x86_64/sgemv_t_microk_sandy-2.c b/kernel/x86_64/sgemv_t_microk_sandy-4.c
index 841522302..6550518f7 100644
--- a/kernel/x86_64/sgemv_t_microk_sandy-2.c
+++ b/kernel/x86_64/sgemv_t_microk_sandy-4.c
@@ -25,10 +25,10 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#define HAVE_KERNEL_16x4 1
-static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+#define HAVE_KERNEL_4x4 1
+static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
+static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
BLASLONG register i = 0;
@@ -45,6 +45,46 @@ static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"vxorps %%ymm6 , %%ymm6, %%ymm6 \n\t"
"vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t"
+ "testq $0x04, %1 \n\t"
+ "jz .L08LABEL%= \n\t"
+
+ "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
+
+ "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t"
+ "vmulps (%5,%0,4), %%xmm12, %%xmm10 \n\t"
+ "vmulps (%6,%0,4), %%xmm12, %%xmm9 \n\t"
+ "vmulps (%7,%0,4), %%xmm12, %%xmm11 \n\t"
+ "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
+ "addq $4 , %0 \n\t"
+ "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
+ "vaddps %%xmm6, %%xmm9 , %%xmm6 \n\t"
+ "subq $4 , %1 \n\t"
+ "vaddps %%xmm7, %%xmm11, %%xmm7 \n\t"
+
+ ".L08LABEL%=: \n\t"
+
+ "testq $0x08, %1 \n\t"
+ "jz .L16LABEL%= \n\t"
+
+ "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x
+
+ "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
+ "vmulps (%5,%0,4), %%ymm12, %%ymm10 \n\t"
+ "vmulps (%6,%0,4), %%ymm12, %%ymm9 \n\t"
+ "vmulps (%7,%0,4), %%ymm12, %%ymm11 \n\t"
+ "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
+ "addq $8 , %0 \n\t"
+ "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
+ "vaddps %%ymm6, %%ymm9 , %%ymm6 \n\t"
+ "subq $8 , %1 \n\t"
+ "vaddps %%ymm7, %%ymm11, %%ymm7 \n\t"
+
+ ".L16LABEL%=: \n\t"
+
+ "cmpq $0, %1 \n\t"
+ "je .L16END%= \n\t"
+
+
".align 16 \n\t"
".L01LOOP%=: \n\t"
"prefetcht0 384(%2,%0,4) \n\t"
@@ -53,29 +93,31 @@ static void sgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"prefetcht0 384(%4,%0,4) \n\t"
"vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
- "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vmulps 32(%4,%0,4), %%ymm13, %%ymm9 \n\t"
- "vaddps %%ymm0, %%ymm9 , %%ymm0 \n\t"
"prefetcht0 384(%5,%0,4) \n\t"
"vmulps (%5,%0,4), %%ymm12, %%ymm10 \n\t"
- "vaddps %%ymm1, %%ymm10, %%ymm1 \n\t"
"vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t"
+ "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
+ "vaddps %%ymm0, %%ymm9 , %%ymm0 \n\t"
+ "vaddps %%ymm1, %%ymm10, %%ymm1 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
- "prefetcht0 384(%6,%0,4) \n\t"
+ "prefetcht0 384(%6,%0,4) \n\t"
"vmulps (%6,%0,4), %%ymm12, %%ymm8 \n\t"
- "vaddps %%ymm6, %%ymm8 , %%ymm6 \n\t"
"vmulps 32(%6,%0,4), %%ymm13, %%ymm9 \n\t"
- "vaddps %%ymm2, %%ymm9 , %%ymm2 \n\t"
- "prefetcht0 384(%7,%0,4) \n\t"
+ "prefetcht0 384(%7,%0,4) \n\t"
"vmulps (%7,%0,4), %%ymm12, %%ymm10 \n\t"
- "vaddps %%ymm7, %%ymm10, %%ymm7 \n\t"
"vmulps 32(%7,%0,4), %%ymm13, %%ymm11 \n\t"
- "vaddps %%ymm3, %%ymm11, %%ymm3 \n\t"
-
+ "vaddps %%ymm6, %%ymm8 , %%ymm6 \n\t"
"addq $16, %0 \n\t"
+ "vaddps %%ymm2, %%ymm9 , %%ymm2 \n\t"
+ "vaddps %%ymm7, %%ymm10, %%ymm7 \n\t"
"subq $16, %1 \n\t"
+ "vaddps %%ymm3, %%ymm11, %%ymm3 \n\t"
+
"jnz .L01LOOP%= \n\t"
+ ".L16END%=: \n\t"
+
"vaddps %%ymm4, %%ymm0, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm1, %%ymm5 \n\t"
"vaddps %%ymm6, %%ymm2, %%ymm6 \n\t"
diff --git a/kernel/x86_64/sgemv_t_microk_sandy.c b/kernel/x86_64/sgemv_t_microk_sandy.c
deleted file mode 100644
index 4ecd6d3d0..000000000
--- a/kernel/x86_64/sgemv_t_microk_sandy.c
+++ /dev/null
@@ -1,106 +0,0 @@
-/***************************************************************************
-Copyright (c) 2014, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y)
-{
-
- //n = n / 16;
-
- __asm__ __volatile__
- (
- "movq %0, %%rax\n\t" // n -> rax
- "vmovss %1, %%xmm1\n\t" // alpha -> xmm1
- "movq %2, %%rsi\n\t" // adress of a -> rsi
- "movq %3, %%rcx\n\t" // value of lda > rcx
- "movq %4, %%rdi\n\t" // adress of x -> rdi
- "movq %5, %%rdx\n\t" // adress of y -> rdx
-
- "leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float
- "leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line
-
- "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero
- "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero
- "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero
- "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero
-
- "sarq $4, %%rax \n\t" // n = n / 16
-
- ".align 16 \n\t"
- ".L01LOOP%=: \n\t"
- // "prefetcht0 512(%%rsi) \n\t"
- "prefetcht0 (%%r8) \n\t" //prefetch next line of a
- "vmovups (%%rsi), %%xmm4 \n\t"
- "vmovups 4*4(%%rsi), %%xmm5 \n\t"
- "vmovups 8*4(%%rsi), %%xmm6 \n\t"
- "vmovups 12*4(%%rsi), %%xmm7 \n\t"
-
- "vmulps 0*4(%%rdi), %%xmm4, %%xmm8 \n\t" // multiply a and c and add to temp
- "vmulps 4*4(%%rdi), %%xmm5, %%xmm9 \n\t" // multiply a and c and add to temp
- "vmulps 8*4(%%rdi), %%xmm6, %%xmm10\n\t" // multiply a and c and add to temp
- "vmulps 12*4(%%rdi), %%xmm7, %%xmm11\n\t" // multiply a and c and add to temp
-
- "vaddps %%xmm12, %%xmm8 , %%xmm12\n\t"
- "vaddps %%xmm13, %%xmm9 , %%xmm13\n\t"
- "vaddps %%xmm14, %%xmm10, %%xmm14\n\t"
- "vaddps %%xmm15, %%xmm11, %%xmm15\n\t"
-
- "addq $16*4 , %%r8 \n\t" // increment prefetch pointer
- "addq $16*4 , %%rsi \n\t" // increment pointer of a
- "addq $16*4 , %%rdi \n\t" // increment pointer of c
- "dec %%rax \n\t" // n = n -1
- "jnz .L01LOOP%= \n\t"
-
- "vaddps %%xmm12, %%xmm14, %%xmm12\n\t"
- "vaddps %%xmm13, %%xmm15, %%xmm13\n\t"
- "vaddps %%xmm12, %%xmm13, %%xmm12\n\t"
- "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t"
- "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t"
-
- "vmulss %%xmm12, %%xmm1, %%xmm12 \n\t"
- "vaddss (%%rdx), %%xmm12, %%xmm12\n\t"
- "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y
-
- :
- :
- "m" (n), // 0
- "m" (alpha), // 1
- "m" (a), // 2
- "m" (lda), // 3
- "m" (x), // 4
- "m" (y) // 5
- : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc",
- "%xmm0", "%xmm1",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
- "memory"
- );
-
-}
-
-
-
diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c
new file mode 100644
index 000000000..096adc6ca
--- /dev/null
+++ b/kernel/x86_64/ssymv_L.c
@@ -0,0 +1,299 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+#if defined(BULLDOZER)
+#include "ssymv_L_microk_bulldozer-2.c"
+#elif defined(NEHALEM)
+#include "ssymv_L_microk_nehalem-2.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_4x4
+
+static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *tmp1, FLOAT *temp2)
+{
+ FLOAT tmp2[4] = { 0.0, 0.0, 0.0, 0.0 };
+ BLASLONG i;
+
+ for (i=from; i<to; i+=4)
+ {
+
+ y[i] += tmp1[0] * ap[0][i];
+ tmp2[0] += ap[0][i] * x[i];
+ y[i] += tmp1[1] * ap[1][i];
+ tmp2[1] += ap[1][i] * x[i];
+ y[i] += tmp1[2] * ap[2][i];
+ tmp2[2] += ap[2][i] * x[i];
+ y[i] += tmp1[3] * ap[3][i];
+ tmp2[3] += ap[3][i] * x[i];
+
+ y[i+1] += tmp1[0] * ap[0][i+1];
+ tmp2[0] += ap[0][i+1] * x[i+1];
+ y[i+1] += tmp1[1] * ap[1][i+1];
+ tmp2[1] += ap[1][i+1] * x[i+1];
+ y[i+1] += tmp1[2] * ap[2][i+1];
+ tmp2[2] += ap[2][i+1] * x[i+1];
+ y[i+1] += tmp1[3] * ap[3][i+1];
+ tmp2[3] += ap[3][i+1] * x[i+1];
+
+ y[i+2] += tmp1[0] * ap[0][i+2];
+ tmp2[0] += ap[0][i+2] * x[i+2];
+ y[i+2] += tmp1[1] * ap[1][i+2];
+ tmp2[1] += ap[1][i+2] * x[i+2];
+ y[i+2] += tmp1[2] * ap[2][i+2];
+ tmp2[2] += ap[2][i+2] * x[i+2];
+ y[i+2] += tmp1[3] * ap[3][i+2];
+ tmp2[3] += ap[3][i+2] * x[i+2];
+
+ y[i+3] += tmp1[0] * ap[0][i+3];
+ tmp2[0] += ap[0][i+3] * x[i+3];
+ y[i+3] += tmp1[1] * ap[1][i+3];
+ tmp2[1] += ap[1][i+3] * x[i+3];
+ y[i+3] += tmp1[2] * ap[2][i+3];
+ tmp2[2] += ap[2][i+3] * x[i+3];
+ y[i+3] += tmp1[3] * ap[3][i+3];
+ tmp2[3] += ap[3][i+3] * x[i+3];
+
+ }
+
+ temp2[0] += tmp2[0];
+ temp2[1] += tmp2[1];
+ temp2[2] += tmp2[2];
+ temp2[3] += tmp2[3];
+}
+
+#endif
+
+
+
+
+int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+ BLASLONG i;
+ BLASLONG ix,iy;
+ BLASLONG jx,jy;
+ BLASLONG j;
+ FLOAT temp1;
+ FLOAT temp2;
+ FLOAT tmp1[4];
+ FLOAT tmp2[4];
+ FLOAT *ap[4];
+
+#if 0
+ if ( m != offset )
+ printf("Symv_L: m=%d offset=%d\n",m,offset);
+#endif
+
+
+ if ( (inc_x != 1) || (inc_y != 1) )
+ {
+
+ jx = 0;
+ jy = 0;
+
+ for (j=0; j<offset; j++)
+ {
+ temp1 = alpha * x[jx];
+ temp2 = 0.0;
+ y[jy] += temp1 * a[j*lda+j];
+ iy = jy;
+ ix = jx;
+ for (i=j+1; i<m; i++)
+ {
+ ix += inc_x;
+ iy += inc_y;
+ y[iy] += temp1 * a[j*lda+i];
+ temp2 += a[j*lda+i] * x[ix];
+
+ }
+ y[jy] += alpha * temp2;
+ jx += inc_x;
+ jy += inc_y;
+ }
+ return(0);
+ }
+
+ BLASLONG offset1 = (offset/4)*4;
+
+ for (j=0; j<offset1; j+=4)
+ {
+ tmp1[0] = alpha * x[j];
+ tmp1[1] = alpha * x[j+1];
+ tmp1[2] = alpha * x[j+2];
+ tmp1[3] = alpha * x[j+3];
+ tmp2[0] = 0.0;
+ tmp2[1] = 0.0;
+ tmp2[2] = 0.0;
+ tmp2[3] = 0.0;
+ ap[0] = &a[j*lda];
+ ap[1] = ap[0] + lda;
+ ap[2] = ap[1] + lda;
+ ap[3] = ap[2] + lda;
+ y[j] += tmp1[0] * ap[0][j];
+ y[j+1] += tmp1[1] * ap[1][j+1];
+ y[j+2] += tmp1[2] * ap[2][j+2];
+ y[j+3] += tmp1[3] * ap[3][j+3];
+ BLASLONG from = j+1;
+ if ( m - from >=12 )
+ {
+ BLASLONG m2 = (m/4)*4;
+ for (i=j+1; i<j+4; i++)
+ {
+ y[i] += tmp1[0] * ap[0][i];
+ tmp2[0] += ap[0][i] * x[i];
+ }
+
+ for (i=j+2; i<j+4; i++)
+ {
+ y[i] += tmp1[1] * ap[1][i];
+ tmp2[1] += ap[1][i] * x[i];
+ }
+
+ for (i=j+3; i<j+4; i++)
+ {
+ y[i] += tmp1[2] * ap[2][i];
+ tmp2[2] += ap[2][i] * x[i];
+ }
+
+ if ( m2 > j+4 )
+ ssymv_kernel_4x4(j+4,m2,ap,x,y,tmp1,tmp2);
+
+
+ for (i=m2; i<m; i++)
+ {
+ y[i] += tmp1[0] * ap[0][i];
+ tmp2[0] += ap[0][i] * x[i];
+
+ y[i] += tmp1[1] * ap[1][i];
+ tmp2[1] += ap[1][i] * x[i];
+
+ y[i] += tmp1[2] * ap[2][i];
+ tmp2[2] += ap[2][i] * x[i];
+
+ y[i] += tmp1[3] * ap[3][i];
+ tmp2[3] += ap[3][i] * x[i];
+
+ }
+
+
+ }
+ else
+ {
+
+ for (i=j+1; i<j+4; i++)
+ {
+ y[i] += tmp1[0] * ap[0][i];
+ tmp2[0] += ap[0][i] * x[i];
+ }
+
+ for (i=j+2; i<j+4; i++)
+ {
+ y[i] += tmp1[1] * ap[1][i];
+ tmp2[1] += ap[1][i] * x[i];
+ }
+
+ for (i=j+3; i<j+4; i++)
+ {
+ y[i] += tmp1[2] * ap[2][i];
+ tmp2[2] += ap[2][i] * x[i];
+ }
+
+ for (i=j+4; i<m; i++)
+ {
+ y[i] += tmp1[0] * ap[0][i];
+ tmp2[0] += ap[0][i] * x[i];
+
+ y[i] += tmp1[1] * ap[1][i];
+ tmp2[1] += ap[1][i] * x[i];
+
+ y[i] += tmp1[2] * ap[2][i];
+ tmp2[2] += ap[2][i] * x[i];
+
+ y[i] += tmp1[3] * ap[3][i];
+ tmp2[3] += ap[3][i] * x[i];
+
+ }
+
+ }
+ y[j] += alpha * tmp2[0];
+ y[j+1] += alpha * tmp2[1];
+ y[j+2] += alpha * tmp2[2];
+ y[j+3] += alpha * tmp2[3];
+ }
+
+
+ for (j=offset1; j<offset; j++)
+ {
+ temp1 = alpha * x[j];
+ temp2 = 0.0;
+ y[j] += temp1 * a[j*lda+j];
+ BLASLONG from = j+1;
+ if ( m - from >=8 )
+ {
+ BLASLONG j1 = ((from + 4)/4)*4;
+ BLASLONG j2 = (m/4)*4;
+ for (i=from; i<j1; i++)
+ {
+ y[i] += temp1 * a[j*lda+i];
+ temp2 += a[j*lda+i] * x[i];
+
+ }
+
+ for (i=j1; i<j2; i++)
+ {
+ y[i] += temp1 * a[j*lda+i];
+ temp2 += a[j*lda+i] * x[i];
+
+ }
+
+ for (i=j2; i<m; i++)
+ {
+ y[i] += temp1 * a[j*lda+i];
+ temp2 += a[j*lda+i] * x[i];
+
+ }
+
+ }
+ else
+ {
+ for (i=from; i<m; i++)
+ {
+ y[i] += temp1 * a[j*lda+i];
+ temp2 += a[j*lda+i] * x[i];
+
+ }
+
+ }
+ y[j] += alpha * temp2;
+ }
+ return(0);
+}
+
+
diff --git a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c
new file mode 100644
index 000000000..c9206c1be
--- /dev/null
+++ b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c
@@ -0,0 +1,122 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_4x4 1
+static void ssymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));
+
+static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
+{
+
+
+ __asm__ __volatile__
+ (
+ "vxorps %%xmm0 , %%xmm0 , %%xmm0 \n\t" // temp2[0]
+ "vxorps %%xmm1 , %%xmm1 , %%xmm1 \n\t" // temp2[1]
+ "vxorps %%xmm2 , %%xmm2 , %%xmm2 \n\t" // temp2[2]
+ "vxorps %%xmm3 , %%xmm3 , %%xmm3 \n\t" // temp2[3]
+ "vbroadcastss (%8), %%xmm4 \n\t" // temp1[0]
+ "vbroadcastss 4(%8), %%xmm5 \n\t" // temp1[1]
+ "vbroadcastss 8(%8), %%xmm6 \n\t" // temp1[2]
+ "vbroadcastss 12(%8), %%xmm7 \n\t" // temp1[3]
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+
+ "vmovups (%4,%0,4), %%xmm12 \n\t" // 2 * a
+ "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x
+ "vmovups (%3,%0,4), %%xmm9 \n\t" // 2 * y
+
+ "vmovups (%5,%0,4), %%xmm13 \n\t" // 2 * a
+
+ "vfmaddps %%xmm0 , %%xmm8, %%xmm12 , %%xmm0 \n\t" // temp2 += x * a
+ "vfmaddps %%xmm9 , %%xmm4, %%xmm12 , %%xmm9 \n\t" // y += temp1 * a
+ "vmovups (%6,%0,4), %%xmm14 \n\t" // 2 * a
+
+ "vfmaddps %%xmm1 , %%xmm8, %%xmm13 , %%xmm1 \n\t" // temp2 += x * a
+ "vfmaddps %%xmm9 , %%xmm5, %%xmm13 , %%xmm9 \n\t" // y += temp1 * a
+ "vmovups (%7,%0,4), %%xmm15 \n\t" // 2 * a
+
+ "vfmaddps %%xmm2 , %%xmm8, %%xmm14 , %%xmm2 \n\t" // temp2 += x * a
+ "vfmaddps %%xmm9 , %%xmm6, %%xmm14 , %%xmm9 \n\t" // y += temp1 * a
+
+ "vfmaddps %%xmm3 , %%xmm8, %%xmm15 , %%xmm3 \n\t" // temp2 += x * a
+ "vfmaddps %%xmm9 , %%xmm7, %%xmm15 , %%xmm9 \n\t" // y += temp1 * a
+
+ "addq $4 , %0 \n\t"
+
+ "vmovups %%xmm9 , -16(%3,%0,4) \n\t"
+
+ "cmpq %0 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ "vmovss (%9), %%xmm4 \n\t"
+ "vmovss 4(%9), %%xmm5 \n\t"
+ "vmovss 8(%9), %%xmm6 \n\t"
+ "vmovss 12(%9), %%xmm7 \n\t"
+
+ "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t"
+ "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t"
+ "vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t"
+ "vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t"
+ "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t"
+ "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t"
+ "vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t"
+ "vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t"
+
+ "vaddss %%xmm4, %%xmm0, %%xmm0 \n\t"
+ "vaddss %%xmm5, %%xmm1, %%xmm1 \n\t"
+ "vaddss %%xmm6, %%xmm2, %%xmm2 \n\t"
+ "vaddss %%xmm7, %%xmm3, %%xmm3 \n\t"
+
+ "vmovss %%xmm0 , (%9) \n\t" // save temp2
+ "vmovss %%xmm1 , 4(%9) \n\t" // save temp2
+ "vmovss %%xmm2 , 8(%9) \n\t" // save temp2
+ "vmovss %%xmm3 ,12(%9) \n\t" // save temp2
+
+ :
+ :
+ "r" (from), // 0
+ "r" (to), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (a[0]), // 4
+ "r" (a[1]), // 5
+ "r" (a[2]), // 6
+ "r" (a[3]), // 8
+ "r" (temp1), // 8
+ "r" (temp2) // 9
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
diff --git a/kernel/x86_64/ssymv_L_microk_nehalem-2.c b/kernel/x86_64/ssymv_L_microk_nehalem-2.c
new file mode 100644
index 000000000..a1c62caf6
--- /dev/null
+++ b/kernel/x86_64/ssymv_L_microk_nehalem-2.c
@@ -0,0 +1,137 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_4x4 1
+static void ssymv_kernel_4x4( BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));
+
+static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
+{
+
+
+ __asm__ __volatile__
+ (
+ "xorps %%xmm0 , %%xmm0 \n\t" // temp2[0]
+ "xorps %%xmm1 , %%xmm1 \n\t" // temp2[1]
+ "xorps %%xmm2 , %%xmm2 \n\t" // temp2[2]
+ "xorps %%xmm3 , %%xmm3 \n\t" // temp2[3]
+ "movss (%8), %%xmm4 \n\t" // temp1[0]
+ "movss 4(%8), %%xmm5 \n\t" // temp1[1]
+ "movss 8(%8), %%xmm6 \n\t" // temp1[2]
+ "movss 12(%8), %%xmm7 \n\t" // temp1[3]
+ "shufps $0, %%xmm4, %%xmm4 \n\t"
+ "shufps $0, %%xmm5, %%xmm5 \n\t"
+ "shufps $0, %%xmm6, %%xmm6 \n\t"
+ "shufps $0, %%xmm7, %%xmm7 \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ "movups (%2,%0,4), %%xmm8 \n\t" // 4 * x
+ "movups (%3,%0,4), %%xmm9 \n\t" // 4 * y
+
+ "movups (%4,%0,4), %%xmm12 \n\t" // 4 * a
+ "movups (%5,%0,4), %%xmm13 \n\t" // 4 * a
+
+ "movups %%xmm12 , %%xmm11 \n\t"
+ "mulps %%xmm4 , %%xmm11 \n\t" // temp1 * a
+ "addps %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
+ "mulps %%xmm8 , %%xmm12 \n\t" // a * x
+ "addps %%xmm12 , %%xmm0 \n\t" // temp2 += x * a
+
+ "movups (%6,%0,4), %%xmm14 \n\t" // 4 * a
+ "movups (%7,%0,4), %%xmm15 \n\t" // 4 * a
+
+ "movups %%xmm13 , %%xmm11 \n\t"
+ "mulps %%xmm5 , %%xmm11 \n\t" // temp1 * a
+ "addps %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
+ "mulps %%xmm8 , %%xmm13 \n\t" // a * x
+ "addps %%xmm13 , %%xmm1 \n\t" // temp2 += x * a
+
+ "movups %%xmm14 , %%xmm11 \n\t"
+ "mulps %%xmm6 , %%xmm11 \n\t" // temp1 * a
+ "addps %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
+ "mulps %%xmm8 , %%xmm14 \n\t" // a * x
+ "addps %%xmm14 , %%xmm2 \n\t" // temp2 += x * a
+
+ "movups %%xmm15 , %%xmm11 \n\t"
+ "mulps %%xmm7 , %%xmm11 \n\t" // temp1 * a
+ "addps %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
+ "mulps %%xmm8 , %%xmm15 \n\t" // a * x
+ "addps %%xmm15 , %%xmm3 \n\t" // temp2 += x * a
+
+ "movups %%xmm9, (%3,%0,4) \n\t" // 4 * y
+
+ "addq $4 , %0 \n\t"
+ "cmpq %0 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ "movss (%9), %%xmm4 \n\t" // temp1[0]
+ "movss 4(%9), %%xmm5 \n\t" // temp1[1]
+ "movss 8(%9), %%xmm6 \n\t" // temp1[2]
+ "movss 12(%9), %%xmm7 \n\t" // temp1[3]
+
+ "haddps %%xmm0, %%xmm0 \n\t"
+ "haddps %%xmm1, %%xmm1 \n\t"
+ "haddps %%xmm2, %%xmm2 \n\t"
+ "haddps %%xmm3, %%xmm3 \n\t"
+ "haddps %%xmm0, %%xmm0 \n\t"
+ "haddps %%xmm1, %%xmm1 \n\t"
+ "haddps %%xmm2, %%xmm2 \n\t"
+ "haddps %%xmm3, %%xmm3 \n\t"
+
+ "addss %%xmm4, %%xmm0 \n\t"
+ "addss %%xmm5, %%xmm1 \n\t"
+ "addss %%xmm6, %%xmm2 \n\t"
+ "addss %%xmm7, %%xmm3 \n\t"
+
+ "movss %%xmm0 , (%9) \n\t" // save temp2
+ "movss %%xmm1 , 4(%9) \n\t" // save temp2
+ "movss %%xmm2 , 8(%9) \n\t" // save temp2
+ "movss %%xmm3 , 12(%9) \n\t" // save temp2
+
+ :
+ :
+ "r" (from), // 0
+ "r" (to), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (a[0]), // 4
+ "r" (a[1]), // 5
+ "r" (a[2]), // 6
+ "r" (a[3]), // 7
+ "r" (temp1), // 8
+ "r" (temp2) // 9
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c
new file mode 100644
index 000000000..61127aa3d
--- /dev/null
+++ b/kernel/x86_64/ssymv_U.c
@@ -0,0 +1,273 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+
+#if defined(BULLDOZER)
+#include "ssymv_U_microk_bulldozer-2.c"
+#elif defined(NEHALEM)
+#include "ssymv_U_microk_nehalem-2.c"
+#endif
+
+#ifndef HAVE_KERNEL_4x4
+
+static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2)
+{
+ FLOAT at0,at1,at2,at3;
+ FLOAT x;
+ FLOAT tmp2[4] = { 0.0, 0.0, 0.0, 0.0 };
+ FLOAT tp0;
+ FLOAT tp1;
+ FLOAT tp2;
+ FLOAT tp3;
+ BLASLONG i;
+
+ tp0 = temp1[0];
+ tp1 = temp1[1];
+ tp2 = temp1[2];
+ tp3 = temp1[3];
+
+ for (i=0; i<n; i++)
+ {
+ at0 = a0[i];
+ at1 = a1[i];
+ at2 = a2[i];
+ at3 = a3[i];
+ x = xp[i];
+ yp[i] += tp0 * at0 + tp1 *at1 + tp2 * at2 + tp3 * at3;
+ tmp2[0] += at0 * x;
+ tmp2[1] += at1 * x;
+ tmp2[2] += at2 * x;
+ tmp2[3] += at3 * x;
+
+ }
+
+ temp2[0] += tmp2[0];
+ temp2[1] += tmp2[1];
+ temp2[2] += tmp2[2];
+ temp2[3] += tmp2[3];
+}
+
+#endif
+
+
+#ifndef HAVE_KERNEL_1x4
+
+static void ssymv_kernel_1x4(BLASLONG from, BLASLONG to, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2)
+{
+ FLOAT at0,at1,at2,at3;
+ FLOAT x;
+ FLOAT tmp2[4] = { 0.0, 0.0, 0.0, 0.0 };
+ FLOAT tp0;
+ FLOAT tp1;
+ FLOAT tp2;
+ FLOAT tp3;
+ BLASLONG i;
+
+ tp0 = temp1[0];
+ tp1 = temp1[1];
+ tp2 = temp1[2];
+ tp3 = temp1[3];
+
+ for (i=from; i<to; i++)
+ {
+ at0 = a0[i];
+ at1 = a1[i];
+ at2 = a2[i];
+ at3 = a3[i];
+ x = xp[i];
+ yp[i] += tp0 * at0 + tp1 *at1 + tp2 * at2 + tp3 * at3;
+ tmp2[0] += at0 * x;
+ tmp2[1] += at1 * x;
+ tmp2[2] += at2 * x;
+ tmp2[3] += at3 * x;
+
+ }
+
+ temp2[0] += tmp2[0];
+ temp2[1] += tmp2[1];
+ temp2[2] += tmp2[2];
+ temp2[3] += tmp2[3];
+}
+
+#endif
+
+
+static void ssymv_kernel_8x1(BLASLONG n, FLOAT *a0, FLOAT *xp, FLOAT *yp, FLOAT *temp1, FLOAT *temp2)
+{
+ FLOAT at0,at1,at2,at3;
+ FLOAT temp = 0.0;
+ FLOAT t1 = *temp1;
+ BLASLONG i;
+
+ for (i=0; i<(n/4)*4; i+=4)
+ {
+ at0 = a0[i];
+ at1 = a0[i+1];
+ at2 = a0[i+2];
+ at3 = a0[i+3];
+
+ yp[i] += t1 * at0;
+ temp += at0 * xp[i];
+ yp[i+1] += t1 * at1;
+ temp += at1 * xp[i+1];
+
+ yp[i+2] += t1 * at2;
+ temp += at2 * xp[i+2];
+ yp[i+3] += t1 * at3;
+ temp += at3 * xp[i+3];
+
+ }
+ *temp2 = temp;
+}
+
+int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+ BLASLONG i;
+ BLASLONG ix,iy;
+ BLASLONG jx,jy;
+ BLASLONG j;
+ BLASLONG j1;
+ BLASLONG j2;
+ BLASLONG m2;
+ FLOAT temp1;
+ FLOAT temp2;
+ FLOAT *xp, *yp;
+ FLOAT *a0,*a1,*a2,*a3;
+ FLOAT at0,at1,at2,at3;
+ FLOAT tmp1[4];
+ FLOAT tmp2[4];
+
+#if 0
+ if( m != offset )
+ printf("Symv_U: m=%d offset=%d\n",m,offset);
+#endif
+
+ BLASLONG m1 = m - offset;
+ BLASLONG mrange = m -m1;
+
+ if ( (inc_x!=1) || (inc_y!=1) || (mrange<16) )
+ {
+
+ jx = m1 * inc_x;
+ jy = m1 * inc_y;
+
+ for (j=m1; j<m; j++)
+ {
+ temp1 = alpha * x[jx];
+ temp2 = 0.0;
+ iy = 0;
+ ix = 0;
+ for (i=0; i<j; i++)
+ {
+ y[iy] += temp1 * a[j*lda+i];
+ temp2 += a[j*lda+i] * x[ix];
+ ix += inc_x;
+ iy += inc_y;
+
+ }
+ y[jy] += temp1 * a[j*lda+j] + alpha * temp2;
+ jx += inc_x;
+ jy += inc_y;
+ }
+ return(0);
+ }
+
+ xp = x;
+ yp = y;
+
+ m2 = m - ( mrange % 4 );
+
+ for (j=m1; j<m2; j+=4)
+ {
+ tmp1[0] = alpha * xp[j];
+ tmp1[1] = alpha * xp[j+1];
+ tmp1[2] = alpha * xp[j+2];
+ tmp1[3] = alpha * xp[j+3];
+ tmp2[0] = 0.0;
+ tmp2[1] = 0.0;
+ tmp2[2] = 0.0;
+ tmp2[3] = 0.0;
+ a0 = &a[j*lda];
+ a1 = a0+lda;
+ a2 = a1+lda;
+ a3 = a2+lda;
+ j1 = (j/8)*8;
+ if ( j1 )
+ ssymv_kernel_4x4(j1, a0, a1, a2, a3, xp, yp, tmp1, tmp2);
+ if ( j1 < j )
+ ssymv_kernel_1x4(j1, j, a0, a1, a2, a3, xp, yp, tmp1, tmp2);
+
+ j2 = 0;
+ for ( j1 = j ; j1 < j+4 ; j1++ )
+ {
+ temp1 = tmp1[j2];
+ temp2 = tmp2[j2];
+ a0 = &a[j1*lda];
+ for ( i=j ; i<j1; i++ )
+ {
+ yp[i] += temp1 * a0[i];
+ temp2 += a0[i] * xp[i];
+
+ }
+ y[j1] += temp1 * a0[j1] + alpha * temp2;
+ j2++;
+
+ }
+
+ }
+
+ for ( ; j<m; j++)
+ {
+ temp1 = alpha * xp[j];
+ temp2 = 0.0;
+ a0 = &a[j*lda];
+ FLOAT at0;
+ j1 = (j/8)*8;
+
+ if ( j1 )
+ ssymv_kernel_8x1(j1, a0, xp, yp, &temp1, &temp2);
+
+ for (i=j1 ; i<j; i++)
+ {
+ at0 = a0[i];
+ yp[i] += temp1 * at0;
+ temp2 += at0 * xp[i];
+
+ }
+
+ yp[j] += temp1 * a0[j] + alpha * temp2;
+ }
+
+ return(0);
+
+
+}
+
+
diff --git a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c
new file mode 100644
index 000000000..b8b3b73e9
--- /dev/null
+++ b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c
@@ -0,0 +1,114 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_4x4 1
+static void ssymv_kernel_4x4( BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));
+
+static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vxorps %%xmm0 , %%xmm0 , %%xmm0 \n\t" // temp2[0]
+ "vxorps %%xmm1 , %%xmm1 , %%xmm1 \n\t" // temp2[1]
+ "vxorps %%xmm2 , %%xmm2 , %%xmm2 \n\t" // temp2[2]
+ "vxorps %%xmm3 , %%xmm3 , %%xmm3 \n\t" // temp2[3]
+ "vbroadcastss (%8), %%xmm4 \n\t" // temp1[0]
+ "vbroadcastss 4(%8), %%xmm5 \n\t" // temp1[1]
+ "vbroadcastss 8(%8), %%xmm6 \n\t" // temp1[1]
+ "vbroadcastss 12(%8), %%xmm7 \n\t" // temp1[1]
+
+ "xorq %0,%0 \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+
+ "vmovups (%2,%0,4), %%xmm8 \n\t" // 4 * x
+ "vmovups (%3,%0,4), %%xmm9 \n\t" // 4 * y
+
+ "vmovups (%4,%0,4), %%xmm12 \n\t" // 4 * a
+ "vmovups (%5,%0,4), %%xmm13 \n\t" // 4 * a
+
+ "vfmaddps %%xmm0 , %%xmm8, %%xmm12 , %%xmm0 \n\t" // temp2 += x * a
+ "vfmaddps %%xmm9 , %%xmm4, %%xmm12 , %%xmm9 \n\t" // y += temp1 * a
+
+ "vfmaddps %%xmm1 , %%xmm8, %%xmm13 , %%xmm1 \n\t" // temp2 += x * a
+ "vmovups (%6,%0,4), %%xmm14 \n\t" // 4 * a
+ "vfmaddps %%xmm9 , %%xmm5, %%xmm13 , %%xmm9 \n\t" // y += temp1 * a
+
+ "vfmaddps %%xmm2 , %%xmm8, %%xmm14 , %%xmm2 \n\t" // temp2 += x * a
+ "vmovups (%7,%0,4), %%xmm15 \n\t" // 4 * a
+ "vfmaddps %%xmm9 , %%xmm6, %%xmm14 , %%xmm9 \n\t" // y += temp1 * a
+
+ "vfmaddps %%xmm3 , %%xmm8, %%xmm15 , %%xmm3 \n\t" // temp2 += x * a
+ "vfmaddps %%xmm9 , %%xmm7, %%xmm15 , %%xmm9 \n\t" // y += temp1 * a
+
+ "vmovups %%xmm9 , (%3,%0,4) \n\t"
+
+ "addq $4 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t"
+ "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t"
+ "vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t"
+ "vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t"
+ "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t"
+ "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t"
+ "vhaddps %%xmm2, %%xmm2, %%xmm2 \n\t"
+ "vhaddps %%xmm3, %%xmm3, %%xmm3 \n\t"
+
+ "vmovss %%xmm0 , (%9) \n\t" // save temp2
+ "vmovss %%xmm1 , 4(%9) \n\t" // save temp2
+ "vmovss %%xmm2 , 8(%9) \n\t" // save temp2
+ "vmovss %%xmm3 ,12(%9) \n\t" // save temp2
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (a0), // 4
+ "r" (a1), // 5
+ "r" (a2), // 6
+ "r" (a3), // 7
+ "r" (temp1), // 8
+ "r" (temp2) // 9
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
diff --git a/kernel/x86_64/ssymv_U_microk_nehalem-2.c b/kernel/x86_64/ssymv_U_microk_nehalem-2.c
new file mode 100644
index 000000000..9505a395a
--- /dev/null
+++ b/kernel/x86_64/ssymv_U_microk_nehalem-2.c
@@ -0,0 +1,130 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_4x4 1
+static void ssymv_kernel_4x4( BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) __attribute__ ((noinline));
+
+static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "xorps %%xmm0 , %%xmm0 \n\t" // temp2[0]
+ "xorps %%xmm1 , %%xmm1 \n\t" // temp2[1]
+ "xorps %%xmm2 , %%xmm2 \n\t" // temp2[2]
+ "xorps %%xmm3 , %%xmm3 \n\t" // temp2[3]
+ "movss (%8), %%xmm4 \n\t" // temp1[0]
+ "movss 4(%8), %%xmm5 \n\t" // temp1[1]
+ "movss 8(%8), %%xmm6 \n\t" // temp1[2]
+ "movss 12(%8), %%xmm7 \n\t" // temp1[3]
+ "shufps $0, %%xmm4, %%xmm4 \n\t"
+ "shufps $0, %%xmm5, %%xmm5 \n\t"
+ "shufps $0, %%xmm6, %%xmm6 \n\t"
+ "shufps $0, %%xmm7, %%xmm7 \n\t"
+
+ "xorq %0,%0 \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ "movups (%2,%0,4), %%xmm8 \n\t" // 4 * x
+ "movups (%3,%0,4), %%xmm9 \n\t" // 4 * y
+
+ "movups (%4,%0,4), %%xmm12 \n\t" // 4 * a
+ "movups (%5,%0,4), %%xmm13 \n\t" // 4 * a
+
+ "movups %%xmm12 , %%xmm11 \n\t"
+ "mulps %%xmm4 , %%xmm11 \n\t" // temp1 * a
+ "addps %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
+ "mulps %%xmm8 , %%xmm12 \n\t" // a * x
+ "addps %%xmm12 , %%xmm0 \n\t" // temp2 += x * a
+
+ "movups (%6,%0,4), %%xmm14 \n\t" // 4 * a
+ "movups (%7,%0,4), %%xmm15 \n\t" // 4 * a
+
+ "movups %%xmm13 , %%xmm11 \n\t"
+ "mulps %%xmm5 , %%xmm11 \n\t" // temp1 * a
+ "addps %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
+ "mulps %%xmm8 , %%xmm13 \n\t" // a * x
+ "addps %%xmm13 , %%xmm1 \n\t" // temp2 += x * a
+
+ "movups %%xmm14 , %%xmm11 \n\t"
+ "mulps %%xmm6 , %%xmm11 \n\t" // temp1 * a
+ "addps %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
+ "mulps %%xmm8 , %%xmm14 \n\t" // a * x
+ "addps %%xmm14 , %%xmm2 \n\t" // temp2 += x * a
+
+ "movups %%xmm15 , %%xmm11 \n\t"
+ "mulps %%xmm7 , %%xmm11 \n\t" // temp1 * a
+ "addps %%xmm11 , %%xmm9 \n\t" // y += temp1 * a
+ "mulps %%xmm8 , %%xmm15 \n\t" // a * x
+ "addps %%xmm15 , %%xmm3 \n\t" // temp2 += x * a
+
+ "movups %%xmm9, (%3,%0,4) \n\t" // 4 * y
+
+ "addq $4 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ "haddps %%xmm0, %%xmm0 \n\t"
+ "haddps %%xmm1, %%xmm1 \n\t"
+ "haddps %%xmm2, %%xmm2 \n\t"
+ "haddps %%xmm3, %%xmm3 \n\t"
+ "haddps %%xmm0, %%xmm0 \n\t"
+ "haddps %%xmm1, %%xmm1 \n\t"
+ "haddps %%xmm2, %%xmm2 \n\t"
+ "haddps %%xmm3, %%xmm3 \n\t"
+
+ "movss %%xmm0 , (%9) \n\t" // save temp2
+ "movss %%xmm1 , 4(%9) \n\t" // save temp2
+ "movss %%xmm2 , 8(%9) \n\t" // save temp2
+ "movss %%xmm3 , 12(%9) \n\t" // save temp2
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (a0), // 4
+ "r" (a1), // 5
+ "r" (a2), // 6
+ "r" (a3), // 7
+ "r" (temp1), // 8
+ "r" (temp2) // 9
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
diff --git a/kernel/x86_64/zaxpy.c b/kernel/x86_64/zaxpy.c
new file mode 100644
index 000000000..ca2f03dd0
--- /dev/null
+++ b/kernel/x86_64/zaxpy.c
@@ -0,0 +1,131 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+
+#if defined(BULLDOZER)
+#include "zaxpy_microk_bulldozer-2.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_4
+
+static void zaxpy_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+ BLASLONG register i = 0;
+ BLASLONG register ix = 0;
+ FLOAT da_r = alpha[0];
+ FLOAT da_i = alpha[1];
+
+
+ while(i < n)
+ {
+#if !defined(CONJ)
+ y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ;
+ y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ;
+ y[ix+2] += ( da_r * x[ix+2] - da_i * x[ix+3] ) ;
+ y[ix+3] += ( da_r * x[ix+3] + da_i * x[ix+2] ) ;
+#else
+ y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ;
+ y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ;
+ y[ix+2] += ( da_r * x[ix+2] + da_i * x[ix+3] ) ;
+ y[ix+3] -= ( da_r * x[ix+3] - da_i * x[ix+2] ) ;
+#endif
+
+ ix+=4 ;
+ i+=2 ;
+
+ }
+
+}
+
+#endif
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+ BLASLONG i=0;
+ BLASLONG ix=0,iy=0;
+ FLOAT da[2];
+
+ if ( n <= 0 ) return(0);
+
+ if ( (inc_x == 1) && (inc_y == 1) )
+ {
+
+ int n1 = n & -4;
+
+ if ( n1 )
+ {
+ da[0] = da_r;
+ da[1] = da_i;
+ zaxpy_kernel_4(n1, x, y , &da );
+ ix = 2 * n1;
+ }
+ i = n1;
+ while(i < n)
+ {
+#if !defined(CONJ)
+ y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ;
+ y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ;
+#else
+ y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ;
+ y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ;
+#endif
+ i++ ;
+ ix += 2;
+
+ }
+ return(0);
+
+
+ }
+
+ inc_x *=2;
+ inc_y *=2;
+
+ while(i < n)
+ {
+
+#if !defined(CONJ)
+ y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ;
+ y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ;
+#else
+ y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ;
+ y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ;
+#endif
+ ix += inc_x ;
+ iy += inc_y ;
+ i++ ;
+
+ }
+ return(0);
+
+}
+
+
diff --git a/kernel/x86_64/zaxpy_microk_bulldozer-2.c b/kernel/x86_64/zaxpy_microk_bulldozer-2.c
new file mode 100644
index 000000000..780109b69
--- /dev/null
+++ b/kernel/x86_64/zaxpy_microk_bulldozer-2.c
@@ -0,0 +1,135 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_4 1
+static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
+
+static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vmovddup (%4), %%xmm0 \n\t" // real part of alpha
+ "vmovddup 8(%4), %%xmm1 \n\t" // imag part of alpha
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+
+ "prefetcht0 768(%2,%0,8) \n\t"
+ "vmovups (%2,%0,8), %%xmm5 \n\t" // 1 complex values from x
+ "vmovups 16(%2,%0,8), %%xmm7 \n\t" // 1 complex values from x
+ "vmovups 32(%2,%0,8), %%xmm9 \n\t" // 1 complex values from x
+ "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 complex values from x
+ "prefetcht0 768(%3,%0,8) \n\t"
+
+#if !defined(CONJ)
+ "vfmaddpd (%3,%0,8), %%xmm0 , %%xmm5, %%xmm12 \n\t"
+ "vpermilpd $0x1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part
+ "vmulpd %%xmm1, %%xmm4 , %%xmm4 \n\t"
+
+ "vfmaddpd 16(%3,%0,8), %%xmm0 , %%xmm7, %%xmm13 \n\t"
+ "vpermilpd $0x1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part
+ "vmulpd %%xmm1, %%xmm6 , %%xmm6 \n\t"
+
+ "vfmaddpd 32(%3,%0,8), %%xmm0 , %%xmm9, %%xmm14 \n\t"
+ "vpermilpd $0x1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part
+ "vmulpd %%xmm1, %%xmm8 , %%xmm8 \n\t"
+
+ "vfmaddpd 48(%3,%0,8), %%xmm0 , %%xmm11,%%xmm15 \n\t"
+ "vpermilpd $0x1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part
+ "vmulpd %%xmm1, %%xmm10, %%xmm10 \n\t"
+
+ "vaddsubpd %%xmm4, %%xmm12, %%xmm12 \n\t"
+ "vaddsubpd %%xmm6, %%xmm13, %%xmm13 \n\t"
+ "vaddsubpd %%xmm8, %%xmm14, %%xmm14 \n\t"
+ "vaddsubpd %%xmm10,%%xmm15, %%xmm15 \n\t"
+
+#else
+
+ "vmulpd %%xmm0, %%xmm5, %%xmm4 \n\t" // a_r*x_r, a_r*x_i
+ "vmulpd %%xmm1, %%xmm5, %%xmm5 \n\t" // a_i*x_r, a_i*x_i
+ "vmulpd %%xmm0, %%xmm7, %%xmm6 \n\t" // a_r*x_r, a_r*x_i
+ "vmulpd %%xmm1, %%xmm7, %%xmm7 \n\t" // a_i*x_r, a_i*x_i
+ "vmulpd %%xmm0, %%xmm9, %%xmm8 \n\t" // a_r*x_r, a_r*x_i
+ "vmulpd %%xmm1, %%xmm9, %%xmm9 \n\t" // a_i*x_r, a_i*x_i
+ "vmulpd %%xmm0, %%xmm11, %%xmm10 \n\t" // a_r*x_r, a_r*x_i
+ "vmulpd %%xmm1, %%xmm11, %%xmm11 \n\t" // a_i*x_r, a_i*x_i
+
+ "vpermilpd $0x1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part
+ "vaddsubpd %%xmm4 ,%%xmm5 , %%xmm4 \n\t"
+ "vpermilpd $0x1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part
+
+ "vpermilpd $0x1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part
+ "vaddsubpd %%xmm6 ,%%xmm7 , %%xmm6 \n\t"
+ "vpermilpd $0x1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part
+
+ "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part
+ "vaddsubpd %%xmm8 ,%%xmm9 , %%xmm8 \n\t"
+ "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part
+
+ "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part
+ "vaddsubpd %%xmm10,%%xmm11, %%xmm10 \n\t"
+ "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part
+
+ "vaddpd (%3,%0,8) ,%%xmm4 , %%xmm12 \n\t"
+ "vaddpd 16(%3,%0,8) ,%%xmm6 , %%xmm13 \n\t"
+ "vaddpd 32(%3,%0,8) ,%%xmm8 , %%xmm14 \n\t"
+ "vaddpd 48(%3,%0,8) ,%%xmm10, %%xmm15 \n\t"
+
+
+#endif
+
+ "vmovups %%xmm12, (%3,%0,8) \n\t"
+ "vmovups %%xmm13, 16(%3,%0,8) \n\t"
+ "vmovups %%xmm14, 32(%3,%0,8) \n\t"
+ "vmovups %%xmm15, 48(%3,%0,8) \n\t"
+
+ "addq $8 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (alpha) // 4
+ : "cc",
+ "%xmm0", "%xmm1",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
diff --git a/kernel/x86_64/zgemm_kernel_4x2_haswell.S b/kernel/x86_64/zgemm_kernel_4x2_haswell.S
index e23e09ecc..f91bfa89b 100644
--- a/kernel/x86_64/zgemm_kernel_4x2_haswell.S
+++ b/kernel/x86_64/zgemm_kernel_4x2_haswell.S
@@ -222,8 +222,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
VFMADDPD_I( %ymm5 ,%ymm3,%ymm0 )
VFMADDPD_I( %ymm7 ,%ymm3,%ymm1 )
- addq $6*SIZE, BO
- addq $8*SIZE, AO
+ addq $ 6*SIZE, BO
+ addq $ 8*SIZE, AO
decq %rax
.endm
@@ -362,8 +362,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 )
VFMADDPD_I( %xmm7 ,%xmm3,%xmm1 )
- addq $6*SIZE, BO
- addq $4*SIZE, AO
+ addq $ 6*SIZE, BO
+ addq $ 4*SIZE, AO
decq %rax
.endm
@@ -491,8 +491,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 )
VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 )
- addq $6*SIZE, BO
- addq $2*SIZE, AO
+ addq $ 6*SIZE, BO
+ addq $ 2*SIZE, AO
decq %rax
.endm
diff --git a/kernel/x86_64/zgemv_n.c b/kernel/x86_64/zgemv_n.c
deleted file mode 100644
index 9098368a5..000000000
--- a/kernel/x86_64/zgemv_n.c
+++ /dev/null
@@ -1,258 +0,0 @@
-/***************************************************************************
-Copyright (c) 2014, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-#include <stdlib.h>
-#include <stdio.h>
-#include "common.h"
-
-#if defined(HASWELL)
-#include "zgemv_n_microk_haswell-2.c"
-#elif defined(SANDYBRIDGE)
-#include "zgemv_n_microk_sandy-2.c"
-#endif
-
-
-
-#define NBMAX 1024
-
-#ifndef HAVE_KERNEL_16x4
-
-static void zgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
-{
- BLASLONG i;
- FLOAT *a0,*a1,*a2,*a3;
- a0 = ap[0];
- a1 = ap[1];
- a2 = ap[2];
- a3 = ap[3];
-
- for ( i=0; i< 2*n; i+=2 )
- {
-#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
- y[i] += a0[i]*x[0] - a0[i+1] * x[1];
- y[i+1] += a0[i]*x[1] + a0[i+1] * x[0];
- y[i] += a1[i]*x[2] - a1[i+1] * x[3];
- y[i+1] += a1[i]*x[3] + a1[i+1] * x[2];
- y[i] += a2[i]*x[4] - a2[i+1] * x[5];
- y[i+1] += a2[i]*x[5] + a2[i+1] * x[4];
- y[i] += a3[i]*x[6] - a3[i+1] * x[7];
- y[i+1] += a3[i]*x[7] + a3[i+1] * x[6];
-#else
- y[i] += a0[i]*x[0] + a0[i+1] * x[1];
- y[i+1] += a0[i]*x[1] - a0[i+1] * x[0];
- y[i] += a1[i]*x[2] + a1[i+1] * x[3];
- y[i+1] += a1[i]*x[3] - a1[i+1] * x[2];
- y[i] += a2[i]*x[4] + a2[i+1] * x[5];
- y[i+1] += a2[i]*x[5] - a2[i+1] * x[4];
- y[i] += a3[i]*x[6] + a3[i+1] * x[7];
- y[i+1] += a3[i]*x[7] - a3[i+1] * x[6];
-#endif
- }
-}
-
-#endif
-
-static void zgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
-{
- BLASLONG i;
- FLOAT *a0;
- a0 = ap;
-
- for ( i=0; i< 2*n; i+=2 )
- {
-#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
- y[i] += a0[i]*x[0] - a0[i+1] * x[1];
- y[i+1] += a0[i]*x[1] + a0[i+1] * x[0];
-#else
- y[i] += a0[i]*x[0] + a0[i+1] * x[1];
- y[i+1] += a0[i]*x[1] - a0[i+1] * x[0];
-#endif
-
- }
-}
-
-
-static void zero_y(BLASLONG n, FLOAT *dest)
-{
- BLASLONG i;
- for ( i=0; i<2*n; i++ )
- {
- *dest = 0.0;
- dest++;
- }
-}
-
-
-
-static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i)
-{
- BLASLONG i;
- FLOAT temp_r;
- FLOAT temp_i;
- for ( i=0; i<n; i++ )
- {
-#if !defined(XCONJ)
- temp_r = alpha_r * src[0] - alpha_i * src[1];
- temp_i = alpha_r * src[1] + alpha_i * src[0];
-#else
- temp_r = alpha_r * src[0] + alpha_i * src[1];
- temp_i = -alpha_r * src[1] + alpha_i * src[0];
-#endif
-
- *dest += temp_r;
- *(dest+1) += temp_i;
-
- src+=2;
- dest += inc_dest;
- }
-}
-
-int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
-{
- BLASLONG i;
- BLASLONG j;
- FLOAT *a_ptr;
- FLOAT *x_ptr;
- FLOAT *y_ptr;
- FLOAT *ap[4];
- BLASLONG n1;
- BLASLONG m1;
- BLASLONG m2;
- BLASLONG n2;
- FLOAT xbuffer[8],*ybuffer;
-
-
-#if 0
-printf("%s %d %d %.16f %.16f %d %d %d\n","zgemv_n",m,n,alpha_r,alpha_i,lda,inc_x,inc_y);
-#endif
-
- if ( m < 1 ) return(0);
- if ( n < 1 ) return(0);
-
- ybuffer = buffer;
-
- inc_x *= 2;
- inc_y *= 2;
- lda *= 2;
-
- n1 = n / 4 ;
- n2 = n % 4 ;
-
- m1 = m - ( m % 16 );
- m2 = (m % NBMAX) - (m % 16) ;
-
- y_ptr = y;
-
- BLASLONG NB = NBMAX;
-
- while ( NB == NBMAX )
- {
-
- m1 -= NB;
- if ( m1 < 0)
- {
- if ( m2 == 0 ) break;
- NB = m2;
- }
-
- a_ptr = a;
- x_ptr = x;
- zero_y(NB,ybuffer);
- for( i = 0; i < n1 ; i++)
- {
-
- xbuffer[0] = x_ptr[0];
- xbuffer[1] = x_ptr[1];
- x_ptr += inc_x;
- xbuffer[2] = x_ptr[0];
- xbuffer[3] = x_ptr[1];
- x_ptr += inc_x;
- xbuffer[4] = x_ptr[0];
- xbuffer[5] = x_ptr[1];
- x_ptr += inc_x;
- xbuffer[6] = x_ptr[0];
- xbuffer[7] = x_ptr[1];
- x_ptr += inc_x;
-
- ap[0] = a_ptr;
- ap[1] = a_ptr + lda;
- ap[2] = ap[1] + lda;
- ap[3] = ap[2] + lda;
- zgemv_kernel_16x4(NB,ap,xbuffer,ybuffer);
- a_ptr += 4 * lda;
- }
-
- for( i = 0; i < n2 ; i++)
- {
- xbuffer[0] = x_ptr[0];
- xbuffer[1] = x_ptr[1];
- x_ptr += inc_x;
- zgemv_kernel_16x1(NB,a_ptr,xbuffer,ybuffer);
- a_ptr += 1 * lda;
-
- }
- add_y(NB,ybuffer,y_ptr,inc_y,alpha_r,alpha_i);
- a += 2 * NB;
- y_ptr += NB * inc_y;
- }
-
- j=0;
- while ( j < (m % 16))
- {
- a_ptr = a;
- x_ptr = x;
- FLOAT temp_r = 0.0;
- FLOAT temp_i = 0.0;
- for( i = 0; i < n; i++ )
- {
-#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
- temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
- temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
-#else
- temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
- temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
-#endif
-
- a_ptr += lda;
- x_ptr += inc_x;
- }
-
-#if !defined(XCONJ)
- y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
- y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
-#else
- y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
- y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
-#endif
- y_ptr += inc_y;
- a+=2;
- j++;
- }
- return(0);
-}
-
-
diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c
new file mode 100644
index 000000000..5ace6123b
--- /dev/null
+++ b/kernel/x86_64/zgemv_n_4.c
@@ -0,0 +1,626 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "common.h"
+
+
+#if defined(HASWELL)
+#include "zgemv_n_microk_haswell-4.c"
+#elif defined(SANDYBRIDGE)
+#include "zgemv_n_microk_sandy-4.c"
+#endif
+
+
+#define NBMAX 1024
+
+#ifndef HAVE_KERNEL_4x4
+
+static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
+{
+ BLASLONG i;
+ FLOAT *a0,*a1,*a2,*a3;
+ a0 = ap[0];
+ a1 = ap[1];
+ a2 = ap[2];
+ a3 = ap[3];
+
+ for ( i=0; i< 2*n; i+=2 )
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ y[i] += a0[i]*x[0] - a0[i+1] * x[1];
+ y[i+1] += a0[i]*x[1] + a0[i+1] * x[0];
+ y[i] += a1[i]*x[2] - a1[i+1] * x[3];
+ y[i+1] += a1[i]*x[3] + a1[i+1] * x[2];
+ y[i] += a2[i]*x[4] - a2[i+1] * x[5];
+ y[i+1] += a2[i]*x[5] + a2[i+1] * x[4];
+ y[i] += a3[i]*x[6] - a3[i+1] * x[7];
+ y[i+1] += a3[i]*x[7] + a3[i+1] * x[6];
+#else
+ y[i] += a0[i]*x[0] + a0[i+1] * x[1];
+ y[i+1] += a0[i]*x[1] - a0[i+1] * x[0];
+ y[i] += a1[i]*x[2] + a1[i+1] * x[3];
+ y[i+1] += a1[i]*x[3] - a1[i+1] * x[2];
+ y[i] += a2[i]*x[4] + a2[i+1] * x[5];
+ y[i+1] += a2[i]*x[5] - a2[i+1] * x[4];
+ y[i] += a3[i]*x[6] + a3[i+1] * x[7];
+ y[i+1] += a3[i]*x[7] - a3[i+1] * x[6];
+#endif
+ }
+}
+
+#endif
+
+
+
+#ifndef HAVE_KERNEL_4x2
+
+static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
+{
+ BLASLONG i;
+ FLOAT *a0,*a1;
+ a0 = ap[0];
+ a1 = ap[1];
+
+ for ( i=0; i< 2*n; i+=2 )
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ y[i] += a0[i]*x[0] - a0[i+1] * x[1];
+ y[i+1] += a0[i]*x[1] + a0[i+1] * x[0];
+ y[i] += a1[i]*x[2] - a1[i+1] * x[3];
+ y[i+1] += a1[i]*x[3] + a1[i+1] * x[2];
+#else
+ y[i] += a0[i]*x[0] + a0[i+1] * x[1];
+ y[i+1] += a0[i]*x[1] - a0[i+1] * x[0];
+ y[i] += a1[i]*x[2] + a1[i+1] * x[3];
+ y[i+1] += a1[i]*x[3] - a1[i+1] * x[2];
+#endif
+ }
+}
+
+#endif
+
+
+
+
+#ifndef HAVE_KERNEL_4x1
+
+
+static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
+{
+ BLASLONG i;
+ FLOAT *a0;
+ a0 = ap;
+
+ for ( i=0; i< 2*n; i+=2 )
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ y[i] += a0[i]*x[0] - a0[i+1] * x[1];
+ y[i+1] += a0[i]*x[1] + a0[i+1] * x[0];
+#else
+ y[i] += a0[i]*x[0] + a0[i+1] * x[1];
+ y[i+1] += a0[i]*x[1] - a0[i+1] * x[0];
+#endif
+
+ }
+}
+
+
+#endif
+
+
+#ifndef HAVE_KERNEL_ADDY
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline));
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i)
+{
+ BLASLONG i;
+
+ if ( inc_dest != 2 )
+ {
+
+ FLOAT temp_r;
+ FLOAT temp_i;
+ for ( i=0; i<n; i++ )
+ {
+#if !defined(XCONJ)
+ temp_r = alpha_r * src[0] - alpha_i * src[1];
+ temp_i = alpha_r * src[1] + alpha_i * src[0];
+#else
+ temp_r = alpha_r * src[0] + alpha_i * src[1];
+ temp_i = -alpha_r * src[1] + alpha_i * src[0];
+#endif
+
+ *dest += temp_r;
+ *(dest+1) += temp_i;
+
+ src+=2;
+ dest += inc_dest;
+ }
+ return;
+ }
+
+ FLOAT temp_r0;
+ FLOAT temp_i0;
+ FLOAT temp_r1;
+ FLOAT temp_i1;
+ FLOAT temp_r2;
+ FLOAT temp_i2;
+ FLOAT temp_r3;
+ FLOAT temp_i3;
+ for ( i=0; i<n; i+=4 )
+ {
+#if !defined(XCONJ)
+ temp_r0 = alpha_r * src[0] - alpha_i * src[1];
+ temp_i0 = alpha_r * src[1] + alpha_i * src[0];
+ temp_r1 = alpha_r * src[2] - alpha_i * src[3];
+ temp_i1 = alpha_r * src[3] + alpha_i * src[2];
+ temp_r2 = alpha_r * src[4] - alpha_i * src[5];
+ temp_i2 = alpha_r * src[5] + alpha_i * src[4];
+ temp_r3 = alpha_r * src[6] - alpha_i * src[7];
+ temp_i3 = alpha_r * src[7] + alpha_i * src[6];
+#else
+ temp_r0 = alpha_r * src[0] + alpha_i * src[1];
+ temp_i0 = -alpha_r * src[1] + alpha_i * src[0];
+ temp_r1 = alpha_r * src[2] + alpha_i * src[3];
+ temp_i1 = -alpha_r * src[3] + alpha_i * src[2];
+ temp_r2 = alpha_r * src[4] + alpha_i * src[5];
+ temp_i2 = -alpha_r * src[5] + alpha_i * src[4];
+ temp_r3 = alpha_r * src[6] + alpha_i * src[7];
+ temp_i3 = -alpha_r * src[7] + alpha_i * src[6];
+#endif
+
+ dest[0] += temp_r0;
+ dest[1] += temp_i0;
+ dest[2] += temp_r1;
+ dest[3] += temp_i1;
+ dest[4] += temp_r2;
+ dest[5] += temp_i2;
+ dest[6] += temp_r3;
+ dest[7] += temp_i3;
+
+ src += 8;
+ dest += 8;
+ }
+ return;
+
+}
+
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+ BLASLONG i;
+ BLASLONG j;
+ FLOAT *a_ptr;
+ FLOAT *x_ptr;
+ FLOAT *y_ptr;
+ FLOAT *ap[4];
+ BLASLONG n1;
+ BLASLONG m1;
+ BLASLONG m2;
+ BLASLONG m3;
+ BLASLONG n2;
+ BLASLONG lda4;
+ FLOAT xbuffer[8],*ybuffer;
+
+
+#if 0
+printf("%s %d %d %.16f %.16f %d %d %d\n","zgemv_n",m,n,alpha_r,alpha_i,lda,inc_x,inc_y);
+#endif
+
+ if ( m < 1 ) return(0);
+ if ( n < 1 ) return(0);
+
+ ybuffer = buffer;
+
+ inc_x *= 2;
+ inc_y *= 2;
+ lda *= 2;
+ lda4 = 4 * lda;
+
+ n1 = n / 4 ;
+ n2 = n % 4 ;
+
+ m3 = m % 4;
+ m1 = m - ( m % 4 );
+ m2 = (m % NBMAX) - (m % 4) ;
+
+ y_ptr = y;
+
+ BLASLONG NB = NBMAX;
+
+ while ( NB == NBMAX )
+ {
+
+ m1 -= NB;
+ if ( m1 < 0)
+ {
+ if ( m2 == 0 ) break;
+ NB = m2;
+ }
+
+ a_ptr = a;
+ ap[0] = a_ptr;
+ ap[1] = a_ptr + lda;
+ ap[2] = ap[1] + lda;
+ ap[3] = ap[2] + lda;
+ x_ptr = x;
+ //zero_y(NB,ybuffer);
+ memset(ybuffer,0,NB*16);
+
+ if ( inc_x == 2 )
+ {
+
+ for( i = 0; i < n1 ; i++)
+ {
+ zgemv_kernel_4x4(NB,ap,x_ptr,ybuffer);
+ ap[0] += lda4;
+ ap[1] += lda4;
+ ap[2] += lda4;
+ ap[3] += lda4;
+ a_ptr += lda4;
+ x_ptr += 8;
+ }
+
+ if ( n2 & 2 )
+ {
+ zgemv_kernel_4x2(NB,ap,x_ptr,ybuffer);
+ x_ptr += 4;
+ a_ptr += 2 * lda;
+
+ }
+
+ if ( n2 & 1 )
+ {
+ zgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer);
+ x_ptr += 2;
+ a_ptr += lda;
+
+ }
+ }
+ else
+ {
+
+ for( i = 0; i < n1 ; i++)
+ {
+
+ xbuffer[0] = x_ptr[0];
+ xbuffer[1] = x_ptr[1];
+ x_ptr += inc_x;
+ xbuffer[2] = x_ptr[0];
+ xbuffer[3] = x_ptr[1];
+ x_ptr += inc_x;
+ xbuffer[4] = x_ptr[0];
+ xbuffer[5] = x_ptr[1];
+ x_ptr += inc_x;
+ xbuffer[6] = x_ptr[0];
+ xbuffer[7] = x_ptr[1];
+ x_ptr += inc_x;
+
+ zgemv_kernel_4x4(NB,ap,xbuffer,ybuffer);
+ ap[0] += lda4;
+ ap[1] += lda4;
+ ap[2] += lda4;
+ ap[3] += lda4;
+ a_ptr += lda4;
+ }
+
+ for( i = 0; i < n2 ; i++)
+ {
+ xbuffer[0] = x_ptr[0];
+ xbuffer[1] = x_ptr[1];
+ x_ptr += inc_x;
+ zgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer);
+ a_ptr += 1 * lda;
+
+ }
+
+ }
+
+ add_y(NB,ybuffer,y_ptr,inc_y,alpha_r,alpha_i);
+ a += 2 * NB;
+ y_ptr += NB * inc_y;
+ }
+
+ if ( m3 == 0 ) return(0);
+
+ if ( m3 == 1 )
+ {
+ a_ptr = a;
+ x_ptr = x;
+ FLOAT temp_r = 0.0;
+ FLOAT temp_i = 0.0;
+
+ if ( lda == 2 && inc_x == 2 )
+ {
+
+
+ for( i=0 ; i < (n & -2); i+=2 )
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+ temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+ temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3];
+ temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2];
+#else
+ temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+ temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+ temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3];
+ temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2];
+#endif
+
+ a_ptr += 4;
+ x_ptr += 4;
+ }
+
+
+
+ for( ; i < n; i++ )
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+ temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+#else
+ temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+ temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+#endif
+
+ a_ptr += 2;
+ x_ptr += 2;
+ }
+
+
+ }
+ else
+ {
+
+ for( i = 0; i < n; i++ )
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+ temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+#else
+ temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+ temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+#endif
+
+ a_ptr += lda;
+ x_ptr += inc_x;
+ }
+
+ }
+#if !defined(XCONJ)
+ y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
+ y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
+#else
+ y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
+ y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
+#endif
+ return(0);
+ }
+
+ if ( m3 == 2 )
+ {
+ a_ptr = a;
+ x_ptr = x;
+ FLOAT temp_r0 = 0.0;
+ FLOAT temp_i0 = 0.0;
+ FLOAT temp_r1 = 0.0;
+ FLOAT temp_i1 = 0.0;
+
+ if ( lda == 4 && inc_x == 2 )
+ {
+
+ for( i = 0; i < (n & -2); i+=2 )
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+
+ temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+ temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+ temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
+ temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
+
+ temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3];
+ temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2];
+ temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3];
+ temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2];
+
+#else
+ temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+ temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+ temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+ temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
+
+ temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3];
+ temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2];
+ temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3];
+ temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2];
+
+#endif
+
+ a_ptr += 8;
+ x_ptr += 4;
+ }
+
+
+ for( ; i < n; i++ )
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+ temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+ temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
+ temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
+#else
+ temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+ temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+ temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+ temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
+#endif
+
+ a_ptr += 4;
+ x_ptr += 2;
+ }
+
+
+ }
+ else
+ {
+
+ for( i=0 ; i < n; i++ )
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+ temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+ temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
+ temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
+#else
+ temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+ temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+ temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+ temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
+#endif
+
+ a_ptr += lda;
+ x_ptr += inc_x;
+ }
+
+
+ }
+#if !defined(XCONJ)
+ y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
+ y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
+ y_ptr += inc_y;
+ y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
+ y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
+#else
+ y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
+ y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
+ y_ptr += inc_y;
+ y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
+ y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
+#endif
+ return(0);
+ }
+
+
+ if ( m3 == 3 )
+ {
+ a_ptr = a;
+ x_ptr = x;
+ FLOAT temp_r0 = 0.0;
+ FLOAT temp_i0 = 0.0;
+ FLOAT temp_r1 = 0.0;
+ FLOAT temp_i1 = 0.0;
+ FLOAT temp_r2 = 0.0;
+ FLOAT temp_i2 = 0.0;
+
+ if ( lda == 6 && inc_x == 2 )
+ {
+
+ for( i=0 ; i < n; i++ )
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+ temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+ temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
+ temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
+ temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
+ temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
+#else
+ temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+ temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+ temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+ temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
+ temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
+ temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
+#endif
+
+ a_ptr += 6;
+ x_ptr += 2;
+ }
+
+
+ }
+ else
+ {
+
+ for( i = 0; i < n; i++ )
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+ temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+ temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
+ temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
+ temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
+ temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
+#else
+ temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+ temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+ temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+ temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
+ temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
+ temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
+#endif
+
+ a_ptr += lda;
+ x_ptr += inc_x;
+ }
+
+ }
+#if !defined(XCONJ)
+ y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
+ y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
+ y_ptr += inc_y;
+ y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
+ y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
+ y_ptr += inc_y;
+ y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2;
+ y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2;
+#else
+ y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
+ y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
+ y_ptr += inc_y;
+ y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
+ y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
+ y_ptr += inc_y;
+ y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2;
+ y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2;
+#endif
+ return(0);
+ }
+
+
+
+
+
+ return(0);
+}
+
+
diff --git a/kernel/x86_64/zgemv_n_microk_haswell-2.c b/kernel/x86_64/zgemv_n_microk_haswell-2.c
deleted file mode 100644
index e1c5838f7..000000000
--- a/kernel/x86_64/zgemv_n_microk_haswell-2.c
+++ /dev/null
@@ -1,137 +0,0 @@
-/***************************************************************************
-Copyright (c) 2014, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-#define HAVE_KERNEL_16x4 1
-static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-
-static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
-{
-
- BLASLONG register i = 0;
-
- __asm__ __volatile__
- (
- "vzeroupper \n\t"
-
- "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0
- "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0
- "vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1
- "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1
- "vbroadcastsd 32(%2), %%ymm4 \n\t" // real part x2
- "vbroadcastsd 40(%2), %%ymm5 \n\t" // imag part x2
- "vbroadcastsd 48(%2), %%ymm6 \n\t" // real part x3
- "vbroadcastsd 56(%2), %%ymm7 \n\t" // imag part x3
-
-
- ".align 16 \n\t"
- ".L01LOOP%=: \n\t"
- "prefetcht0 192(%4,%0,8) \n\t"
- "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
- "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0
-
- "prefetcht0 192(%5,%0,8) \n\t"
- "vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values form a1
- "vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values form a1
-
- "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
- "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
- "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
- "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
-
- "prefetcht0 192(%6,%0,8) \n\t"
- "vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a2
- "vmovups 32(%6,%0,8), %%ymm9 \n\t" // 2 complex values form a2
-
- "vfmadd231pd %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
- "vfmadd231pd %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
- "vfmadd231pd %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
- "vfmadd231pd %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
-
- "prefetcht0 192(%7,%0,8) \n\t"
- "vmovups (%7,%0,8), %%ymm10 \n\t" // 2 complex values form a3
- "vmovups 32(%7,%0,8), %%ymm11 \n\t" // 2 complex values form a3
-
- "vfmadd231pd %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
- "vfmadd231pd %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
- "vfmadd231pd %%ymm9 , %%ymm4, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
- "vfmadd231pd %%ymm9 , %%ymm5, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
-
- "vfmadd231pd %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
- "vfmadd231pd %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
- "vfmadd231pd %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
- "vfmadd231pd %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
-
- "prefetcht0 192(%3,%0,8) \n\t"
- "vmovups (%3,%0,8), %%ymm10 \n\t"
- "vmovups 32(%3,%0,8), %%ymm11 \n\t"
-
-#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
- "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
- "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t"
- "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t"
- "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t"
-#else
- "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t"
- "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t"
- "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t"
- "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t"
- "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
- "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
-#endif
-
- "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
- "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
-
- "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
- "vmovups %%ymm13, 32(%3,%0,8) \n\t"
-
- "addq $8 , %0 \n\t"
- "subq $4 , %1 \n\t"
- "jnz .L01LOOP%= \n\t"
- "vzeroupper \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (n), // 1
- "r" (x), // 2
- "r" (y), // 3
- "r" (ap[0]), // 4
- "r" (ap[1]), // 5
- "r" (ap[2]), // 6
- "r" (ap[3]) // 7
- : "cc",
- "%xmm0", "%xmm1", "%xmm2", "%xmm3",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
- "memory"
- );
-
-}
-
-
diff --git a/kernel/x86_64/zgemv_n_microk_haswell-4.c b/kernel/x86_64/zgemv_n_microk_haswell-4.c
new file mode 100644
index 000000000..61358508a
--- /dev/null
+++ b/kernel/x86_64/zgemv_n_microk_haswell-4.c
@@ -0,0 +1,400 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_4x4 1
+static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+
+ "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0
+ "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0
+ "vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1
+ "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1
+ "vbroadcastsd 32(%2), %%ymm4 \n\t" // real part x2
+ "vbroadcastsd 40(%2), %%ymm5 \n\t" // imag part x2
+ "vbroadcastsd 48(%2), %%ymm6 \n\t" // real part x3
+ "vbroadcastsd 56(%2), %%ymm7 \n\t" // imag part x3
+
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ "prefetcht0 192(%4,%0,8) \n\t"
+ "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values from a0
+ "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values from a0
+
+ "prefetcht0 192(%5,%0,8) \n\t"
+ "vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values from a1
+ "vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values from a1
+
+ "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "prefetcht0 192(%6,%0,8) \n\t"
+ "vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values from a2
+ "vmovups 32(%6,%0,8), %%ymm9 \n\t" // 2 complex values from a2
+
+ "vfmadd231pd %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmadd231pd %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vfmadd231pd %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vfmadd231pd %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "prefetcht0 192(%7,%0,8) \n\t"
+ "vmovups (%7,%0,8), %%ymm10 \n\t" // 2 complex values from a3
+ "vmovups 32(%7,%0,8), %%ymm11 \n\t" // 2 complex values from a3
+
+ "vfmadd231pd %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmadd231pd %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vfmadd231pd %%ymm9 , %%ymm4, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vfmadd231pd %%ymm9 , %%ymm5, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "vfmadd231pd %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmadd231pd %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vfmadd231pd %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vfmadd231pd %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "prefetcht0 192(%3,%0,8) \n\t"
+ "vmovups (%3,%0,8), %%ymm10 \n\t"
+ "vmovups 32(%3,%0,8), %%ymm11 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
+ "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t"
+ "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t"
+ "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t"
+#else
+ "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t"
+ "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t"
+ "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t"
+ "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
+#endif
+
+ "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
+ "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
+ "vmovups %%ymm13, 32(%3,%0,8) \n\t"
+
+ "addq $8 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (ap[2]), // 6
+ "r" (ap[3]) // 7
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+#define HAVE_KERNEL_4x2 1
+static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+
+ "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0
+ "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0
+ "vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1
+ "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1
+
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ "prefetcht0 192(%4,%0,8) \n\t"
+ "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values from a0
+ "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values from a0
+
+ "prefetcht0 192(%5,%0,8) \n\t"
+ "vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values from a1
+ "vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values from a1
+
+ "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "vfmadd231pd %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmadd231pd %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vfmadd231pd %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vfmadd231pd %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+
+ "prefetcht0 192(%3,%0,8) \n\t"
+ "vmovups (%3,%0,8), %%ymm10 \n\t"
+ "vmovups 32(%3,%0,8), %%ymm11 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
+ "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t"
+ "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t"
+ "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t"
+#else
+ "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t"
+ "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t"
+ "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t"
+ "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
+#endif
+
+ "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
+ "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
+ "vmovups %%ymm13, 32(%3,%0,8) \n\t"
+
+ "addq $8 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]) // 5
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
+#define HAVE_KERNEL_4x1 1
+static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+
+ "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0
+ "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ "prefetcht0 192(%4,%0,8) \n\t"
+ "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values from a0
+ "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values from a0
+
+ "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "prefetcht0 192(%3,%0,8) \n\t"
+ "vmovups (%3,%0,8), %%ymm10 \n\t"
+ "vmovups 32(%3,%0,8), %%ymm11 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
+ "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t"
+ "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t"
+ "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t"
+#else
+ "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t"
+ "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t"
+ "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t"
+ "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
+#endif
+
+ "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
+ "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
+ "vmovups %%ymm13, 32(%3,%0,8) \n\t"
+
+ "addq $8 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap) // 4
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
+
+
+#define HAVE_KERNEL_ADDY 1
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline));
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i)
+{
+ BLASLONG i;
+
+ if ( inc_dest != 2 )
+ {
+
+ FLOAT temp_r;
+ FLOAT temp_i;
+ for ( i=0; i<n; i++ )
+ {
+#if !defined(XCONJ)
+ temp_r = alpha_r * src[0] - alpha_i * src[1];
+ temp_i = alpha_r * src[1] + alpha_i * src[0];
+#else
+ temp_r = alpha_r * src[0] + alpha_i * src[1];
+ temp_i = -alpha_r * src[1] + alpha_i * src[0];
+#endif
+
+ *dest += temp_r;
+ *(dest+1) += temp_i;
+
+ src+=2;
+ dest += inc_dest;
+ }
+ return;
+ }
+
+ i=0;
+
+ __asm__ __volatile__
+ (
+
+ "vzeroupper \n\t"
+
+ "vbroadcastsd (%4), %%ymm0 \n\t" // alpha_r
+ "vbroadcastsd (%5), %%ymm1 \n\t" // alpha_i
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ "prefetcht0 192(%2,%0,8) \n\t"
+ "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 complex values from src
+ "vmovups 32(%2,%0,8), %%ymm9 \n\t"
+
+ "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "prefetcht0 192(%3,%0,8) \n\t"
+ "vmovups (%3,%0,8), %%ymm10 \n\t" // 2 complex values from dest
+ "vmovups 32(%3,%0,8), %%ymm11 \n\t"
+
+#if !defined(XCONJ)
+ "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
+ "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t"
+ "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t"
+ "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t"
+#else
+ "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t"
+ "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t"
+ "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t"
+ "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
+#endif
+
+ "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
+ "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
+ "vmovups %%ymm13, 32(%3,%0,8) \n\t"
+
+ "addq $8 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (src), // 2
+ "r" (dest), // 3
+ "r" (&alpha_r), // 4
+ "r" (&alpha_i) // 5
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+ return;
+
+}
+
diff --git a/kernel/x86_64/zgemv_n_microk_sandy-2.c b/kernel/x86_64/zgemv_n_microk_sandy-2.c
deleted file mode 100644
index 352c60f87..000000000
--- a/kernel/x86_64/zgemv_n_microk_sandy-2.c
+++ /dev/null
@@ -1,149 +0,0 @@
-/***************************************************************************
-Copyright (c) 2014, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-#define HAVE_KERNEL_16x4 1
-static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-
-static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
-{
-
- BLASLONG register i = 0;
-
- __asm__ __volatile__
- (
- "vzeroupper \n\t"
-
- "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0
- "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0
- "vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1
- "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1
- "vbroadcastsd 32(%2), %%ymm4 \n\t" // real part x2
- "vbroadcastsd 40(%2), %%ymm5 \n\t" // imag part x2
- "vbroadcastsd 48(%2), %%ymm6 \n\t" // real part x3
- "vbroadcastsd 56(%2), %%ymm7 \n\t" // imag part x3
-
-
- ".align 16 \n\t"
- ".L01LOOP%=: \n\t"
-
- "prefetcht0 256(%4,%0,8) \n\t"
- "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
- "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0
-
- "vmulpd %%ymm8 , %%ymm0 , %%ymm12 \n\t"
- "vmulpd %%ymm8 , %%ymm1 , %%ymm13 \n\t"
- "prefetcht0 256(%5,%0,8) \n\t"
- "vmulpd %%ymm9 , %%ymm0 , %%ymm14 \n\t"
- "vmovups (%5,%0,8), %%ymm8 \n\t" // 2 complex values form a0
- "vmulpd %%ymm9 , %%ymm1 , %%ymm15 \n\t"
- "vmovups 32(%5,%0,8), %%ymm9 \n\t" // 2 complex values form a0
-
- "vmulpd %%ymm8 , %%ymm2 , %%ymm10 \n\t"
- "vaddpd %%ymm12, %%ymm10, %%ymm12 \n\t"
- "vmulpd %%ymm8 , %%ymm3 , %%ymm11 \n\t"
- "vaddpd %%ymm13, %%ymm11, %%ymm13 \n\t"
- "prefetcht0 256(%6,%0,8) \n\t"
- "vmulpd %%ymm9 , %%ymm2 , %%ymm10 \n\t"
- "vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t"
- "vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a0
- "vmulpd %%ymm9 , %%ymm3 , %%ymm11 \n\t"
- "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t"
-
- "vmovups 32(%6,%0,8), %%ymm9 \n\t" // 2 complex values form a0
-
- "vmulpd %%ymm8 , %%ymm4 , %%ymm10 \n\t"
- "vaddpd %%ymm12, %%ymm10, %%ymm12 \n\t"
- "vmulpd %%ymm8 , %%ymm5 , %%ymm11 \n\t"
- "vaddpd %%ymm13, %%ymm11, %%ymm13 \n\t"
- "prefetcht0 256(%7,%0,8) \n\t"
- "vmulpd %%ymm9 , %%ymm4 , %%ymm10 \n\t"
- "vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t"
- "vmovups (%7,%0,8), %%ymm8 \n\t" // 2 complex values form a0
- "vmulpd %%ymm9 , %%ymm5 , %%ymm11 \n\t"
- "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t"
-
- "vmovups 32(%7,%0,8), %%ymm9 \n\t" // 2 complex values form a0
-
- "vmulpd %%ymm8 , %%ymm6 , %%ymm10 \n\t"
- "vaddpd %%ymm12, %%ymm10, %%ymm12 \n\t"
- "vmulpd %%ymm8 , %%ymm7 , %%ymm11 \n\t"
- "vaddpd %%ymm13, %%ymm11, %%ymm13 \n\t"
- "vmulpd %%ymm9 , %%ymm6 , %%ymm10 \n\t"
- "vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t"
- "vmulpd %%ymm9 , %%ymm7 , %%ymm11 \n\t"
- "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t"
-
- "prefetcht0 256(%3,%0,8) \n\t"
- "vmovups (%3,%0,8), %%ymm10 \n\t"
- "vmovups 32(%3,%0,8), %%ymm11 \n\t"
-
-#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
- "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
- "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t"
- "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t"
- "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t"
-#else
- "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t"
- "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t"
- "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t"
- "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t"
- "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
- "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
-#endif
-
- "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
- "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
-
- "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
- "vmovups %%ymm13, 32(%3,%0,8) \n\t"
-
- "addq $8 , %0 \n\t"
- "subq $4 , %1 \n\t"
- "jnz .L01LOOP%= \n\t"
- "vzeroupper \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (n), // 1
- "r" (x), // 2
- "r" (y), // 3
- "r" (ap[0]), // 4
- "r" (ap[1]), // 5
- "r" (ap[2]), // 6
- "r" (ap[3]) // 7
- : "cc",
- "%xmm0", "%xmm1", "%xmm2", "%xmm3",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
- "memory"
- );
-
-}
-
-
diff --git a/kernel/x86_64/zgemv_n_microk_sandy-4.c b/kernel/x86_64/zgemv_n_microk_sandy-4.c
new file mode 100644
index 000000000..009e4801e
--- /dev/null
+++ b/kernel/x86_64/zgemv_n_microk_sandy-4.c
@@ -0,0 +1,417 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_4x4 1
+static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+
+ "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0
+ "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0
+ "vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1
+ "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1
+ "vbroadcastsd 32(%2), %%ymm4 \n\t" // real part x2
+ "vbroadcastsd 40(%2), %%ymm5 \n\t" // imag part x2
+ "vbroadcastsd 48(%2), %%ymm6 \n\t" // real part x3
+ "vbroadcastsd 56(%2), %%ymm7 \n\t" // imag part x3
+
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+
+ //"prefetcht0 256(%4,%0,8) \n\t"
+	"vmovups	(%4,%0,8), %%ymm8	 \n\t" // 2 complex values from a0
+	"vmovups	32(%4,%0,8), %%ymm9	 \n\t" // 2 complex values from a0
+
+ "vmulpd %%ymm8 , %%ymm0 , %%ymm12 \n\t"
+ "vmulpd %%ymm8 , %%ymm1 , %%ymm13 \n\t"
+ //"prefetcht0 256(%5,%0,8) \n\t"
+ "vmulpd %%ymm9 , %%ymm0 , %%ymm14 \n\t"
+	"vmovups	(%5,%0,8), %%ymm8	 \n\t" // 2 complex values from a1
+	"vmulpd		%%ymm9 , %%ymm1 , %%ymm15 \n\t"
+	"vmovups	32(%5,%0,8), %%ymm9	 \n\t" // 2 complex values from a1
+
+ "vmulpd %%ymm8 , %%ymm2 , %%ymm10 \n\t"
+ "vmulpd %%ymm8 , %%ymm3 , %%ymm11 \n\t"
+ "vaddpd %%ymm12, %%ymm10, %%ymm12 \n\t"
+ "vaddpd %%ymm13, %%ymm11, %%ymm13 \n\t"
+ //"prefetcht0 256(%6,%0,8) \n\t"
+ "vmulpd %%ymm9 , %%ymm2 , %%ymm10 \n\t"
+	"vmovups	(%6,%0,8), %%ymm8	 \n\t" // 2 complex values from a2
+ "vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t"
+ "vmulpd %%ymm9 , %%ymm3 , %%ymm11 \n\t"
+
+	"vmovups	32(%6,%0,8), %%ymm9	 \n\t" // 2 complex values from a2
+ "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t"
+
+ "vmulpd %%ymm8 , %%ymm4 , %%ymm10 \n\t"
+ "vmulpd %%ymm8 , %%ymm5 , %%ymm11 \n\t"
+ "vaddpd %%ymm12, %%ymm10, %%ymm12 \n\t"
+ "vaddpd %%ymm13, %%ymm11, %%ymm13 \n\t"
+ // "prefetcht0 256(%7,%0,8) \n\t"
+ "vmulpd %%ymm9 , %%ymm4 , %%ymm10 \n\t"
+	"vmovups	(%7,%0,8), %%ymm8	 \n\t" // 2 complex values from a3
+ "vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t"
+ "vmulpd %%ymm9 , %%ymm5 , %%ymm11 \n\t"
+
+	"vmovups	32(%7,%0,8), %%ymm9	 \n\t" // 2 complex values from a3
+ "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t"
+
+ "vmulpd %%ymm8 , %%ymm6 , %%ymm10 \n\t"
+ "vmulpd %%ymm8 , %%ymm7 , %%ymm11 \n\t"
+ "vaddpd %%ymm12, %%ymm10, %%ymm12 \n\t"
+ "vaddpd %%ymm13, %%ymm11, %%ymm13 \n\t"
+ "vmulpd %%ymm9 , %%ymm6 , %%ymm10 \n\t"
+ "vmulpd %%ymm9 , %%ymm7 , %%ymm11 \n\t"
+ "vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t"
+ "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t"
+
+ // "prefetcht0 256(%3,%0,8) \n\t"
+ "vmovups (%3,%0,8), %%ymm10 \n\t"
+ "vmovups 32(%3,%0,8), %%ymm11 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
+ "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t"
+ "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t"
+ "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t"
+#else
+ "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t"
+ "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t"
+ "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t"
+ "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
+#endif
+
+ "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
+ "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
+ "vmovups %%ymm13, 32(%3,%0,8) \n\t"
+
+ "addq $8 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (ap[2]), // 6
+ "r" (ap[3]) // 7
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
+#define HAVE_KERNEL_4x2 1
+static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+
+ "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0
+ "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0
+ "vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1
+ "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+
+ // "prefetcht0 256(%4,%0,8) \n\t"
+	"vmovups	(%4,%0,8), %%ymm8	 \n\t" // 2 complex values from a0
+	"vmovups	32(%4,%0,8), %%ymm9	 \n\t" // 2 complex values from a0
+ "vmulpd %%ymm8 , %%ymm0 , %%ymm12 \n\t"
+ "vmulpd %%ymm8 , %%ymm1 , %%ymm13 \n\t"
+
+ // "prefetcht0 256(%5,%0,8) \n\t"
+ "vmulpd %%ymm9 , %%ymm0 , %%ymm14 \n\t"
+	"vmovups	(%5,%0,8), %%ymm8	 \n\t" // 2 complex values from a1
+	"vmulpd		%%ymm9 , %%ymm1 , %%ymm15 \n\t"
+	"vmovups	32(%5,%0,8), %%ymm9	 \n\t" // 2 complex values from a1
+
+ "vmulpd %%ymm8 , %%ymm2 , %%ymm10 \n\t"
+ "vmulpd %%ymm8 , %%ymm3 , %%ymm11 \n\t"
+ "vaddpd %%ymm12, %%ymm10, %%ymm12 \n\t"
+ "vaddpd %%ymm13, %%ymm11, %%ymm13 \n\t"
+
+ "vmulpd %%ymm9 , %%ymm2 , %%ymm10 \n\t"
+ "vmulpd %%ymm9 , %%ymm3 , %%ymm11 \n\t"
+ "vaddpd %%ymm14, %%ymm10, %%ymm14 \n\t"
+ "vaddpd %%ymm15, %%ymm11, %%ymm15 \n\t"
+
+
+ // "prefetcht0 256(%3,%0,8) \n\t"
+ "vmovups (%3,%0,8), %%ymm10 \n\t"
+ "vmovups 32(%3,%0,8), %%ymm11 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
+ "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t"
+ "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t"
+ "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t"
+#else
+ "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t"
+ "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t"
+ "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t"
+ "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
+#endif
+
+ "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
+ "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
+ "vmovups %%ymm13, 32(%3,%0,8) \n\t"
+
+ "addq $8 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]) // 5
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
+#define HAVE_KERNEL_4x1 1
+static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+
+ "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0
+ "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+
+ // "prefetcht0 256(%4,%0,8) \n\t"
+	"vmovups	(%4,%0,8), %%ymm8	 \n\t" // 2 complex values from a0
+	"vmovups	32(%4,%0,8), %%ymm9	 \n\t" // 2 complex values from a0
+ "vmulpd %%ymm8 , %%ymm0 , %%ymm12 \n\t"
+ "vmulpd %%ymm8 , %%ymm1 , %%ymm13 \n\t"
+
+ "vmulpd %%ymm9 , %%ymm0 , %%ymm14 \n\t"
+ "vmulpd %%ymm9 , %%ymm1 , %%ymm15 \n\t"
+
+ // "prefetcht0 256(%3,%0,8) \n\t"
+ "vmovups (%3,%0,8), %%ymm10 \n\t"
+ "vmovups 32(%3,%0,8), %%ymm11 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
+ "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t"
+ "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t"
+ "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t"
+#else
+ "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t"
+ "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t"
+ "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t"
+ "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
+#endif
+
+ "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
+ "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
+ "vmovups %%ymm13, 32(%3,%0,8) \n\t"
+
+ "addq $8 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap) // 4
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
+
+#define HAVE_KERNEL_ADDY 1
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline));
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i)
+{
+ BLASLONG i;
+
+ if ( inc_dest != 2 )
+ {
+
+ FLOAT temp_r;
+ FLOAT temp_i;
+ for ( i=0; i<n; i++ )
+ {
+#if !defined(XCONJ)
+ temp_r = alpha_r * src[0] - alpha_i * src[1];
+ temp_i = alpha_r * src[1] + alpha_i * src[0];
+#else
+ temp_r = alpha_r * src[0] + alpha_i * src[1];
+ temp_i = -alpha_r * src[1] + alpha_i * src[0];
+#endif
+
+ *dest += temp_r;
+ *(dest+1) += temp_i;
+
+ src+=2;
+ dest += inc_dest;
+ }
+ return;
+ }
+
+ i=0;
+
+ __asm__ __volatile__
+ (
+
+ "vzeroupper \n\t"
+
+ "vbroadcastsd (%4), %%ymm0 \n\t" // alpha_r
+ "vbroadcastsd (%5), %%ymm1 \n\t" // alpha_i
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+ // "prefetcht0 192(%2,%0,8) \n\t"
+ "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 complex values from src
+ "vmovups 32(%2,%0,8), %%ymm9 \n\t"
+
+ "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ // "prefetcht0 192(%3,%0,8) \n\t"
+ "vmovups (%3,%0,8), %%ymm10 \n\t" // 2 complex values from dest
+ "vmovups 32(%3,%0,8), %%ymm11 \n\t"
+
+#if !defined(XCONJ)
+ "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
+ "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t"
+ "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t"
+ "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t"
+#else
+ "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t"
+ "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t"
+ "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t"
+ "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
+#endif
+
+ "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
+ "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
+ "vmovups %%ymm13, 32(%3,%0,8) \n\t"
+
+ "addq $8 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (src), // 2
+ "r" (dest), // 3
+ "r" (&alpha_r), // 4
+ "r" (&alpha_i) // 5
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+ return;
+
+}
+
diff --git a/kernel/x86_64/zgemv_t.c b/kernel/x86_64/zgemv_t.c
deleted file mode 100644
index 9f5444a72..000000000
--- a/kernel/x86_64/zgemv_t.c
+++ /dev/null
@@ -1,272 +0,0 @@
-/***************************************************************************
-Copyright (c) 2014, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-
-#include "common.h"
-
-
-#if defined(BULLDOZER) || defined(PILEDRIVER)
-#include "zgemv_t_microk_bulldozer-2.c"
-#elif defined(HASWELL)
-#include "zgemv_t_microk_haswell-2.c"
-#endif
-
-
-#define NBMAX 1028
-
-#ifndef HAVE_KERNEL_16x4
-
-static void zgemv_kernel_16x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
-{
- BLASLONG i;
- FLOAT *a0,*a1,*a2,*a3;
- a0 = ap[0];
- a1 = ap[1];
- a2 = ap[2];
- a3 = ap[3];
- FLOAT temp_r0 = 0.0;
- FLOAT temp_r1 = 0.0;
- FLOAT temp_r2 = 0.0;
- FLOAT temp_r3 = 0.0;
- FLOAT temp_i0 = 0.0;
- FLOAT temp_i1 = 0.0;
- FLOAT temp_i2 = 0.0;
- FLOAT temp_i3 = 0.0;
-
-
- for ( i=0; i< 2*n; i+=2 )
- {
-#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
- temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1];
- temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i];
- temp_r1 += a1[i]*x[i] - a1[i+1]*x[i+1];
- temp_i1 += a1[i]*x[i+1] + a1[i+1]*x[i];
- temp_r2 += a2[i]*x[i] - a2[i+1]*x[i+1];
- temp_i2 += a2[i]*x[i+1] + a2[i+1]*x[i];
- temp_r3 += a3[i]*x[i] - a3[i+1]*x[i+1];
- temp_i3 += a3[i]*x[i+1] + a3[i+1]*x[i];
-#else
- temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1];
- temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i];
- temp_r1 += a1[i]*x[i] + a1[i+1]*x[i+1];
- temp_i1 += a1[i]*x[i+1] - a1[i+1]*x[i];
- temp_r2 += a2[i]*x[i] + a2[i+1]*x[i+1];
- temp_i2 += a2[i]*x[i+1] - a2[i+1]*x[i];
- temp_r3 += a3[i]*x[i] + a3[i+1]*x[i+1];
- temp_i3 += a3[i]*x[i+1] - a3[i+1]*x[i];
-#endif
- }
- y[0] = temp_r0;
- y[1] = temp_i0;
- y[2] = temp_r1;
- y[3] = temp_i1;
- y[4] = temp_r2;
- y[5] = temp_i2;
- y[6] = temp_r3;
- y[7] = temp_i3;
-}
-
-#endif
-
-static void zgemv_kernel_16x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
-{
- BLASLONG i;
- FLOAT *a0;
- a0 = ap;
- FLOAT temp_r = 0.0;
- FLOAT temp_i = 0.0;
-
- for ( i=0; i< 2*n; i+=2 )
- {
-#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
- temp_r += a0[i]*x[i] - a0[i+1]*x[i+1];
- temp_i += a0[i]*x[i+1] + a0[i+1]*x[i];
-#else
- temp_r += a0[i]*x[i] + a0[i+1]*x[i+1];
- temp_i += a0[i]*x[i+1] - a0[i+1]*x[i];
-#endif
- }
- *y = temp_r;
- *(y+1) = temp_i;
-}
-
-static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
-{
- BLASLONG i;
- for ( i=0; i<n; i++ )
- {
- *dest = *src;
- *(dest+1) = *(src+1);
- dest+=2;
- src += inc_src;
- }
-}
-
-
-int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
-{
- BLASLONG i;
- BLASLONG j;
- FLOAT *a_ptr;
- FLOAT *x_ptr;
- FLOAT *y_ptr;
- FLOAT *ap[8];
- BLASLONG n1;
- BLASLONG m1;
- BLASLONG m2;
- BLASLONG n2;
- FLOAT ybuffer[8],*xbuffer;
-
- if ( m < 1 ) return(0);
- if ( n < 1 ) return(0);
-
- inc_x *= 2;
- inc_y *= 2;
- lda *= 2;
-
- xbuffer = buffer;
-
- n1 = n / 4 ;
- n2 = n % 4 ;
-
- m1 = m - ( m % 16 );
- m2 = (m % NBMAX) - (m % 16) ;
-
-
- BLASLONG NB = NBMAX;
-
- while ( NB == NBMAX )
- {
-
- m1 -= NB;
- if ( m1 < 0)
- {
- if ( m2 == 0 ) break;
- NB = m2;
- }
-
- y_ptr = y;
- a_ptr = a;
- x_ptr = x;
- copy_x(NB,x_ptr,xbuffer,inc_x);
- for( i = 0; i < n1 ; i++)
- {
- ap[0] = a_ptr;
- ap[1] = a_ptr + lda;
- ap[2] = ap[1] + lda;
- ap[3] = ap[2] + lda;
- zgemv_kernel_16x4(NB,ap,xbuffer,ybuffer);
- a_ptr += 4 * lda;
-
-#if !defined(XCONJ)
- y_ptr[0] += alpha_r * ybuffer[0] - alpha_i * ybuffer[1];
- y_ptr[1] += alpha_r * ybuffer[1] + alpha_i * ybuffer[0];
- y_ptr += inc_y;
- y_ptr[0] += alpha_r * ybuffer[2] - alpha_i * ybuffer[3];
- y_ptr[1] += alpha_r * ybuffer[3] + alpha_i * ybuffer[2];
- y_ptr += inc_y;
- y_ptr[0] += alpha_r * ybuffer[4] - alpha_i * ybuffer[5];
- y_ptr[1] += alpha_r * ybuffer[5] + alpha_i * ybuffer[4];
- y_ptr += inc_y;
- y_ptr[0] += alpha_r * ybuffer[6] - alpha_i * ybuffer[7];
- y_ptr[1] += alpha_r * ybuffer[7] + alpha_i * ybuffer[6];
- y_ptr += inc_y;
-#else
- y_ptr[0] += alpha_r * ybuffer[0] + alpha_i * ybuffer[1];
- y_ptr[1] -= alpha_r * ybuffer[1] - alpha_i * ybuffer[0];
- y_ptr += inc_y;
- y_ptr[0] += alpha_r * ybuffer[2] + alpha_i * ybuffer[3];
- y_ptr[1] -= alpha_r * ybuffer[3] - alpha_i * ybuffer[2];
- y_ptr += inc_y;
- y_ptr[0] += alpha_r * ybuffer[4] + alpha_i * ybuffer[5];
- y_ptr[1] -= alpha_r * ybuffer[5] - alpha_i * ybuffer[4];
- y_ptr += inc_y;
- y_ptr[0] += alpha_r * ybuffer[6] + alpha_i * ybuffer[7];
- y_ptr[1] -= alpha_r * ybuffer[7] - alpha_i * ybuffer[6];
- y_ptr += inc_y;
-#endif
- }
-
- for( i = 0; i < n2 ; i++)
- {
- zgemv_kernel_16x1(NB,a_ptr,xbuffer,ybuffer);
- a_ptr += 1 * lda;
-
-#if !defined(XCONJ)
- y_ptr[0] += alpha_r * ybuffer[0] - alpha_i * ybuffer[1];
- y_ptr[1] += alpha_r * ybuffer[1] + alpha_i * ybuffer[0];
- y_ptr += inc_y;
-#else
- y_ptr[0] += alpha_r * ybuffer[0] + alpha_i * ybuffer[1];
- y_ptr[1] -= alpha_r * ybuffer[1] - alpha_i * ybuffer[0];
- y_ptr += inc_y;
-#endif
-
- }
- a += 2* NB;
- x += NB * inc_x;
- }
-
- BLASLONG m3 = m % 16;
- if ( m3 == 0 ) return(0);
-
- x_ptr = x;
- copy_x(m3,x_ptr,xbuffer,inc_x);
- j=0;
- a_ptr = a;
- y_ptr = y;
- while ( j < n)
- {
- FLOAT temp_r = 0.0;
- FLOAT temp_i = 0.0;
- for( i = 0; i < m3*2; i+=2 )
- {
-#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
- temp_r += a_ptr[i] * xbuffer[i] - a_ptr[i+1] * xbuffer[i+1];
- temp_i += a_ptr[i] * xbuffer[i+1] + a_ptr[i+1] * xbuffer[i];
-#else
- temp_r += a_ptr[i] * xbuffer[i] + a_ptr[i+1] * xbuffer[i+1];
- temp_i += a_ptr[i] * xbuffer[i+1] - a_ptr[i+1] * xbuffer[i];
-#endif
- }
- a_ptr += lda;
-
-#if !defined(XCONJ)
- y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
- y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
-#else
- y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
- y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
-#endif
-
- y_ptr += inc_y;
- j++;
- }
- return(0);
-}
-
-
diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c
new file mode 100644
index 000000000..84cf4e2e8
--- /dev/null
+++ b/kernel/x86_64/zgemv_t_4.c
@@ -0,0 +1,583 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+
+#if defined(BULLDOZER) || defined(PILEDRIVER)
+#include "zgemv_t_microk_bulldozer-4.c"
+#elif defined(HASWELL)
+#include "zgemv_t_microk_haswell-4.c"
+#endif
+
+
+#define NBMAX 1024
+
+#ifndef HAVE_KERNEL_4x4
+
+static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+ BLASLONG i;
+ FLOAT *a0,*a1,*a2,*a3;
+ a0 = ap[0];
+ a1 = ap[1];
+ a2 = ap[2];
+ a3 = ap[3];
+ FLOAT alpha_r = alpha[0];
+ FLOAT alpha_i = alpha[1];
+ FLOAT temp_r0 = 0.0;
+ FLOAT temp_r1 = 0.0;
+ FLOAT temp_r2 = 0.0;
+ FLOAT temp_r3 = 0.0;
+ FLOAT temp_i0 = 0.0;
+ FLOAT temp_i1 = 0.0;
+ FLOAT temp_i2 = 0.0;
+ FLOAT temp_i3 = 0.0;
+
+
+ for ( i=0; i< 2*n; i+=2 )
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1];
+ temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i];
+ temp_r1 += a1[i]*x[i] - a1[i+1]*x[i+1];
+ temp_i1 += a1[i]*x[i+1] + a1[i+1]*x[i];
+ temp_r2 += a2[i]*x[i] - a2[i+1]*x[i+1];
+ temp_i2 += a2[i]*x[i+1] + a2[i+1]*x[i];
+ temp_r3 += a3[i]*x[i] - a3[i+1]*x[i+1];
+ temp_i3 += a3[i]*x[i+1] + a3[i+1]*x[i];
+#else
+ temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1];
+ temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i];
+ temp_r1 += a1[i]*x[i] + a1[i+1]*x[i+1];
+ temp_i1 += a1[i]*x[i+1] - a1[i+1]*x[i];
+ temp_r2 += a2[i]*x[i] + a2[i+1]*x[i+1];
+ temp_i2 += a2[i]*x[i+1] - a2[i+1]*x[i];
+ temp_r3 += a3[i]*x[i] + a3[i+1]*x[i+1];
+ temp_i3 += a3[i]*x[i+1] - a3[i+1]*x[i];
+#endif
+ }
+
+#if !defined(XCONJ)
+
+ y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
+ y[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
+ y[2] += alpha_r * temp_r1 - alpha_i * temp_i1;
+ y[3] += alpha_r * temp_i1 + alpha_i * temp_r1;
+ y[4] += alpha_r * temp_r2 - alpha_i * temp_i2;
+ y[5] += alpha_r * temp_i2 + alpha_i * temp_r2;
+ y[6] += alpha_r * temp_r3 - alpha_i * temp_i3;
+ y[7] += alpha_r * temp_i3 + alpha_i * temp_r3;
+
+#else
+
+ y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
+ y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
+ y[2] += alpha_r * temp_r1 + alpha_i * temp_i1;
+ y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1;
+ y[4] += alpha_r * temp_r2 + alpha_i * temp_i2;
+ y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2;
+ y[6] += alpha_r * temp_r3 + alpha_i * temp_i3;
+ y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3;
+
+#endif
+}
+
+#endif
+
+#ifndef HAVE_KERNEL_4x2
+
+static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+ BLASLONG i;
+ FLOAT *a0,*a1;
+ a0 = ap[0];
+ a1 = ap[1];
+ FLOAT alpha_r = alpha[0];
+ FLOAT alpha_i = alpha[1];
+ FLOAT temp_r0 = 0.0;
+ FLOAT temp_r1 = 0.0;
+ FLOAT temp_i0 = 0.0;
+ FLOAT temp_i1 = 0.0;
+
+
+ for ( i=0; i< 2*n; i+=2 )
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1];
+ temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i];
+ temp_r1 += a1[i]*x[i] - a1[i+1]*x[i+1];
+ temp_i1 += a1[i]*x[i+1] + a1[i+1]*x[i];
+#else
+ temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1];
+ temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i];
+ temp_r1 += a1[i]*x[i] + a1[i+1]*x[i+1];
+ temp_i1 += a1[i]*x[i+1] - a1[i+1]*x[i];
+#endif
+ }
+
+#if !defined(XCONJ)
+
+ y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
+ y[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
+ y[2] += alpha_r * temp_r1 - alpha_i * temp_i1;
+ y[3] += alpha_r * temp_i1 + alpha_i * temp_r1;
+
+#else
+
+ y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
+ y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
+ y[2] += alpha_r * temp_r1 + alpha_i * temp_i1;
+ y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1;
+
+#endif
+}
+
+#endif
+
+
+#ifndef HAVE_KERNEL_4x1
+
+static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+ BLASLONG i;
+ FLOAT *a0;
+ a0 = ap;
+ FLOAT alpha_r = alpha[0];
+ FLOAT alpha_i = alpha[1];
+ FLOAT temp_r0 = 0.0;
+ FLOAT temp_i0 = 0.0;
+
+ for ( i=0; i< 2*n; i+=2 )
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r0 += a0[i]*x[i] - a0[i+1]*x[i+1];
+ temp_i0 += a0[i]*x[i+1] + a0[i+1]*x[i];
+#else
+ temp_r0 += a0[i]*x[i] + a0[i+1]*x[i+1];
+ temp_i0 += a0[i]*x[i+1] - a0[i+1]*x[i];
+#endif
+ }
+
+#if !defined(XCONJ)
+
+ y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
+ y[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
+
+#else
+
+ y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
+ y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
+
+#endif
+
+
+}
+
+#endif
+
+
+static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
+{
+ BLASLONG i;
+ for ( i=0; i<n; i++ )
+ {
+ *dest = *src;
+ *(dest+1) = *(src+1);
+ dest+=2;
+ src += inc_src;
+ }
+}
+
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+ BLASLONG i;
+ BLASLONG j;
+ FLOAT *a_ptr;
+ FLOAT *x_ptr;
+ FLOAT *y_ptr;
+ FLOAT *ap[8];
+ BLASLONG n1;
+ BLASLONG m1;
+ BLASLONG m2;
+ BLASLONG m3;
+ BLASLONG n2;
+ BLASLONG lda4;
+ FLOAT ybuffer[8],*xbuffer;
+ FLOAT alpha[2];
+
+ if ( m < 1 ) return(0);
+ if ( n < 1 ) return(0);
+
+ inc_x <<= 1;
+ inc_y <<= 1;
+ lda <<= 1;
+ lda4 = lda << 2;
+
+ xbuffer = buffer;
+
+ n1 = n >> 2 ;
+ n2 = n & 3 ;
+
+ m3 = m & 3 ;
+ m1 = m - m3;
+ m2 = (m & (NBMAX-1)) - m3 ;
+
+ alpha[0] = alpha_r;
+ alpha[1] = alpha_i;
+
+ BLASLONG NB = NBMAX;
+
+ while ( NB == NBMAX )
+ {
+
+ m1 -= NB;
+ if ( m1 < 0)
+ {
+ if ( m2 == 0 ) break;
+ NB = m2;
+ }
+
+ y_ptr = y;
+ a_ptr = a;
+ x_ptr = x;
+ ap[0] = a_ptr;
+ ap[1] = a_ptr + lda;
+ ap[2] = ap[1] + lda;
+ ap[3] = ap[2] + lda;
+ if ( inc_x != 2 )
+ copy_x(NB,x_ptr,xbuffer,inc_x);
+ else
+ xbuffer = x_ptr;
+
+ if ( inc_y == 2 )
+ {
+
+ for( i = 0; i < n1 ; i++)
+ {
+ zgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha);
+ ap[0] += lda4;
+ ap[1] += lda4;
+ ap[2] += lda4;
+ ap[3] += lda4;
+ a_ptr += lda4;
+ y_ptr += 8;
+
+ }
+
+ if ( n2 & 2 )
+ {
+ zgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha);
+ a_ptr += lda * 2;
+ y_ptr += 4;
+
+ }
+
+ if ( n2 & 1 )
+ {
+ zgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha);
+ a_ptr += lda;
+ y_ptr += 2;
+
+ }
+
+ }
+ else
+ {
+
+ for( i = 0; i < n1 ; i++)
+ {
+ memset(ybuffer,0,64);
+ zgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha);
+ ap[0] += lda4;
+ ap[1] += lda4;
+ ap[2] += lda4;
+ ap[3] += lda4;
+ a_ptr += lda4;
+
+ y_ptr[0] += ybuffer[0];
+ y_ptr[1] += ybuffer[1];
+ y_ptr += inc_y;
+ y_ptr[0] += ybuffer[2];
+ y_ptr[1] += ybuffer[3];
+ y_ptr += inc_y;
+ y_ptr[0] += ybuffer[4];
+ y_ptr[1] += ybuffer[5];
+ y_ptr += inc_y;
+ y_ptr[0] += ybuffer[6];
+ y_ptr[1] += ybuffer[7];
+ y_ptr += inc_y;
+
+ }
+
+ for( i = 0; i < n2 ; i++)
+ {
+ memset(ybuffer,0,64);
+ zgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha);
+ a_ptr += lda;
+ y_ptr[0] += ybuffer[0];
+ y_ptr[1] += ybuffer[1];
+ y_ptr += inc_y;
+
+ }
+
+ }
+ a += 2 * NB;
+ x += NB * inc_x;
+ }
+
+
+
+ if ( m3 == 0 ) return(0);
+
+ x_ptr = x;
+ j=0;
+ a_ptr = a;
+ y_ptr = y;
+
+ if ( m3 == 3 )
+ {
+
+ FLOAT temp_r ;
+ FLOAT temp_i ;
+ FLOAT x0 = x_ptr[0];
+ FLOAT x1 = x_ptr[1];
+ x_ptr += inc_x;
+ FLOAT x2 = x_ptr[0];
+ FLOAT x3 = x_ptr[1];
+ x_ptr += inc_x;
+ FLOAT x4 = x_ptr[0];
+ FLOAT x5 = x_ptr[1];
+ while ( j < n)
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
+ temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
+ temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
+ temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
+ temp_r += a_ptr[4] * x4 - a_ptr[5] * x5;
+ temp_i += a_ptr[4] * x5 + a_ptr[5] * x4;
+#else
+
+ temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
+ temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
+ temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
+ temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
+ temp_r += a_ptr[4] * x4 + a_ptr[5] * x5;
+ temp_i += a_ptr[4] * x5 - a_ptr[5] * x4;
+#endif
+
+#if !defined(XCONJ)
+ y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
+ y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
+#else
+ y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
+ y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
+#endif
+
+ a_ptr += lda;
+ y_ptr += inc_y;
+ j++;
+ }
+ return(0);
+ }
+
+
+ if ( m3 == 2 )
+ {
+
+ FLOAT temp_r ;
+ FLOAT temp_i ;
+ FLOAT temp_r1 ;
+ FLOAT temp_i1 ;
+ FLOAT x0 = x_ptr[0];
+ FLOAT x1 = x_ptr[1];
+ x_ptr += inc_x;
+ FLOAT x2 = x_ptr[0];
+ FLOAT x3 = x_ptr[1];
+ FLOAT ar = alpha[0];
+ FLOAT ai = alpha[1];
+
+ while ( j < ( n & -2 ))
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
+ temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
+ temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
+ temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
+ a_ptr += lda;
+ temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1;
+ temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0;
+ temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3;
+ temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2;
+#else
+
+ temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
+ temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
+ temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
+ temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
+ a_ptr += lda;
+ temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1;
+ temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0;
+ temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3;
+ temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2;
+#endif
+
+#if !defined(XCONJ)
+ y_ptr[0] += ar * temp_r - ai * temp_i;
+ y_ptr[1] += ar * temp_i + ai * temp_r;
+ y_ptr += inc_y;
+ y_ptr[0] += ar * temp_r1 - ai * temp_i1;
+ y_ptr[1] += ar * temp_i1 + ai * temp_r1;
+#else
+ y_ptr[0] += ar * temp_r + ai * temp_i;
+ y_ptr[1] -= ar * temp_i - ai * temp_r;
+ y_ptr += inc_y;
+ y_ptr[0] += ar * temp_r1 + ai * temp_i1;
+ y_ptr[1] -= ar * temp_i1 - ai * temp_r1;
+#endif
+
+ a_ptr += lda;
+ y_ptr += inc_y;
+ j+=2;
+ }
+
+
+ while ( j < n)
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
+ temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
+ temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
+ temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
+#else
+
+ temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
+ temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
+ temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
+ temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
+#endif
+
+#if !defined(XCONJ)
+ y_ptr[0] += ar * temp_r - ai * temp_i;
+ y_ptr[1] += ar * temp_i + ai * temp_r;
+#else
+ y_ptr[0] += ar * temp_r + ai * temp_i;
+ y_ptr[1] -= ar * temp_i - ai * temp_r;
+#endif
+
+ a_ptr += lda;
+ y_ptr += inc_y;
+ j++;
+ }
+
+ return(0);
+ }
+
+
+ if ( m3 == 1 )
+ {
+
+ FLOAT temp_r ;
+ FLOAT temp_i ;
+ FLOAT temp_r1 ;
+ FLOAT temp_i1 ;
+ FLOAT x0 = x_ptr[0];
+ FLOAT x1 = x_ptr[1];
+ FLOAT ar = alpha[0];
+ FLOAT ai = alpha[1];
+
+ while ( j < ( n & -2 ))
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
+ temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
+ a_ptr += lda;
+ temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1;
+ temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0;
+#else
+
+ temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
+ temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
+ a_ptr += lda;
+ temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1;
+ temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0;
+#endif
+
+#if !defined(XCONJ)
+ y_ptr[0] += ar * temp_r - ai * temp_i;
+ y_ptr[1] += ar * temp_i + ai * temp_r;
+ y_ptr += inc_y;
+ y_ptr[0] += ar * temp_r1 - ai * temp_i1;
+ y_ptr[1] += ar * temp_i1 + ai * temp_r1;
+#else
+ y_ptr[0] += ar * temp_r + ai * temp_i;
+ y_ptr[1] -= ar * temp_i - ai * temp_r;
+ y_ptr += inc_y;
+ y_ptr[0] += ar * temp_r1 + ai * temp_i1;
+ y_ptr[1] -= ar * temp_i1 - ai * temp_r1;
+#endif
+
+ a_ptr += lda;
+ y_ptr += inc_y;
+ j+=2;
+ }
+
+ while ( j < n)
+ {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
+ temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
+#else
+
+ temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
+ temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
+#endif
+
+#if !defined(XCONJ)
+ y_ptr[0] += ar * temp_r - ai * temp_i;
+ y_ptr[1] += ar * temp_i + ai * temp_r;
+#else
+ y_ptr[0] += ar * temp_r + ai * temp_i;
+ y_ptr[1] -= ar * temp_i - ai * temp_r;
+#endif
+
+ a_ptr += lda;
+ y_ptr += inc_y;
+ j++;
+ }
+ return(0);
+ }
+
+ return(0);
+
+
+}
+
+
diff --git a/kernel/x86_64/zgemv_t_microk_bulldozer-2.c b/kernel/x86_64/zgemv_t_microk_bulldozer-2.c
deleted file mode 100644
index 65d5a10a2..000000000
--- a/kernel/x86_64/zgemv_t_microk_bulldozer-2.c
+++ /dev/null
@@ -1,180 +0,0 @@
-/***************************************************************************
-Copyright (c) 2014, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary froms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary from must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-#define HAVE_KERNEL_16x4 1
-static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-
-static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
-{
-
- BLASLONG register i = 0;
-
- __asm__ __volatile__
- (
- "vzeroupper \n\t"
-
- "vxorpd %%xmm8 , %%xmm8 , %%xmm8 \n\t" // temp
- "vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" // temp
- "vxorpd %%xmm10, %%xmm10, %%xmm10 \n\t" // temp
- "vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" // temp
- "vxorpd %%xmm12, %%xmm12, %%xmm12 \n\t" // temp
- "vxorpd %%xmm13, %%xmm13, %%xmm13 \n\t"
- "vxorpd %%xmm14, %%xmm14, %%xmm14 \n\t"
- "vxorpd %%xmm15, %%xmm15, %%xmm15 \n\t"
-
- ".align 16 \n\t"
- ".L01LOOP%=: \n\t"
-
- "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
- "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0
-
- "prefetcht0 192(%4,%0,8) \n\t"
- "vmovups (%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0
- "prefetcht0 192(%5,%0,8) \n\t"
- "vmovups (%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1
- "prefetcht0 192(%6,%0,8) \n\t"
- "vmovups (%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2
- "prefetcht0 192(%7,%0,8) \n\t"
- "vmovups (%7,%0,8), %%xmm7 \n\t" // 1 complex values from a3
-
- "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0
- "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0
- "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0
- "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0
- "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0
- "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0
- "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0
- "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0
-
- "vmovddup 16(%2,%0,8), %%xmm0 \n\t" // real value from x0
- "vmovddup 24(%2,%0,8), %%xmm1 \n\t" // imag value from x0
-
- "vmovups 16(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0
- "vmovups 16(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1
- "vmovups 16(%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2
- "vmovups 16(%7,%0,8), %%xmm7 \n\t" // 1 complex values from a3
-
- "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0
- "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0
- "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0
- "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0
- "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0
- "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0
- "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0
- "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0
-
- "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0
- "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0
-
- "vmovups 32(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0
- "vmovups 32(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1
- "vmovups 32(%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2
- "vmovups 32(%7,%0,8), %%xmm7 \n\t" // 1 complex values from a3
-
- "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0
- "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0
- "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0
- "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0
- "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0
- "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0
- "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0
- "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0
-
- "vmovddup 48(%2,%0,8), %%xmm0 \n\t" // real value from x0
- "vmovddup 56(%2,%0,8), %%xmm1 \n\t" // imag value from x0
-
- "vmovups 48(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0
- "vmovups 48(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1
- "vmovups 48(%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2
- "vmovups 48(%7,%0,8), %%xmm7 \n\t" // 1 complex values from a3
-
- "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0
- "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0
- "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0
- "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0
- "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0
- "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0
- "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0
- "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0
-
- "addq $8 , %0 \n\t"
- "subq $4 , %1 \n\t"
- "jnz .L01LOOP%= \n\t"
-
-#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
- "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t"
- "vpermilpd $0x1 , %%xmm11, %%xmm11 \n\t"
- "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t"
- "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t"
- "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t"
- "vaddsubpd %%xmm11, %%xmm10, %%xmm10 \n\t"
- "vaddsubpd %%xmm13, %%xmm12, %%xmm12 \n\t"
- "vaddsubpd %%xmm15, %%xmm14, %%xmm14 \n\t"
-#else
- "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t"
- "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t"
- "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t"
- "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t"
- "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t"
- "vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t"
- "vaddsubpd %%xmm12, %%xmm13, %%xmm12 \n\t"
- "vaddsubpd %%xmm14, %%xmm15, %%xmm14 \n\t"
- "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t"
- "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t"
- "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t"
- "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t"
-#endif
-
-
- "vmovups %%xmm8 , (%3) \n\t"
- "vmovups %%xmm10, 16(%3) \n\t"
- "vmovups %%xmm12, 32(%3) \n\t"
- "vmovups %%xmm14, 48(%3) \n\t"
-
- "vzeroupper \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (n), // 1
- "r" (x), // 2
- "r" (y), // 3
- "r" (ap[0]), // 4
- "r" (ap[1]), // 5
- "r" (ap[2]), // 6
- "r" (ap[3]) // 7
- : "cc",
- "%xmm0", "%xmm1", "%xmm2", "%xmm3",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
- "memory"
- );
-
-}
-
-
diff --git a/kernel/x86_64/zgemv_t_microk_bulldozer-4.c b/kernel/x86_64/zgemv_t_microk_bulldozer-4.c
new file mode 100644
index 000000000..006db226b
--- /dev/null
+++ b/kernel/x86_64/zgemv_t_microk_bulldozer-4.c
@@ -0,0 +1,457 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary froms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary from must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_4x4 1
+static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
+
+static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+
+ "vxorpd %%xmm8 , %%xmm8 , %%xmm8 \n\t" // temp
+ "vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" // temp
+ "vxorpd %%xmm10, %%xmm10, %%xmm10 \n\t" // temp
+ "vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" // temp
+ "vxorpd %%xmm12, %%xmm12, %%xmm12 \n\t" // temp
+ "vxorpd %%xmm13, %%xmm13, %%xmm13 \n\t"
+ "vxorpd %%xmm14, %%xmm14, %%xmm14 \n\t"
+ "vxorpd %%xmm15, %%xmm15, %%xmm15 \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+
+ "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
+ "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0
+
+ "prefetcht0 192(%4,%0,8) \n\t"
+ "vmovups (%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0
+ "prefetcht0 192(%5,%0,8) \n\t"
+ "vmovups (%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1
+ "prefetcht0 192(%6,%0,8) \n\t"
+ "vmovups (%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2
+ "prefetcht0 192(%7,%0,8) \n\t"
+ "vmovups (%7,%0,8), %%xmm7 \n\t" // 1 complex values from a3
+
+ "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0
+ "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0
+ "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0
+ "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0
+ "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0
+ "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0
+ "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0
+ "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0
+
+ "vmovddup 16(%2,%0,8), %%xmm0 \n\t" // real value from x0
+ "vmovddup 24(%2,%0,8), %%xmm1 \n\t" // imag value from x0
+
+ "vmovups 16(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0
+ "vmovups 16(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1
+ "vmovups 16(%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2
+ "vmovups 16(%7,%0,8), %%xmm7 \n\t" // 1 complex values from a3
+
+ "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0
+ "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0
+ "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0
+ "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0
+ "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0
+ "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0
+ "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0
+ "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0
+
+ "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0
+ "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0
+
+ "vmovups 32(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0
+ "vmovups 32(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1
+ "vmovups 32(%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2
+ "vmovups 32(%7,%0,8), %%xmm7 \n\t" // 1 complex values from a3
+
+ "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0
+ "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0
+ "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0
+ "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0
+ "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0
+ "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0
+ "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0
+ "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0
+
+ "vmovddup 48(%2,%0,8), %%xmm0 \n\t" // real value from x0
+ "vmovddup 56(%2,%0,8), %%xmm1 \n\t" // imag value from x0
+
+ "vmovups 48(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0
+ "vmovups 48(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1
+ "vmovups 48(%6,%0,8), %%xmm6 \n\t" // 1 complex values from a2
+ "vmovups 48(%7,%0,8), %%xmm7 \n\t" // 1 complex values from a3
+
+ "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0
+ "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0
+ "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0
+ "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0
+ "vfmaddpd %%xmm12, %%xmm6 , %%xmm0, %%xmm12 \n\t" // ar0*xr0,al0*xr0
+ "vfmaddpd %%xmm13, %%xmm6 , %%xmm1, %%xmm13 \n\t" // ar0*xl0,al0*xl0
+ "vfmaddpd %%xmm14, %%xmm7 , %%xmm0, %%xmm14 \n\t" // ar0*xr0,al0*xr0
+ "vfmaddpd %%xmm15, %%xmm7 , %%xmm1, %%xmm15 \n\t" // ar0*xl0,al0*xl0
+
+ "addq $8 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ "vmovddup (%8) , %%xmm0 \n\t" // value from alpha
+ "vmovddup 8(%8) , %%xmm1 \n\t" // value from alpha
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t"
+ "vpermilpd $0x1 , %%xmm11, %%xmm11 \n\t"
+ "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t"
+ "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t"
+ "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t"
+ "vaddsubpd %%xmm11, %%xmm10, %%xmm10 \n\t"
+ "vaddsubpd %%xmm13, %%xmm12, %%xmm12 \n\t"
+ "vaddsubpd %%xmm15, %%xmm14, %%xmm14 \n\t"
+#else
+ "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t"
+ "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t"
+ "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t"
+ "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t"
+ "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t"
+ "vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t"
+ "vaddsubpd %%xmm12, %%xmm13, %%xmm12 \n\t"
+ "vaddsubpd %%xmm14, %%xmm15, %%xmm14 \n\t"
+ "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t"
+ "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t"
+ "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t"
+ "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t"
+#endif
+
+ "vmulpd %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i
+ "vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r
+ "vmulpd %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i
+ "vmulpd %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r
+ "vmulpd %%xmm12, %%xmm1 , %%xmm13 \n\t" // t_r * alpha_i , t_i * alpha_i
+ "vmulpd %%xmm12, %%xmm0 , %%xmm12 \n\t" // t_r * alpha_r , t_i * alpha_r
+ "vmulpd %%xmm14, %%xmm1 , %%xmm15 \n\t" // t_r * alpha_i , t_i * alpha_i
+ "vmulpd %%xmm14, %%xmm0 , %%xmm14 \n\t" // t_r * alpha_r , t_i * alpha_r
+
+#if !defined(XCONJ)
+ "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t"
+ "vpermilpd $0x1 , %%xmm11, %%xmm11 \n\t"
+ "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t"
+ "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t"
+ "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t"
+ "vaddsubpd %%xmm11, %%xmm10, %%xmm10 \n\t"
+ "vaddsubpd %%xmm13, %%xmm12, %%xmm12 \n\t"
+ "vaddsubpd %%xmm15, %%xmm14, %%xmm14 \n\t"
+#else
+ "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t"
+ "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t"
+ "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t"
+ "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t"
+ "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t"
+ "vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t"
+ "vaddsubpd %%xmm12, %%xmm13, %%xmm12 \n\t"
+ "vaddsubpd %%xmm14, %%xmm15, %%xmm14 \n\t"
+ "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t"
+ "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t"
+ "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t"
+ "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t"
+#endif
+
+ "vaddpd (%3) , %%xmm8 , %%xmm8 \n\t"
+ "vaddpd 16(%3) , %%xmm10, %%xmm10 \n\t"
+ "vaddpd 32(%3) , %%xmm12, %%xmm12 \n\t"
+ "vaddpd 48(%3) , %%xmm14, %%xmm14 \n\t"
+
+ "vmovups %%xmm8 , (%3) \n\t"
+ "vmovups %%xmm10, 16(%3) \n\t"
+ "vmovups %%xmm12, 32(%3) \n\t"
+ "vmovups %%xmm14, 48(%3) \n\t"
+
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (ap[2]), // 6
+ "r" (ap[3]), // 7
+ "r" (alpha) // 8
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+#define HAVE_KERNEL_4x2 1
+static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
+
+static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+
+ "vxorpd %%xmm8 , %%xmm8 , %%xmm8 \n\t" // temp
+ "vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" // temp
+ "vxorpd %%xmm10, %%xmm10, %%xmm10 \n\t" // temp
+ "vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" // temp
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+
+ "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
+ "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0
+
+ "prefetcht0 192(%4,%0,8) \n\t"
+ "vmovups (%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0
+ "prefetcht0 192(%5,%0,8) \n\t"
+ "vmovups (%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1
+
+ "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0
+ "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0
+ "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0
+ "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0
+
+ "vmovddup 16(%2,%0,8), %%xmm0 \n\t" // real value from x0
+ "vmovddup 24(%2,%0,8), %%xmm1 \n\t" // imag value from x0
+
+ "vmovups 16(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0
+ "vmovups 16(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1
+
+ "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0
+ "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0
+ "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0
+ "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0
+
+ "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0
+ "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0
+
+ "vmovups 32(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0
+ "vmovups 32(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1
+
+ "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0
+ "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0
+ "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0
+ "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0
+
+ "vmovddup 48(%2,%0,8), %%xmm0 \n\t" // real value from x0
+ "vmovddup 56(%2,%0,8), %%xmm1 \n\t" // imag value from x0
+
+ "vmovups 48(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0
+ "vmovups 48(%5,%0,8), %%xmm5 \n\t" // 1 complex values from a1
+
+ "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0
+ "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0
+ "vfmaddpd %%xmm10, %%xmm5 , %%xmm0, %%xmm10 \n\t" // ar0*xr0,al0*xr0
+ "vfmaddpd %%xmm11, %%xmm5 , %%xmm1, %%xmm11 \n\t" // ar0*xl0,al0*xl0
+
+ "addq $8 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ "vmovddup (%6) , %%xmm0 \n\t" // value from alpha
+ "vmovddup 8(%6) , %%xmm1 \n\t" // value from alpha
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t"
+ "vpermilpd $0x1 , %%xmm11, %%xmm11 \n\t"
+ "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t"
+ "vaddsubpd %%xmm11, %%xmm10, %%xmm10 \n\t"
+#else
+ "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t"
+ "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t"
+ "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t"
+ "vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t"
+ "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t"
+ "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t"
+#endif
+
+ "vmulpd %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i
+ "vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r
+ "vmulpd %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i
+ "vmulpd %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r
+
+#if !defined(XCONJ)
+ "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t"
+ "vpermilpd $0x1 , %%xmm11, %%xmm11 \n\t"
+ "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t"
+ "vaddsubpd %%xmm11, %%xmm10, %%xmm10 \n\t"
+#else
+ "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t"
+ "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t"
+ "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t"
+ "vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t"
+ "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t"
+ "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t"
+#endif
+
+ "vaddpd (%3) , %%xmm8 , %%xmm8 \n\t"
+ "vaddpd 16(%3) , %%xmm10, %%xmm10 \n\t"
+
+ "vmovups %%xmm8 , (%3) \n\t"
+ "vmovups %%xmm10, 16(%3) \n\t"
+
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (alpha) // 6
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
+
+#define HAVE_KERNEL_4x1 1
+static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
+
+static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+
+ "vxorpd %%xmm8 , %%xmm8 , %%xmm8 \n\t" // temp
+ "vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" // temp
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+
+ "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
+ "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0
+
+ "prefetcht0 192(%4,%0,8) \n\t"
+ "vmovups (%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0
+ "vmovups 16(%4,%0,8), %%xmm5 \n\t" // 1 complex values from a0
+
+ "vmovddup 16(%2,%0,8), %%xmm2 \n\t" // real value from x0
+ "vmovddup 24(%2,%0,8), %%xmm3 \n\t" // imag value from x0
+
+ "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0
+ "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0
+
+ "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0
+ "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0
+
+ "vfmaddpd %%xmm8 , %%xmm5 , %%xmm2, %%xmm8 \n\t" // ar0*xr0,al0*xr0
+ "vfmaddpd %%xmm9 , %%xmm5 , %%xmm3, %%xmm9 \n\t" // ar0*xl0,al0*xl0
+
+ "vmovups 32(%4,%0,8), %%xmm4 \n\t" // 1 complex values from a0
+ "vmovups 48(%4,%0,8), %%xmm5 \n\t" // 1 complex values from a0
+
+ "vmovddup 48(%2,%0,8), %%xmm2 \n\t" // real value from x0
+ "vmovddup 56(%2,%0,8), %%xmm3 \n\t" // imag value from x0
+
+ "addq $8 , %0 \n\t"
+ "vfmaddpd %%xmm8 , %%xmm4 , %%xmm0, %%xmm8 \n\t" // ar0*xr0,al0*xr0
+ "vfmaddpd %%xmm9 , %%xmm4 , %%xmm1, %%xmm9 \n\t" // ar0*xl0,al0*xl0
+
+ "subq $4 , %1 \n\t"
+ "vfmaddpd %%xmm8 , %%xmm5 , %%xmm2, %%xmm8 \n\t" // ar0*xr0,al0*xr0
+ "vfmaddpd %%xmm9 , %%xmm5 , %%xmm3, %%xmm9 \n\t" // ar0*xl0,al0*xl0
+
+ "jnz .L01LOOP%= \n\t"
+
+ "vmovddup (%5) , %%xmm0 \n\t" // value from alpha
+ "vmovddup 8(%5) , %%xmm1 \n\t" // value from alpha
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t"
+ "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t"
+#else
+ "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t"
+ "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t"
+ "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t"
+#endif
+
+ "vmulpd %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i
+ "vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r
+
+#if !defined(XCONJ)
+ "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t"
+ "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t"
+#else
+ "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t"
+ "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t"
+ "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t"
+#endif
+
+ "vaddpd (%3) , %%xmm8 , %%xmm8 \n\t"
+
+ "vmovups %%xmm8 , (%3) \n\t"
+
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap), // 4
+ "r" (alpha) // 5
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
diff --git a/kernel/x86_64/zgemv_t_microk_haswell-2.c b/kernel/x86_64/zgemv_t_microk_haswell-2.c
deleted file mode 100644
index 8325db5cf..000000000
--- a/kernel/x86_64/zgemv_t_microk_haswell-2.c
+++ /dev/null
@@ -1,162 +0,0 @@
-/***************************************************************************
-Copyright (c) 2014, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary froms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary from must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-#define HAVE_KERNEL_16x4 1
-static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-
-static void zgemv_kernel_16x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
-{
-
- BLASLONG register i = 0;
-
- __asm__ __volatile__
- (
- "vzeroupper \n\t"
-
- "vxorpd %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp
- "vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp
- "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" // temp
- "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" // temp
- "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" // temp
- "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t"
- "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t"
- "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t"
-
- ".align 16 \n\t"
- ".L01LOOP%=: \n\t"
-
- "prefetcht0 192(%2,%0,8) \n\t"
- "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
- "prefetcht0 192(%4,%0,8) \n\t"
- "vmovups (%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1
- "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0
- "vmovups (%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0
- "prefetcht0 192(%5,%0,8) \n\t"
- "vmovddup 16(%2,%0,8), %%xmm2 \n\t" // real value from x1
- "prefetcht0 192(%6,%0,8) \n\t"
- "vmovups (%6,%0,8), %%ymm6 \n\t" // 2 complex values from a2
- "vmovddup 24(%2,%0,8), %%xmm3 \n\t" // imag value from x1
- "prefetcht0 192(%7,%0,8) \n\t"
- "vmovups (%7,%0,8), %%ymm7 \n\t" // 2 complex values from a3
- "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1
- "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1
-
- "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
- "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
- "vfmadd231pd %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
- "vfmadd231pd %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
- "vfmadd231pd %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
- "vfmadd231pd %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
- "vfmadd231pd %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
- "vfmadd231pd %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
-
- "vmovups 32(%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0
- "vmovups 32(%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1
- "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0
- "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0
- "vmovddup 48(%2,%0,8), %%xmm2 \n\t" // real value from x1
- "vmovddup 56(%2,%0,8), %%xmm3 \n\t" // imag value from x1
- "vmovups 32(%6,%0,8), %%ymm6 \n\t" // 2 complex values from a2
- "vmovups 32(%7,%0,8), %%ymm7 \n\t" // 2 complex values from a3
- "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1
- "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1
-
- "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
- "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
- "vfmadd231pd %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
- "vfmadd231pd %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
- "vfmadd231pd %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
- "vfmadd231pd %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
- "vfmadd231pd %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
- "vfmadd231pd %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
-
- "addq $8 , %0 \n\t"
- "subq $4 , %1 \n\t"
- "jnz .L01LOOP%= \n\t"
-
-#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
- "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
- "vpermilpd $0x5 , %%ymm11, %%ymm11 \n\t"
- "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
- "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t"
- "vaddsubpd %%ymm9 , %%ymm8, %%ymm8 \n\t"
- "vaddsubpd %%ymm11, %%ymm10, %%ymm10 \n\t"
- "vaddsubpd %%ymm13, %%ymm12, %%ymm12 \n\t"
- "vaddsubpd %%ymm15, %%ymm14, %%ymm14 \n\t"
-#else
- "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
- "vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t"
- "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t"
- "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t"
- "vaddsubpd %%ymm8 , %%ymm9 , %%ymm8 \n\t"
- "vaddsubpd %%ymm10, %%ymm11, %%ymm10 \n\t"
- "vaddsubpd %%ymm12, %%ymm13, %%ymm12 \n\t"
- "vaddsubpd %%ymm14, %%ymm15, %%ymm14 \n\t"
- "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
- "vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t"
- "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t"
- "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t"
-#endif
-
- "vextractf128 $1, %%ymm8 , %%xmm9 \n\t"
- "vextractf128 $1, %%ymm10, %%xmm11 \n\t"
- "vextractf128 $1, %%ymm12, %%xmm13 \n\t"
- "vextractf128 $1, %%ymm14, %%xmm15 \n\t"
-
- "vaddpd %%xmm8 , %%xmm9 , %%xmm8 \n\t"
- "vaddpd %%xmm10, %%xmm11, %%xmm10 \n\t"
- "vaddpd %%xmm12, %%xmm13, %%xmm12 \n\t"
- "vaddpd %%xmm14, %%xmm15, %%xmm14 \n\t"
-
- "vmovups %%xmm8 , (%3) \n\t"
- "vmovups %%xmm10, 16(%3) \n\t"
- "vmovups %%xmm12, 32(%3) \n\t"
- "vmovups %%xmm14, 48(%3) \n\t"
-
- "vzeroupper \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (n), // 1
- "r" (x), // 2
- "r" (y), // 3
- "r" (ap[0]), // 4
- "r" (ap[1]), // 5
- "r" (ap[2]), // 6
- "r" (ap[3]) // 7
- : "cc",
- "%xmm0", "%xmm1", "%xmm2", "%xmm3",
- "%xmm4", "%xmm5", "%xmm6", "%xmm7",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
- "memory"
- );
-
-}
-
-
diff --git a/kernel/x86_64/zgemv_t_microk_haswell-4.c b/kernel/x86_64/zgemv_t_microk_haswell-4.c
new file mode 100644
index 000000000..c87b5ce0f
--- /dev/null
+++ b/kernel/x86_64/zgemv_t_microk_haswell-4.c
@@ -0,0 +1,428 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_4x4 1
+static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
+
+static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+
+ "vxorpd %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp
+ "vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp
+ "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" // temp
+ "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" // temp
+ "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" // temp
+ "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t"
+ "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t"
+ "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t"
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+
+ "prefetcht0 192(%2,%0,8) \n\t"
+ "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
+ "prefetcht0 192(%4,%0,8) \n\t"
+ "vmovups (%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1
+ "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0
+ "vmovups (%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0
+ "prefetcht0 192(%5,%0,8) \n\t"
+ "vmovddup 16(%2,%0,8), %%xmm2 \n\t" // real value from x1
+ "prefetcht0 192(%6,%0,8) \n\t"
+ "vmovups (%6,%0,8), %%ymm6 \n\t" // 2 complex values from a2
+ "vmovddup 24(%2,%0,8), %%xmm3 \n\t" // imag value from x1
+ "prefetcht0 192(%7,%0,8) \n\t"
+ "vmovups (%7,%0,8), %%ymm7 \n\t" // 2 complex values from a3
+ "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1
+ "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1
+
+ "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+ "vfmadd231pd %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231pd %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+ "vfmadd231pd %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231pd %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+ "vfmadd231pd %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231pd %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+
+ "vmovups 32(%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0
+ "vmovups 32(%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1
+ "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0
+ "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0
+ "vmovddup 48(%2,%0,8), %%xmm2 \n\t" // real value from x1
+ "vmovddup 56(%2,%0,8), %%xmm3 \n\t" // imag value from x1
+ "vmovups 32(%6,%0,8), %%ymm6 \n\t" // 2 complex values from a2
+ "vmovups 32(%7,%0,8), %%ymm7 \n\t" // 2 complex values from a3
+ "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1
+ "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1
+
+ "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+ "vfmadd231pd %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231pd %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+ "vfmadd231pd %%ymm6 , %%ymm0, %%ymm12 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231pd %%ymm6 , %%ymm1, %%ymm13 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+ "vfmadd231pd %%ymm7 , %%ymm0, %%ymm14 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231pd %%ymm7 , %%ymm1, %%ymm15 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+
+ "addq $8 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ "vmovddup (%8) , %%xmm0 \n\t" // value from alpha
+ "vmovddup 8(%8) , %%xmm1 \n\t" // value from alpha
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
+ "vpermilpd $0x5 , %%ymm11, %%ymm11 \n\t"
+ "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
+ "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t"
+ "vaddsubpd %%ymm9 , %%ymm8, %%ymm8 \n\t"
+ "vaddsubpd %%ymm11, %%ymm10, %%ymm10 \n\t"
+ "vaddsubpd %%ymm13, %%ymm12, %%ymm12 \n\t"
+ "vaddsubpd %%ymm15, %%ymm14, %%ymm14 \n\t"
+#else
+ "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t"
+ "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t"
+ "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t"
+ "vaddsubpd %%ymm8 , %%ymm9 , %%ymm8 \n\t"
+ "vaddsubpd %%ymm10, %%ymm11, %%ymm10 \n\t"
+ "vaddsubpd %%ymm12, %%ymm13, %%ymm12 \n\t"
+ "vaddsubpd %%ymm14, %%ymm15, %%ymm14 \n\t"
+ "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t"
+ "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t"
+ "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t"
+#endif
+
+ "vextractf128 $1, %%ymm8 , %%xmm9 \n\t"
+ "vextractf128 $1, %%ymm10, %%xmm11 \n\t"
+ "vextractf128 $1, %%ymm12, %%xmm13 \n\t"
+ "vextractf128 $1, %%ymm14, %%xmm15 \n\t"
+
+ "vaddpd %%xmm8 , %%xmm9 , %%xmm8 \n\t"
+ "vaddpd %%xmm10, %%xmm11, %%xmm10 \n\t"
+ "vaddpd %%xmm12, %%xmm13, %%xmm12 \n\t"
+ "vaddpd %%xmm14, %%xmm15, %%xmm14 \n\t"
+
+ "vmulpd %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i
+ "vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r
+ "vmulpd %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i
+ "vmulpd %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r
+ "vmulpd %%xmm12, %%xmm1 , %%xmm13 \n\t" // t_r * alpha_i , t_i * alpha_i
+ "vmulpd %%xmm12, %%xmm0 , %%xmm12 \n\t" // t_r * alpha_r , t_i * alpha_r
+ "vmulpd %%xmm14, %%xmm1 , %%xmm15 \n\t" // t_r * alpha_i , t_i * alpha_i
+ "vmulpd %%xmm14, %%xmm0 , %%xmm14 \n\t" // t_r * alpha_r , t_i * alpha_r
+
+#if !defined(XCONJ)
+ "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t"
+ "vpermilpd $0x1 , %%xmm11, %%xmm11 \n\t"
+ "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t"
+ "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t"
+ "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t"
+ "vaddsubpd %%xmm11, %%xmm10, %%xmm10 \n\t"
+ "vaddsubpd %%xmm13, %%xmm12, %%xmm12 \n\t"
+ "vaddsubpd %%xmm15, %%xmm14, %%xmm14 \n\t"
+#else
+ "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t"
+ "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t"
+ "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t"
+ "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t"
+ "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t"
+ "vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t"
+ "vaddsubpd %%xmm12, %%xmm13, %%xmm12 \n\t"
+ "vaddsubpd %%xmm14, %%xmm15, %%xmm14 \n\t"
+ "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t"
+ "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t"
+ "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t"
+ "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t"
+#endif
+
+ "vaddpd (%3) , %%xmm8 , %%xmm8 \n\t"
+ "vaddpd 16(%3) , %%xmm10, %%xmm10 \n\t"
+ "vaddpd 32(%3) , %%xmm12, %%xmm12 \n\t"
+ "vaddpd 48(%3) , %%xmm14, %%xmm14 \n\t"
+
+ "vmovups %%xmm8 , (%3) \n\t"
+ "vmovups %%xmm10, 16(%3) \n\t"
+ "vmovups %%xmm12, 32(%3) \n\t"
+ "vmovups %%xmm14, 48(%3) \n\t"
+
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (ap[2]), // 6
+ "r" (ap[3]), // 7
+ "r" (alpha) // 8
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
+#define HAVE_KERNEL_4x2 1
+static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
+
+static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+
+ "vxorpd %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp
+ "vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp
+ "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" // temp
+ "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" // temp
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+
+ "prefetcht0 192(%2,%0,8) \n\t"
+ "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
+ "prefetcht0 192(%4,%0,8) \n\t"
+ "vmovups (%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1
+ "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0
+ "vmovups (%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0
+ "prefetcht0 192(%5,%0,8) \n\t"
+ "vmovddup 16(%2,%0,8), %%xmm2 \n\t" // real value from x1
+ "vmovddup 24(%2,%0,8), %%xmm3 \n\t" // imag value from x1
+ "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1
+ "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1
+
+ "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+ "vfmadd231pd %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231pd %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+
+ "vmovups 32(%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0
+ "vmovups 32(%5,%0,8), %%ymm5 \n\t" // 2 complex values from a1
+ "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0
+ "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0
+ "vmovddup 48(%2,%0,8), %%xmm2 \n\t" // real value from x1
+ "vmovddup 56(%2,%0,8), %%xmm3 \n\t" // imag value from x1
+ "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1
+ "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1
+
+ "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+ "vfmadd231pd %%ymm5 , %%ymm0, %%ymm10 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231pd %%ymm5 , %%ymm1, %%ymm11 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+
+ "addq $8 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ "vmovddup (%6) , %%xmm0 \n\t" // value from alpha
+ "vmovddup 8(%6) , %%xmm1 \n\t" // value from alpha
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
+ "vpermilpd $0x5 , %%ymm11, %%ymm11 \n\t"
+ "vaddsubpd %%ymm9 , %%ymm8, %%ymm8 \n\t"
+ "vaddsubpd %%ymm11, %%ymm10, %%ymm10 \n\t"
+#else
+ "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t"
+ "vaddsubpd %%ymm8 , %%ymm9 , %%ymm8 \n\t"
+ "vaddsubpd %%ymm10, %%ymm11, %%ymm10 \n\t"
+ "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilpd $0x5 , %%ymm10, %%ymm10 \n\t"
+#endif
+
+ "vextractf128 $1, %%ymm8 , %%xmm9 \n\t"
+ "vextractf128 $1, %%ymm10, %%xmm11 \n\t"
+
+ "vaddpd %%xmm8 , %%xmm9 , %%xmm8 \n\t"
+ "vaddpd %%xmm10, %%xmm11, %%xmm10 \n\t"
+
+ "vmulpd %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i
+ "vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r
+ "vmulpd %%xmm10, %%xmm1 , %%xmm11 \n\t" // t_r * alpha_i , t_i * alpha_i
+ "vmulpd %%xmm10, %%xmm0 , %%xmm10 \n\t" // t_r * alpha_r , t_i * alpha_r
+
+#if !defined(XCONJ)
+ "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t"
+ "vpermilpd $0x1 , %%xmm11, %%xmm11 \n\t"
+ "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t"
+ "vaddsubpd %%xmm11, %%xmm10, %%xmm10 \n\t"
+#else
+ "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t"
+ "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t"
+ "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t"
+ "vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t"
+ "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t"
+ "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t"
+#endif
+
+ "vaddpd (%3) , %%xmm8 , %%xmm8 \n\t"
+ "vaddpd 16(%3) , %%xmm10, %%xmm10 \n\t"
+
+ "vmovups %%xmm8 , (%3) \n\t"
+ "vmovups %%xmm10, 16(%3) \n\t"
+
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (alpha) // 6
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+#define HAVE_KERNEL_4x1 1
+static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
+
+static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+
+ "vxorpd %%ymm8 , %%ymm8 , %%ymm8 \n\t" // temp
+ "vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp
+
+ ".align 16 \n\t"
+ ".L01LOOP%=: \n\t"
+
+ "prefetcht0 192(%2,%0,8) \n\t"
+ "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
+ "prefetcht0 192(%4,%0,8) \n\t"
+ "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0
+ "vmovups (%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0
+ "vmovddup 16(%2,%0,8), %%xmm2 \n\t" // real value from x1
+ "vmovddup 24(%2,%0,8), %%xmm3 \n\t" // imag value from x1
+ "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1
+ "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1
+
+ "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+
+ "vmovups 32(%4,%0,8), %%ymm4 \n\t" // 2 complex values from a0
+ "vmovddup 32(%2,%0,8), %%xmm0 \n\t" // real value from x0
+ "vmovddup 40(%2,%0,8), %%xmm1 \n\t" // imag value from x0
+ "vmovddup 48(%2,%0,8), %%xmm2 \n\t" // real value from x1
+ "vmovddup 56(%2,%0,8), %%xmm3 \n\t" // imag value from x1
+ "vinsertf128 $1, %%xmm2, %%ymm0 , %%ymm0 \n\t" // real values from x0 and x1
+ "vinsertf128 $1, %%xmm3, %%ymm1 , %%ymm1 \n\t" // imag values from x0 and x1
+
+ "vfmadd231pd %%ymm4 , %%ymm0, %%ymm8 \n\t" // ar0*xr0,al0*xr0,ar1*xr1,al1*xr1
+ "vfmadd231pd %%ymm4 , %%ymm1, %%ymm9 \n\t" // ar0*xl0,al0*xl0,ar1*xl1,al1*xl1
+
+ "addq $8 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+ "jnz .L01LOOP%= \n\t"
+
+ "vmovddup (%5) , %%xmm0 \n\t" // value from alpha
+ "vmovddup 8(%5) , %%xmm1 \n\t" // value from alpha
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
+ "vaddsubpd %%ymm9 , %%ymm8, %%ymm8 \n\t"
+#else
+ "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
+ "vaddsubpd %%ymm8 , %%ymm9 , %%ymm8 \n\t"
+ "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
+#endif
+
+ "vextractf128 $1, %%ymm8 , %%xmm9 \n\t"
+
+ "vaddpd %%xmm8 , %%xmm9 , %%xmm8 \n\t"
+
+ "vmulpd %%xmm8 , %%xmm1 , %%xmm9 \n\t" // t_r * alpha_i , t_i * alpha_i
+ "vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // t_r * alpha_r , t_i * alpha_r
+
+#if !defined(XCONJ)
+ "vpermilpd $0x1 , %%xmm9 , %%xmm9 \n\t"
+ "vaddsubpd %%xmm9 , %%xmm8, %%xmm8 \n\t"
+#else
+ "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t"
+ "vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t"
+ "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t"
+#endif
+
+ "vaddpd (%3) , %%xmm8 , %%xmm8 \n\t"
+
+ "vmovups %%xmm8 , (%3) \n\t"
+
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap), // 4
+ "r" (alpha) // 5
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
+
diff --git a/lapack-netlib/TESTING/dstest.in b/lapack-netlib/TESTING/dstest.in
index 4a31076a6..b5a9f29f4 100644
--- a/lapack-netlib/TESTING/dstest.in
+++ b/lapack-netlib/TESTING/dstest.in
@@ -1,6 +1,6 @@
Data file for testing DSGESV/DSPOSV LAPACK routines
12 Number of values of M
-0 1 2 13 17 45 78 91 101 119 120 132 values of M (row dimension)
+0 1 2 13 17 45 78 91 101 119 112 132 values of M (row dimension)
6 Number of values of NRHS
1 2 14 15 16 13 Values of NRHS (number of right hand sides)
30.0 Threshold value of test ratio
diff --git a/param.h b/param.h
index 82f4ad842..3e20f5882 100644
--- a/param.h
+++ b/param.h
@@ -289,6 +289,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_Q 224
#define XGEMM_DEFAULT_Q 224
+#define CGEMM3M_DEFAULT_P 448
+#define ZGEMM3M_DEFAULT_P 224
+#define XGEMM3M_DEFAULT_P 112
+#define CGEMM3M_DEFAULT_Q 224
+#define ZGEMM3M_DEFAULT_Q 224
+#define XGEMM3M_DEFAULT_Q 224
+#define CGEMM3M_DEFAULT_R 12288
+#define ZGEMM3M_DEFAULT_R 12288
+#define XGEMM3M_DEFAULT_R 12288
+
#define SGEMM_DEFAULT_R sgemm_r
#define QGEMM_DEFAULT_R qgemm_r
#define DGEMM_DEFAULT_R dgemm_r
@@ -371,6 +381,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define QGEMM_DEFAULT_Q 224
#define XGEMM_DEFAULT_Q 224
+#define CGEMM3M_DEFAULT_P 448
+#define ZGEMM3M_DEFAULT_P 224
+#define XGEMM3M_DEFAULT_P 112
+#define CGEMM3M_DEFAULT_Q 224
+#define ZGEMM3M_DEFAULT_Q 224
+#define XGEMM3M_DEFAULT_Q 224
+#define CGEMM3M_DEFAULT_R 12288
+#define ZGEMM3M_DEFAULT_R 12288
+#define XGEMM3M_DEFAULT_R 12288
+
#define SGEMM_DEFAULT_R 12288
#define QGEMM_DEFAULT_R qgemm_r
#define DGEMM_DEFAULT_R 12288
@@ -1073,10 +1093,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GETRF_FACTOR 0.72
-#define CGEMM3M_DEFAULT_UNROLL_N 4
-#define CGEMM3M_DEFAULT_UNROLL_M 8
-#define ZGEMM3M_DEFAULT_UNROLL_N 2
-#define ZGEMM3M_DEFAULT_UNROLL_M 8
#endif
@@ -1152,10 +1168,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_Q 192
#define XGEMM_DEFAULT_Q 128
-#define CGEMM3M_DEFAULT_UNROLL_N 4
-#define CGEMM3M_DEFAULT_UNROLL_M 8
-#define ZGEMM3M_DEFAULT_UNROLL_N 2
-#define ZGEMM3M_DEFAULT_UNROLL_M 8
+#define CGEMM3M_DEFAULT_UNROLL_N 8
+#define CGEMM3M_DEFAULT_UNROLL_M 4
+#define ZGEMM3M_DEFAULT_UNROLL_N 8
+#define ZGEMM3M_DEFAULT_UNROLL_M 2
+
+#define CGEMM3M_DEFAULT_P 448
+#define ZGEMM3M_DEFAULT_P 224
+#define XGEMM3M_DEFAULT_P 112
+#define CGEMM3M_DEFAULT_Q 224
+#define ZGEMM3M_DEFAULT_Q 224
+#define XGEMM3M_DEFAULT_Q 224
+#define CGEMM3M_DEFAULT_R 12288
+#define ZGEMM3M_DEFAULT_R 12288
+#define XGEMM3M_DEFAULT_R 12288
+
+
#define GETRF_FACTOR 0.72
@@ -1259,10 +1287,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define XGEMM_DEFAULT_R xgemm_r
#define XGEMM_DEFAULT_Q 128
-#define CGEMM3M_DEFAULT_UNROLL_N 4
-#define CGEMM3M_DEFAULT_UNROLL_M 8
-#define ZGEMM3M_DEFAULT_UNROLL_N 2
-#define ZGEMM3M_DEFAULT_UNROLL_M 8
+#define CGEMM3M_DEFAULT_UNROLL_N 8
+#define CGEMM3M_DEFAULT_UNROLL_M 4
+#define ZGEMM3M_DEFAULT_UNROLL_N 8
+#define ZGEMM3M_DEFAULT_UNROLL_M 2
+
+#define CGEMM3M_DEFAULT_P 448
+#define ZGEMM3M_DEFAULT_P 224
+#define XGEMM3M_DEFAULT_P 112
+#define CGEMM3M_DEFAULT_Q 224
+#define ZGEMM3M_DEFAULT_Q 224
+#define XGEMM3M_DEFAULT_Q 224
+#define CGEMM3M_DEFAULT_R 12288
+#define ZGEMM3M_DEFAULT_R 12288
+#define XGEMM3M_DEFAULT_R 12288
+
#endif
diff --git a/test/Makefile b/test/Makefile
index 801efe244..75ea6de60 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -88,6 +88,31 @@ else
endif
endif
+
+level3_3m : zblat3_3m cblat3_3m
+ rm -f ?BLAT3_3M.SUMM
+ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3_3m < ./cblat3_3m.dat
+ @$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0
+ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./zblat3_3m < ./zblat3_3m.dat
+ @$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0
+ifdef SMP
+ rm -f ?BLAT3_3M.SUMM
+ifeq ($(USE_OPENMP), 1)
+ OMP_NUM_THREADS=2 ./cblat3_3m < ./cblat3_3m.dat
+ @$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0
+ OMP_NUM_THREADS=2 ./zblat3_3m < ./zblat3_3m.dat
+ @$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0
+else
+ OPENBLAS_NUM_THREADS=2 ./cblat3_3m < ./cblat3_3m.dat
+ @$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0
+ OPENBLAS_NUM_THREADS=2 ./zblat3_3m < ./zblat3_3m.dat
+ @$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0
+endif
+endif
+
+
+
+
FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
CEXTRALIB =
@@ -131,6 +156,15 @@ cblat3 : cblat3.$(SUFFIX) ../$(LIBNAME)
zblat3 : zblat3.$(SUFFIX) ../$(LIBNAME)
$(FC) $(FLDFLAGS) -o zblat3 zblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB)
+cblat3_3m : cblat3_3m.$(SUFFIX) ../$(LIBNAME)
+ $(FC) $(FLDFLAGS) -o cblat3_3m cblat3_3m.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB)
+
+zblat3_3m : zblat3_3m.$(SUFFIX) ../$(LIBNAME)
+ $(FC) $(FLDFLAGS) -o zblat3_3m zblat3_3m.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB)
+
+
+
+
clean:
@rm -f *.$(SUFFIX) *.$(PSUFFIX) gmon.$(SUFFIX)ut *.SUMM *.cxml *.exe *.pdb *.dwf \
sblat1 dblat1 cblat1 zblat1 \
@@ -139,6 +173,8 @@ clean:
sblat1p dblat1p cblat1p zblat1p \
sblat2p dblat2p cblat2p zblat2p \
sblat3p dblat3p cblat3p zblat3p \
+ zblat3_3m zblat3_3mp \
+ cblat3_3m cblat3_3mp \
*.stackdump *.dll
libs:
diff --git a/test/cblat3_3m.dat b/test/cblat3_3m.dat
new file mode 100644
index 000000000..cc1a2cef4
--- /dev/null
+++ b/test/cblat3_3m.dat
@@ -0,0 +1,23 @@
+'CBLAT3_3M.SUMM' NAME OF SUMMARY OUTPUT FILE
+6 UNIT NUMBER OF SUMMARY FILE
+'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
+-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+F LOGICAL FLAG, T TO STOP ON FAILURES.
+F LOGICAL FLAG, T TO TEST ERROR EXITS.
+16.0 THRESHOLD VALUE OF TEST RATIO
+7                       NUMBER OF VALUES OF N
+0 1 2 3 7 31 63         VALUES OF N
+3 NUMBER OF VALUES OF ALPHA
+(0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA
+3 NUMBER OF VALUES OF BETA
+(0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA
+CGEMM3M T PUT F FOR NO TEST. SAME COLUMNS.
+CHEMM F PUT F FOR NO TEST. SAME COLUMNS.
+CSYMM F PUT F FOR NO TEST. SAME COLUMNS.
+CTRMM F PUT F FOR NO TEST. SAME COLUMNS.
+CTRSM F PUT F FOR NO TEST. SAME COLUMNS.
+CHERK F PUT F FOR NO TEST. SAME COLUMNS.
+CSYRK F PUT F FOR NO TEST. SAME COLUMNS.
+CHER2K F PUT F FOR NO TEST. SAME COLUMNS.
+CSYR2K F PUT F FOR NO TEST. SAME COLUMNS.
diff --git a/test/cblat3_3m.f b/test/cblat3_3m.f
new file mode 100644
index 000000000..19f7830be
--- /dev/null
+++ b/test/cblat3_3m.f
@@ -0,0 +1,3442 @@
+ PROGRAM CBLAT3
+*
+* Test program for the COMPLEX Level 3 Blas.
+*
+* The program must be driven by a short data file. The first 14 records
+* of the file are read using list-directed input, the last 9 records
+* are read using the format ( A8, L2 ). An annotated example of a data
+* file can be obtained by deleting the first 3 characters from the
+* following 23 lines:
+* 'CBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE
+* 6 UNIT NUMBER OF SUMMARY FILE
+* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
+* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+* F LOGICAL FLAG, T TO STOP ON FAILURES.
+* T LOGICAL FLAG, T TO TEST ERROR EXITS.
+* 16.0 THRESHOLD VALUE OF TEST RATIO
+* 6 NUMBER OF VALUES OF N
+* 0 1 2 3 5 9 VALUES OF N
+* 3 NUMBER OF VALUES OF ALPHA
+* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA
+* 3 NUMBER OF VALUES OF BETA
+* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA
+* CGEMM3M T PUT F FOR NO TEST. SAME COLUMNS.
+* CHEMM T PUT F FOR NO TEST. SAME COLUMNS.
+* CSYMM T PUT F FOR NO TEST. SAME COLUMNS.
+* CTRMM T PUT F FOR NO TEST. SAME COLUMNS.
+* CTRSM T PUT F FOR NO TEST. SAME COLUMNS.
+* CHERK T PUT F FOR NO TEST. SAME COLUMNS.
+* CSYRK T PUT F FOR NO TEST. SAME COLUMNS.
+* CHER2K T PUT F FOR NO TEST. SAME COLUMNS.
+* CSYR2K T PUT F FOR NO TEST. SAME COLUMNS.
+*
+* See:
+*
+* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S.
+* A Set of Level 3 Basic Linear Algebra Subprograms.
+*
+* Technical Memorandum No.88 (Revision 1), Mathematics and
+* Computer Science Division, Argonne National Laboratory, 9700
+* South Cass Avenue, Argonne, Illinois 60439, US.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Parameters ..
+ INTEGER NIN
+ PARAMETER ( NIN = 5 )
+ INTEGER NSUBS
+ PARAMETER ( NSUBS = 9 )
+ COMPLEX ZERO, ONE
+ PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) )
+ REAL RZERO, RHALF, RONE
+ PARAMETER ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 )
+ INTEGER NMAX
+ PARAMETER ( NMAX = 65 )
+ INTEGER NIDMAX, NALMAX, NBEMAX
+ PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 )
+* .. Local Scalars ..
+ REAL EPS, ERR, THRESH
+ INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NOUT, NTRA
+ LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE,
+ $ TSTERR
+ CHARACTER*1 TRANSA, TRANSB
+ CHARACTER*8 SNAMET
+ CHARACTER*32 SNAPS, SUMMRY
+* .. Local Arrays ..
+ COMPLEX AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ),
+ $ ALF( NALMAX ), AS( NMAX*NMAX ),
+ $ BB( NMAX*NMAX ), BET( NBEMAX ),
+ $ BS( NMAX*NMAX ), C( NMAX, NMAX ),
+ $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ),
+ $ W( 2*NMAX )
+ REAL G( NMAX )
+ INTEGER IDIM( NIDMAX )
+ LOGICAL LTEST( NSUBS )
+ CHARACTER*8 SNAMES( NSUBS )
+* .. External Functions ..
+ REAL SDIFF
+ LOGICAL LCE
+ EXTERNAL SDIFF, LCE
+* .. External Subroutines ..
+ EXTERNAL CCHK1, CCHK2, CCHK3, CCHK4, CCHK5, CCHKE, CMMCH
+* .. Intrinsic Functions ..
+ INTRINSIC MAX, MIN
+* .. Scalars in Common ..
+ INTEGER INFOT, NOUTC
+ LOGICAL LERR, OK
+ CHARACTER*8 SRNAMT
+* .. Common blocks ..
+ COMMON /INFOC/INFOT, NOUTC, OK, LERR
+ COMMON /SRNAMC/SRNAMT
+* .. Data statements ..
+ DATA SNAMES/'CGEMM3M ', 'CHEMM ', 'CSYMM ',
+ $ 'CTRMM ',
+ $ 'CTRSM ', 'CHERK ', 'CSYRK ', 'CHER2K',
+ $ 'CSYR2K'/
+* .. Executable Statements ..
+*
+* Read name and unit number for summary output file and open file.
+*
+ READ( NIN, FMT = * )SUMMRY
+ READ( NIN, FMT = * )NOUT
+ OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' )
+ NOUTC = NOUT
+*
+* Read name and unit number for snapshot output file and open file.
+*
+ READ( NIN, FMT = * )SNAPS
+ READ( NIN, FMT = * )NTRA
+ TRACE = NTRA.GE.0
+ IF( TRACE )THEN
+ OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' )
+ END IF
+* Read the flag that directs rewinding of the snapshot file.
+ READ( NIN, FMT = * )REWI
+ REWI = REWI.AND.TRACE
+* Read the flag that directs stopping on any failure.
+ READ( NIN, FMT = * )SFATAL
+* Read the flag that indicates whether error exits are to be tested.
+ READ( NIN, FMT = * )TSTERR
+* Read the threshold value of the test ratio
+ READ( NIN, FMT = * )THRESH
+*
+* Read and check the parameter values for the tests.
+*
+* Values of N
+ READ( NIN, FMT = * )NIDIM
+ IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN
+ WRITE( NOUT, FMT = 9997 )'N', NIDMAX
+ GO TO 220
+ END IF
+ READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM )
+ DO 10 I = 1, NIDIM
+ IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN
+ WRITE( NOUT, FMT = 9996 )NMAX
+ GO TO 220
+ END IF
+ 10 CONTINUE
+* Values of ALPHA
+ READ( NIN, FMT = * )NALF
+ IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN
+ WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX
+ GO TO 220
+ END IF
+ READ( NIN, FMT = * )( ALF( I ), I = 1, NALF )
+* Values of BETA
+ READ( NIN, FMT = * )NBET
+ IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN
+ WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX
+ GO TO 220
+ END IF
+ READ( NIN, FMT = * )( BET( I ), I = 1, NBET )
+*
+* Report values of parameters.
+*
+ WRITE( NOUT, FMT = 9995 )
+ WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM )
+ WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF )
+ WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET )
+ IF( .NOT.TSTERR )THEN
+ WRITE( NOUT, FMT = * )
+ WRITE( NOUT, FMT = 9984 )
+ END IF
+ WRITE( NOUT, FMT = * )
+ WRITE( NOUT, FMT = 9999 )THRESH
+ WRITE( NOUT, FMT = * )
+*
+* Read names of subroutines and flags which indicate
+* whether they are to be tested.
+*
+ DO 20 I = 1, NSUBS
+ LTEST( I ) = .FALSE.
+ 20 CONTINUE
+ 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT
+ DO 40 I = 1, NSUBS
+ IF( SNAMET.EQ.SNAMES( I ) )
+ $ GO TO 50
+ 40 CONTINUE
+ WRITE( NOUT, FMT = 9990 )SNAMET
+ STOP
+ 50 LTEST( I ) = LTESTT
+ GO TO 30
+*
+ 60 CONTINUE
+ CLOSE ( NIN )
+*
+* Compute EPS (the machine precision).
+*
+ EPS = RONE
+ 70 CONTINUE
+ IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO )
+ $ GO TO 80
+ EPS = RHALF*EPS
+ GO TO 70
+ 80 CONTINUE
+ EPS = EPS + EPS
+ WRITE( NOUT, FMT = 9998 )EPS
+*
+* Check the reliability of CMMCH using exact data.
+*
+ N = MIN( 32, NMAX )
+ DO 100 J = 1, N
+ DO 90 I = 1, N
+ AB( I, J ) = MAX( I - J + 1, 0 )
+ 90 CONTINUE
+ AB( J, NMAX + 1 ) = J
+ AB( 1, NMAX + J ) = J
+ C( J, 1 ) = ZERO
+ 100 CONTINUE
+ DO 110 J = 1, N
+ CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3
+ 110 CONTINUE
+* CC holds the exact result. On exit from CMMCH CT holds
+* the result computed by CMMCH.
+ TRANSA = 'N'
+ TRANSB = 'N'
+ CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+ $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+ $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+ SAME = LCE( CC, CT, N )
+ IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
+ WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+ STOP
+ END IF
+ TRANSB = 'C'
+ CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+ $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+ $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+ SAME = LCE( CC, CT, N )
+ IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
+ WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+ STOP
+ END IF
+ DO 120 J = 1, N
+ AB( J, NMAX + 1 ) = N - J + 1
+ AB( 1, NMAX + J ) = N - J + 1
+ 120 CONTINUE
+ DO 130 J = 1, N
+ CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 -
+ $ ( ( J + 1 )*J*( J - 1 ) )/3
+ 130 CONTINUE
+ TRANSA = 'C'
+ TRANSB = 'N'
+ CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+ $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+ $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+ SAME = LCE( CC, CT, N )
+ IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
+ WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+ STOP
+ END IF
+ TRANSB = 'C'
+ CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+ $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+ $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+ SAME = LCE( CC, CT, N )
+ IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
+ WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+ STOP
+ END IF
+*
+* Test each subroutine in turn.
+*
+ DO 200 ISNUM = 1, NSUBS
+ WRITE( NOUT, FMT = * )
+ IF( .NOT.LTEST( ISNUM ) )THEN
+* Subprogram is not to be tested.
+ WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM )
+ ELSE
+ SRNAMT = SNAMES( ISNUM )
+* Test error exits.
+ IF( TSTERR )THEN
+ CALL CCHKE( ISNUM, SNAMES( ISNUM ), NOUT )
+ WRITE( NOUT, FMT = * )
+ END IF
+* Test computations.
+ INFOT = 0
+ OK = .TRUE.
+ FATAL = .FALSE.
+ GO TO ( 140, 150, 150, 160, 160, 170, 170,
+ $ 180, 180 )ISNUM
+* Test CGEMM3M, 01.
+ 140 CALL CCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+ $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+ $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C,
+ $ CC, CS, CT, G )
+ GO TO 190
+* Test CHEMM, 02, CSYMM, 03.
+ 150 CALL CCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+ $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+ $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C,
+ $ CC, CS, CT, G )
+ GO TO 190
+* Test CTRMM, 04, CTRSM, 05.
+ 160 CALL CCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+ $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB,
+ $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C )
+ GO TO 190
+* Test CHERK, 06, CSYRK, 07.
+ 170 CALL CCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+ $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+ $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C,
+ $ CC, CS, CT, G )
+ GO TO 190
+* Test CHER2K, 08, CSYR2K, 09.
+ 180 CALL CCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+ $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+ $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W )
+ GO TO 190
+*
+ 190 IF( FATAL.AND.SFATAL )
+ $ GO TO 210
+ END IF
+ 200 CONTINUE
+ WRITE( NOUT, FMT = 9986 )
+ GO TO 230
+*
+ 210 CONTINUE
+ WRITE( NOUT, FMT = 9985 )
+ GO TO 230
+*
+ 220 CONTINUE
+ WRITE( NOUT, FMT = 9991 )
+*
+ 230 CONTINUE
+ IF( TRACE )
+ $ CLOSE ( NTRA )
+ CLOSE ( NOUT )
+ STOP
+*
+ 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES',
+ $ 'S THAN', F8.2 )
+ 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 )
+ 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ',
+ $ 'THAN ', I2 )
+ 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 )
+ 9995 FORMAT( ' TESTS OF THE COMPLEX LEVEL 3 BLAS', //' THE F',
+ $ 'OLLOWING PARAMETER VALUES WILL BE USED:' )
+ 9994 FORMAT( ' FOR N ', 9I6 )
+ 9993 FORMAT( ' FOR ALPHA ',
+ $ 7( '(', F4.1, ',', F4.1, ') ', : ) )
+ 9992 FORMAT( ' FOR BETA ',
+ $ 7( '(', F4.1, ',', F4.1, ') ', : ) )
+ 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM',
+ $ /' ******* TESTS ABANDONED *******' )
+ 9990 FORMAT( ' SUBPROGRAM NAME ', A8, ' NOT RECOGNIZED', /' ******* T',
+ $ 'ESTS ABANDONED *******' )
+ 9989 FORMAT( ' ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU',
+ $ 'ATED WRONGLY.', /' CMMCH WAS CALLED WITH TRANSA = ', A1,
+ $ ' AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ',
+ $ 'ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ',
+ $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ',
+ $ '*******' )
+ 9988 FORMAT( A8, L2 )
+ 9987 FORMAT( 1X, A8, ' WAS NOT TESTED' )
+ 9986 FORMAT( /' END OF TESTS' )
+ 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' )
+ 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' )
+*
+* End of CBLAT3.
+*
+ END
+ SUBROUTINE CCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+ $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+ $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G )
+*
+* Tests CGEMM3M.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Parameters ..
+ COMPLEX ZERO
+ PARAMETER ( ZERO = ( 0.0, 0.0 ) )
+ REAL RZERO
+ PARAMETER ( RZERO = 0.0 )
+* .. Scalar Arguments ..
+ REAL EPS, THRESH
+ INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA
+ LOGICAL FATAL, REWI, TRACE
+ CHARACTER*8 SNAME
+* .. Array Arguments ..
+ COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+ $ AS( NMAX*NMAX ), B( NMAX, NMAX ),
+ $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ),
+ $ C( NMAX, NMAX ), CC( NMAX*NMAX ),
+ $ CS( NMAX*NMAX ), CT( NMAX )
+ REAL G( NMAX )
+ INTEGER IDIM( NIDIM )
+* .. Local Scalars ..
+ COMPLEX ALPHA, ALS, BETA, BLS
+ REAL ERR, ERRMAX
+ INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA,
+ $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M,
+ $ MA, MB, MS, N, NA, NARGS, NB, NC, NS
+ LOGICAL NULL, RESET, SAME, TRANA, TRANB
+ CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB
+ CHARACTER*3 ICH
+* .. Local Arrays ..
+ LOGICAL ISAME( 13 )
+* .. External Functions ..
+ LOGICAL LCE, LCERES
+ EXTERNAL LCE, LCERES
+* .. External Subroutines ..
+ EXTERNAL CGEMM3M, CMAKE, CMMCH
+* .. Intrinsic Functions ..
+ INTRINSIC MAX
+* .. Scalars in Common ..
+ INTEGER INFOT, NOUTC
+ LOGICAL LERR, OK
+* .. Common blocks ..
+ COMMON /INFOC/INFOT, NOUTC, OK, LERR
+* .. Data statements ..
+ DATA ICH/'NTC'/
+* .. Executable Statements ..
+*
+ NARGS = 13
+ NC = 0
+ RESET = .TRUE.
+ ERRMAX = RZERO
+*
+ DO 110 IM = 1, NIDIM
+ M = IDIM( IM )
+*
+ DO 100 IN = 1, NIDIM
+ N = IDIM( IN )
+* Set LDC to 1 more than minimum value if room.
+ LDC = M
+ IF( LDC.LT.NMAX )
+ $ LDC = LDC + 1
+* Skip tests if not enough room.
+ IF( LDC.GT.NMAX )
+ $ GO TO 100
+ LCC = LDC*N
+ NULL = N.LE.0.OR.M.LE.0
+*
+ DO 90 IK = 1, NIDIM
+ K = IDIM( IK )
+*
+ DO 80 ICA = 1, 3
+ TRANSA = ICH( ICA: ICA )
+ TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C'
+*
+ IF( TRANA )THEN
+ MA = K
+ NA = M
+ ELSE
+ MA = M
+ NA = K
+ END IF
+* Set LDA to 1 more than minimum value if room.
+ LDA = MA
+ IF( LDA.LT.NMAX )
+ $ LDA = LDA + 1
+* Skip tests if not enough room.
+ IF( LDA.GT.NMAX )
+ $ GO TO 80
+ LAA = LDA*NA
+*
+* Generate the matrix A.
+*
+ CALL CMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA,
+ $ RESET, ZERO )
+*
+ DO 70 ICB = 1, 3
+ TRANSB = ICH( ICB: ICB )
+ TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C'
+*
+ IF( TRANB )THEN
+ MB = N
+ NB = K
+ ELSE
+ MB = K
+ NB = N
+ END IF
+* Set LDB to 1 more than minimum value if room.
+ LDB = MB
+ IF( LDB.LT.NMAX )
+ $ LDB = LDB + 1
+* Skip tests if not enough room.
+ IF( LDB.GT.NMAX )
+ $ GO TO 70
+ LBB = LDB*NB
+*
+* Generate the matrix B.
+*
+ CALL CMAKE( 'GE', ' ', ' ', MB, NB, B, NMAX, BB,
+ $ LDB, RESET, ZERO )
+*
+ DO 60 IA = 1, NALF
+ ALPHA = ALF( IA )
+*
+ DO 50 IB = 1, NBET
+ BETA = BET( IB )
+*
+* Generate the matrix C.
+*
+ CALL CMAKE( 'GE', ' ', ' ', M, N, C, NMAX,
+ $ CC, LDC, RESET, ZERO )
+*
+ NC = NC + 1
+*
+* Save every datum before calling the
+* subroutine.
+*
+ TRANAS = TRANSA
+ TRANBS = TRANSB
+ MS = M
+ NS = N
+ KS = K
+ ALS = ALPHA
+ DO 10 I = 1, LAA
+ AS( I ) = AA( I )
+ 10 CONTINUE
+ LDAS = LDA
+ DO 20 I = 1, LBB
+ BS( I ) = BB( I )
+ 20 CONTINUE
+ LDBS = LDB
+ BLS = BETA
+ DO 30 I = 1, LCC
+ CS( I ) = CC( I )
+ 30 CONTINUE
+ LDCS = LDC
+*
+* Call the subroutine.
+*
+ IF( TRACE )
+ $ WRITE( NTRA, FMT = 9995 )NC, SNAME,
+ $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, LDB,
+ $ BETA, LDC
+ IF( REWI )
+ $ REWIND NTRA
+ CALL CGEMM3M( TRANSA, TRANSB, M, N, K, ALPHA,
+ $ AA, LDA, BB, LDB, BETA, CC, LDC )
+*
+* Check if error-exit was taken incorrectly.
+*
+ IF( .NOT.OK )THEN
+ WRITE( NOUT, FMT = 9994 )
+ FATAL = .TRUE.
+ GO TO 120
+ END IF
+*
+* See what data changed inside subroutines.
+*
+ ISAME( 1 ) = TRANSA.EQ.TRANAS
+ ISAME( 2 ) = TRANSB.EQ.TRANBS
+ ISAME( 3 ) = MS.EQ.M
+ ISAME( 4 ) = NS.EQ.N
+ ISAME( 5 ) = KS.EQ.K
+ ISAME( 6 ) = ALS.EQ.ALPHA
+ ISAME( 7 ) = LCE( AS, AA, LAA )
+ ISAME( 8 ) = LDAS.EQ.LDA
+ ISAME( 9 ) = LCE( BS, BB, LBB )
+ ISAME( 10 ) = LDBS.EQ.LDB
+ ISAME( 11 ) = BLS.EQ.BETA
+ IF( NULL )THEN
+ ISAME( 12 ) = LCE( CS, CC, LCC )
+ ELSE
+ ISAME( 12 ) = LCERES( 'GE', ' ', M, N, CS,
+ $ CC, LDC )
+ END IF
+ ISAME( 13 ) = LDCS.EQ.LDC
+*
+* If data was incorrectly changed, report
+* and return.
+*
+ SAME = .TRUE.
+ DO 40 I = 1, NARGS
+ SAME = SAME.AND.ISAME( I )
+ IF( .NOT.ISAME( I ) )
+ $ WRITE( NOUT, FMT = 9998 )I
+ 40 CONTINUE
+ IF( .NOT.SAME )THEN
+ FATAL = .TRUE.
+ GO TO 120
+ END IF
+*
+ IF( .NOT.NULL )THEN
+*
+* Check the result.
+*
+ CALL CMMCH( TRANSA, TRANSB, M, N, K,
+ $ ALPHA, A, NMAX, B, NMAX, BETA,
+ $ C, NMAX, CT, G, CC, LDC, EPS,
+ $ ERR, FATAL, NOUT, .TRUE. )
+ ERRMAX = MAX( ERRMAX, ERR )
+* If got really bad answer, report and
+* return.
+ IF( FATAL )
+ $ GO TO 120
+ END IF
+*
+ 50 CONTINUE
+*
+ 60 CONTINUE
+*
+ 70 CONTINUE
+*
+ 80 CONTINUE
+*
+ 90 CONTINUE
+*
+ 100 CONTINUE
+*
+ 110 CONTINUE
+*
+* Report result.
+*
+ IF( ERRMAX.LT.THRESH )THEN
+ WRITE( NOUT, FMT = 9999 )SNAME, NC
+ ELSE
+ WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+ END IF
+ GO TO 130
+*
+ 120 CONTINUE
+ WRITE( NOUT, FMT = 9996 )SNAME
+ WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANSA, TRANSB, M, N, K,
+ $ ALPHA, LDA, LDB, BETA, LDC
+*
+ 130 CONTINUE
+ RETURN
+*
+ 9999 FORMAT( ' ', A8, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+ $ 'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+ $ 'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A8, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+ $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+ $ ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A8, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A8, '(''', A1, ''',''', A1, ''',',
+ $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3,
+ $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' )
+ 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+ $ '******' )
+*
+* End of CCHK1.
+*
+ END
+ SUBROUTINE CCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+ $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+ $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G )
+*
+* Tests CHEMM and CSYMM.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Parameters ..
+ COMPLEX ZERO
+ PARAMETER ( ZERO = ( 0.0, 0.0 ) )
+ REAL RZERO
+ PARAMETER ( RZERO = 0.0 )
+* .. Scalar Arguments ..
+ REAL EPS, THRESH
+ INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA
+ LOGICAL FATAL, REWI, TRACE
+ CHARACTER*8 SNAME
+* .. Array Arguments ..
+ COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+ $ AS( NMAX*NMAX ), B( NMAX, NMAX ),
+ $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ),
+ $ C( NMAX, NMAX ), CC( NMAX*NMAX ),
+ $ CS( NMAX*NMAX ), CT( NMAX )
+ REAL G( NMAX )
+ INTEGER IDIM( NIDIM )
+* .. Local Scalars ..
+ COMPLEX ALPHA, ALS, BETA, BLS
+ REAL ERR, ERRMAX
+ INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC,
+ $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA,
+ $ NARGS, NC, NS
+ LOGICAL CONJ, LEFT, NULL, RESET, SAME
+ CHARACTER*1 SIDE, SIDES, UPLO, UPLOS
+ CHARACTER*2 ICHS, ICHU
+* .. Local Arrays ..
+ LOGICAL ISAME( 13 )
+* .. External Functions ..
+ LOGICAL LCE, LCERES
+ EXTERNAL LCE, LCERES
+* .. External Subroutines ..
+ EXTERNAL CHEMM, CMAKE, CMMCH, CSYMM
+* .. Intrinsic Functions ..
+ INTRINSIC MAX
+* .. Scalars in Common ..
+ INTEGER INFOT, NOUTC
+ LOGICAL LERR, OK
+* .. Common blocks ..
+ COMMON /INFOC/INFOT, NOUTC, OK, LERR
+* .. Data statements ..
+ DATA ICHS/'LR'/, ICHU/'UL'/
+* .. Executable Statements ..
+ CONJ = SNAME( 2: 3 ).EQ.'HE'
+*
+ NARGS = 12
+ NC = 0
+ RESET = .TRUE.
+ ERRMAX = RZERO
+*
+ DO 100 IM = 1, NIDIM
+ M = IDIM( IM )
+*
+ DO 90 IN = 1, NIDIM
+ N = IDIM( IN )
+* Set LDC to 1 more than minimum value if room.
+ LDC = M
+ IF( LDC.LT.NMAX )
+ $ LDC = LDC + 1
+* Skip tests if not enough room.
+ IF( LDC.GT.NMAX )
+ $ GO TO 90
+ LCC = LDC*N
+ NULL = N.LE.0.OR.M.LE.0
+* Set LDB to 1 more than minimum value if room.
+ LDB = M
+ IF( LDB.LT.NMAX )
+ $ LDB = LDB + 1
+* Skip tests if not enough room.
+ IF( LDB.GT.NMAX )
+ $ GO TO 90
+ LBB = LDB*N
+*
+* Generate the matrix B.
+*
+ CALL CMAKE( 'GE', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET,
+ $ ZERO )
+*
+ DO 80 ICS = 1, 2
+ SIDE = ICHS( ICS: ICS )
+ LEFT = SIDE.EQ.'L'
+*
+ IF( LEFT )THEN
+ NA = M
+ ELSE
+ NA = N
+ END IF
+* Set LDA to 1 more than minimum value if room.
+ LDA = NA
+ IF( LDA.LT.NMAX )
+ $ LDA = LDA + 1
+* Skip tests if not enough room.
+ IF( LDA.GT.NMAX )
+ $ GO TO 80
+ LAA = LDA*NA
+*
+ DO 70 ICU = 1, 2
+ UPLO = ICHU( ICU: ICU )
+*
+* Generate the hermitian or symmetric matrix A.
+*
+ CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', NA, NA, A, NMAX,
+ $ AA, LDA, RESET, ZERO )
+*
+ DO 60 IA = 1, NALF
+ ALPHA = ALF( IA )
+*
+ DO 50 IB = 1, NBET
+ BETA = BET( IB )
+*
+* Generate the matrix C.
+*
+ CALL CMAKE( 'GE', ' ', ' ', M, N, C, NMAX, CC,
+ $ LDC, RESET, ZERO )
+*
+ NC = NC + 1
+*
+* Save every datum before calling the
+* subroutine.
+*
+ SIDES = SIDE
+ UPLOS = UPLO
+ MS = M
+ NS = N
+ ALS = ALPHA
+ DO 10 I = 1, LAA
+ AS( I ) = AA( I )
+ 10 CONTINUE
+ LDAS = LDA
+ DO 20 I = 1, LBB
+ BS( I ) = BB( I )
+ 20 CONTINUE
+ LDBS = LDB
+ BLS = BETA
+ DO 30 I = 1, LCC
+ CS( I ) = CC( I )
+ 30 CONTINUE
+ LDCS = LDC
+*
+* Call the subroutine.
+*
+ IF( TRACE )
+ $ WRITE( NTRA, FMT = 9995 )NC, SNAME, SIDE,
+ $ UPLO, M, N, ALPHA, LDA, LDB, BETA, LDC
+ IF( REWI )
+ $ REWIND NTRA
+ IF( CONJ )THEN
+ CALL CHEMM( SIDE, UPLO, M, N, ALPHA, AA, LDA,
+ $ BB, LDB, BETA, CC, LDC )
+ ELSE
+ CALL CSYMM( SIDE, UPLO, M, N, ALPHA, AA, LDA,
+ $ BB, LDB, BETA, CC, LDC )
+ END IF
+*
+* Check if error-exit was taken incorrectly.
+*
+ IF( .NOT.OK )THEN
+ WRITE( NOUT, FMT = 9994 )
+ FATAL = .TRUE.
+ GO TO 110
+ END IF
+*
+* See what data changed inside subroutines.
+*
+ ISAME( 1 ) = SIDES.EQ.SIDE
+ ISAME( 2 ) = UPLOS.EQ.UPLO
+ ISAME( 3 ) = MS.EQ.M
+ ISAME( 4 ) = NS.EQ.N
+ ISAME( 5 ) = ALS.EQ.ALPHA
+ ISAME( 6 ) = LCE( AS, AA, LAA )
+ ISAME( 7 ) = LDAS.EQ.LDA
+ ISAME( 8 ) = LCE( BS, BB, LBB )
+ ISAME( 9 ) = LDBS.EQ.LDB
+ ISAME( 10 ) = BLS.EQ.BETA
+ IF( NULL )THEN
+ ISAME( 11 ) = LCE( CS, CC, LCC )
+ ELSE
+ ISAME( 11 ) = LCERES( 'GE', ' ', M, N, CS,
+ $ CC, LDC )
+ END IF
+ ISAME( 12 ) = LDCS.EQ.LDC
+*
+* If data was incorrectly changed, report and
+* return.
+*
+ SAME = .TRUE.
+ DO 40 I = 1, NARGS
+ SAME = SAME.AND.ISAME( I )
+ IF( .NOT.ISAME( I ) )
+ $ WRITE( NOUT, FMT = 9998 )I
+ 40 CONTINUE
+ IF( .NOT.SAME )THEN
+ FATAL = .TRUE.
+ GO TO 110
+ END IF
+*
+ IF( .NOT.NULL )THEN
+*
+* Check the result.
+*
+ IF( LEFT )THEN
+ CALL CMMCH( 'N', 'N', M, N, M, ALPHA, A,
+ $ NMAX, B, NMAX, BETA, C, NMAX,
+ $ CT, G, CC, LDC, EPS, ERR,
+ $ FATAL, NOUT, .TRUE. )
+ ELSE
+ CALL CMMCH( 'N', 'N', M, N, N, ALPHA, B,
+ $ NMAX, A, NMAX, BETA, C, NMAX,
+ $ CT, G, CC, LDC, EPS, ERR,
+ $ FATAL, NOUT, .TRUE. )
+ END IF
+ ERRMAX = MAX( ERRMAX, ERR )
+* If got really bad answer, report and
+* return.
+ IF( FATAL )
+ $ GO TO 110
+ END IF
+*
+ 50 CONTINUE
+*
+ 60 CONTINUE
+*
+ 70 CONTINUE
+*
+ 80 CONTINUE
+*
+ 90 CONTINUE
+*
+ 100 CONTINUE
+*
+* Report result.
+*
+ IF( ERRMAX.LT.THRESH )THEN
+ WRITE( NOUT, FMT = 9999 )SNAME, NC
+ ELSE
+ WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+ END IF
+ GO TO 120
+*
+ 110 CONTINUE
+ WRITE( NOUT, FMT = 9996 )SNAME
+ WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, M, N, ALPHA, LDA,
+ $ LDB, BETA, LDC
+*
+ 120 CONTINUE
+ RETURN
+*
+ 9999 FORMAT( ' ', A8, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+ $ 'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+ $ 'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A8, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+ $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+ $ ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A8, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A8, '(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+ $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1,
+ $ ',', F4.1, '), C,', I3, ') .' )
+ 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+ $ '******' )
+*
+* End of CCHK2.
+*
+ END
+ SUBROUTINE CCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+ $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS,
+ $ B, BB, BS, CT, G, C )
+*
+* Tests CTRMM and CTRSM.
+*
+* NOTE(review): both routines overwrite B in place, so the computed
+* result of a call is read back from BB; for the 'SM' case an
+* approximation to the original right-hand side is reconstructed
+* before checking with CMMCH.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Parameters ..
+ COMPLEX ZERO, ONE
+ PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) )
+ REAL RZERO
+ PARAMETER ( RZERO = 0.0 )
+* .. Scalar Arguments ..
+ REAL EPS, THRESH
+ INTEGER NALF, NIDIM, NMAX, NOUT, NTRA
+ LOGICAL FATAL, REWI, TRACE
+ CHARACTER*8 SNAME
+* .. Array Arguments ..
+ COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+ $ AS( NMAX*NMAX ), B( NMAX, NMAX ),
+ $ BB( NMAX*NMAX ), BS( NMAX*NMAX ),
+ $ C( NMAX, NMAX ), CT( NMAX )
+ REAL G( NMAX )
+ INTEGER IDIM( NIDIM )
+* .. Local Scalars ..
+ COMPLEX ALPHA, ALS
+ REAL ERR, ERRMAX
+ INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB,
+ $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC,
+ $ NS
+ LOGICAL LEFT, NULL, RESET, SAME
+ CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO,
+ $ UPLOS
+ CHARACTER*2 ICHD, ICHS, ICHU
+ CHARACTER*3 ICHT
+* .. Local Arrays ..
+ LOGICAL ISAME( 13 )
+* .. External Functions ..
+ LOGICAL LCE, LCERES
+ EXTERNAL LCE, LCERES
+* .. External Subroutines ..
+ EXTERNAL CMAKE, CMMCH, CTRMM, CTRSM
+* .. Intrinsic Functions ..
+ INTRINSIC MAX
+* .. Scalars in Common ..
+ INTEGER INFOT, NOUTC
+ LOGICAL LERR, OK
+* .. Common blocks ..
+ COMMON /INFOC/INFOT, NOUTC, OK, LERR
+* .. Data statements ..
+ DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/
+* .. Executable Statements ..
+*
+* Eleven arguments are saved before each call and later compared
+* in ISAME to detect accidental modification by the routine.
+ NARGS = 11
+ NC = 0
+ RESET = .TRUE.
+ ERRMAX = RZERO
+* Set up zero matrix for CMMCH.
+ DO 20 J = 1, NMAX
+ DO 10 I = 1, NMAX
+ C( I, J ) = ZERO
+ 10 CONTINUE
+ 20 CONTINUE
+*
+* Loop over all combinations of dimensions, SIDE, UPLO, TRANSA,
+* DIAG and ALPHA values.
+ DO 140 IM = 1, NIDIM
+ M = IDIM( IM )
+*
+ DO 130 IN = 1, NIDIM
+ N = IDIM( IN )
+* Set LDB to 1 more than minimum value if room.
+ LDB = M
+ IF( LDB.LT.NMAX )
+ $ LDB = LDB + 1
+* Skip tests if not enough room.
+ IF( LDB.GT.NMAX )
+ $ GO TO 130
+ LBB = LDB*N
+* NULL: call is a no-op, so only "arguments unchanged" is checked.
+ NULL = M.LE.0.OR.N.LE.0
+*
+ DO 120 ICS = 1, 2
+ SIDE = ICHS( ICS: ICS )
+ LEFT = SIDE.EQ.'L'
+* Order of the triangular factor A: M for SIDE='L', N for 'R'.
+ IF( LEFT )THEN
+ NA = M
+ ELSE
+ NA = N
+ END IF
+* Set LDA to 1 more than minimum value if room.
+ LDA = NA
+ IF( LDA.LT.NMAX )
+ $ LDA = LDA + 1
+* Skip tests if not enough room.
+ IF( LDA.GT.NMAX )
+ $ GO TO 130
+ LAA = LDA*NA
+*
+ DO 110 ICU = 1, 2
+ UPLO = ICHU( ICU: ICU )
+*
+ DO 100 ICT = 1, 3
+ TRANSA = ICHT( ICT: ICT )
+*
+ DO 90 ICD = 1, 2
+ DIAG = ICHD( ICD: ICD )
+*
+ DO 80 IA = 1, NALF
+ ALPHA = ALF( IA )
+*
+* Generate the matrix A.
+*
+ CALL CMAKE( 'TR', UPLO, DIAG, NA, NA, A,
+ $ NMAX, AA, LDA, RESET, ZERO )
+*
+* Generate the matrix B.
+*
+ CALL CMAKE( 'GE', ' ', ' ', M, N, B, NMAX,
+ $ BB, LDB, RESET, ZERO )
+*
+ NC = NC + 1
+*
+* Save every datum before calling the
+* subroutine.
+*
+ SIDES = SIDE
+ UPLOS = UPLO
+ TRANAS = TRANSA
+ DIAGS = DIAG
+ MS = M
+ NS = N
+ ALS = ALPHA
+ DO 30 I = 1, LAA
+ AS( I ) = AA( I )
+ 30 CONTINUE
+ LDAS = LDA
+ DO 40 I = 1, LBB
+ BS( I ) = BB( I )
+ 40 CONTINUE
+ LDBS = LDB
+*
+* Call the subroutine.
+*
+ IF( SNAME( 4: 5 ).EQ.'MM' )THEN
+ IF( TRACE )
+ $ WRITE( NTRA, FMT = 9995 )NC, SNAME,
+ $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA,
+ $ LDA, LDB
+ IF( REWI )
+ $ REWIND NTRA
+ CALL CTRMM( SIDE, UPLO, TRANSA, DIAG, M,
+ $ N, ALPHA, AA, LDA, BB, LDB )
+ ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN
+ IF( TRACE )
+ $ WRITE( NTRA, FMT = 9995 )NC, SNAME,
+ $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA,
+ $ LDA, LDB
+ IF( REWI )
+ $ REWIND NTRA
+ CALL CTRSM( SIDE, UPLO, TRANSA, DIAG, M,
+ $ N, ALPHA, AA, LDA, BB, LDB )
+ END IF
+*
+* Check if error-exit was taken incorrectly.
+*
+ IF( .NOT.OK )THEN
+ WRITE( NOUT, FMT = 9994 )
+ FATAL = .TRUE.
+ GO TO 150
+ END IF
+*
+* See what data changed inside subroutines.
+*
+ ISAME( 1 ) = SIDES.EQ.SIDE
+ ISAME( 2 ) = UPLOS.EQ.UPLO
+ ISAME( 3 ) = TRANAS.EQ.TRANSA
+ ISAME( 4 ) = DIAGS.EQ.DIAG
+ ISAME( 5 ) = MS.EQ.M
+ ISAME( 6 ) = NS.EQ.N
+ ISAME( 7 ) = ALS.EQ.ALPHA
+ ISAME( 8 ) = LCE( AS, AA, LAA )
+ ISAME( 9 ) = LDAS.EQ.LDA
+* B is input/output: for a no-op call it must be unchanged,
+* otherwise only the untouched border of BB is checked.
+ IF( NULL )THEN
+ ISAME( 10 ) = LCE( BS, BB, LBB )
+ ELSE
+ ISAME( 10 ) = LCERES( 'GE', ' ', M, N, BS,
+ $ BB, LDB )
+ END IF
+ ISAME( 11 ) = LDBS.EQ.LDB
+*
+* If data was incorrectly changed, report and
+* return.
+*
+ SAME = .TRUE.
+ DO 50 I = 1, NARGS
+ SAME = SAME.AND.ISAME( I )
+ IF( .NOT.ISAME( I ) )
+ $ WRITE( NOUT, FMT = 9998 )I
+ 50 CONTINUE
+ IF( .NOT.SAME )THEN
+ FATAL = .TRUE.
+ GO TO 150
+ END IF
+*
+ IF( .NOT.NULL )THEN
+ IF( SNAME( 4: 5 ).EQ.'MM' )THEN
+*
+* Check the result.
+*
+ IF( LEFT )THEN
+ CALL CMMCH( TRANSA, 'N', M, N, M,
+ $ ALPHA, A, NMAX, B, NMAX,
+ $ ZERO, C, NMAX, CT, G,
+ $ BB, LDB, EPS, ERR,
+ $ FATAL, NOUT, .TRUE. )
+ ELSE
+ CALL CMMCH( 'N', TRANSA, M, N, N,
+ $ ALPHA, B, NMAX, A, NMAX,
+ $ ZERO, C, NMAX, CT, G,
+ $ BB, LDB, EPS, ERR,
+ $ FATAL, NOUT, .TRUE. )
+ END IF
+ ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN
+*
+* Compute approximation to original
+* matrix.
+*
+* C receives the CTRSM solution X; BB is reloaded with
+* ALPHA*B so that CMMCH can verify op(A)*X = ALPHA*B.
+ DO 70 J = 1, N
+ DO 60 I = 1, M
+ C( I, J ) = BB( I + ( J - 1 )*
+ $ LDB )
+ BB( I + ( J - 1 )*LDB ) = ALPHA*
+ $ B( I, J )
+ 60 CONTINUE
+ 70 CONTINUE
+*
+ IF( LEFT )THEN
+ CALL CMMCH( TRANSA, 'N', M, N, M,
+ $ ONE, A, NMAX, C, NMAX,
+ $ ZERO, B, NMAX, CT, G,
+ $ BB, LDB, EPS, ERR,
+ $ FATAL, NOUT, .FALSE. )
+ ELSE
+ CALL CMMCH( 'N', TRANSA, M, N, N,
+ $ ONE, C, NMAX, A, NMAX,
+ $ ZERO, B, NMAX, CT, G,
+ $ BB, LDB, EPS, ERR,
+ $ FATAL, NOUT, .FALSE. )
+ END IF
+ END IF
+ ERRMAX = MAX( ERRMAX, ERR )
+* If got really bad answer, report and
+* return.
+ IF( FATAL )
+ $ GO TO 150
+ END IF
+*
+ 80 CONTINUE
+*
+ 90 CONTINUE
+*
+ 100 CONTINUE
+*
+ 110 CONTINUE
+*
+ 120 CONTINUE
+*
+ 130 CONTINUE
+*
+ 140 CONTINUE
+*
+* Report result.
+*
+ IF( ERRMAX.LT.THRESH )THEN
+ WRITE( NOUT, FMT = 9999 )SNAME, NC
+ ELSE
+ WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+ END IF
+ GO TO 160
+*
+ 150 CONTINUE
+ WRITE( NOUT, FMT = 9996 )SNAME
+ WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, TRANSA, DIAG, M,
+ $ N, ALPHA, LDA, LDB
+*
+ 160 CONTINUE
+ RETURN
+*
+ 9999 FORMAT( ' ', A8, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+ $ 'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+ $ 'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A8, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+ $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+ $ ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A8, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A8, '(', 4( '''', A1, ''',' ), 2( I3, ',' ),
+ $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ',
+ $ ' .' )
+ 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+ $ '******' )
+*
+* End of CCHK3.
+*
+ END
+ SUBROUTINE CCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+ $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+ $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G )
+*
+* Tests CHERK and CSYRK.
+*
+* NOTE(review): CONJ (SNAME(2:3) = 'HE') selects the CHERK path,
+* where ALPHA and BETA are real scalars (RALPHA, RBETA); otherwise
+* the CSYRK path with complex scalars is exercised.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Parameters ..
+ COMPLEX ZERO
+ PARAMETER ( ZERO = ( 0.0, 0.0 ) )
+ REAL RONE, RZERO
+ PARAMETER ( RONE = 1.0, RZERO = 0.0 )
+* .. Scalar Arguments ..
+ REAL EPS, THRESH
+ INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA
+ LOGICAL FATAL, REWI, TRACE
+ CHARACTER*8 SNAME
+* .. Array Arguments ..
+ COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+ $ AS( NMAX*NMAX ), B( NMAX, NMAX ),
+ $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ),
+ $ C( NMAX, NMAX ), CC( NMAX*NMAX ),
+ $ CS( NMAX*NMAX ), CT( NMAX )
+ REAL G( NMAX )
+ INTEGER IDIM( NIDIM )
+* .. Local Scalars ..
+ COMPLEX ALPHA, ALS, BETA, BETS
+ REAL ERR, ERRMAX, RALPHA, RALS, RBETA, RBETS
+ INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS,
+ $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA,
+ $ NARGS, NC, NS
+ LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER
+ CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS
+ CHARACTER*2 ICHT, ICHU
+* .. Local Arrays ..
+ LOGICAL ISAME( 13 )
+* .. External Functions ..
+ LOGICAL LCE, LCERES
+ EXTERNAL LCE, LCERES
+* .. External Subroutines ..
+ EXTERNAL CHERK, CMAKE, CMMCH, CSYRK
+* .. Intrinsic Functions ..
+ INTRINSIC CMPLX, MAX, REAL
+* .. Scalars in Common ..
+ INTEGER INFOT, NOUTC
+ LOGICAL LERR, OK
+* .. Common blocks ..
+ COMMON /INFOC/INFOT, NOUTC, OK, LERR
+* .. Data statements ..
+ DATA ICHT/'NC'/, ICHU/'UL'/
+* .. Executable Statements ..
+ CONJ = SNAME( 2: 3 ).EQ.'HE'
+*
+* Ten arguments are saved before each call and later compared
+* in ISAME to detect accidental modification by the routine.
+ NARGS = 10
+ NC = 0
+ RESET = .TRUE.
+ ERRMAX = RZERO
+ RALS = RONE
+ RBETS = RONE
+*
+ DO 100 IN = 1, NIDIM
+ N = IDIM( IN )
+* Set LDC to 1 more than minimum value if room.
+ LDC = N
+ IF( LDC.LT.NMAX )
+ $ LDC = LDC + 1
+* Skip tests if not enough room.
+ IF( LDC.GT.NMAX )
+ $ GO TO 100
+ LCC = LDC*N
+*
+ DO 90 IK = 1, NIDIM
+ K = IDIM( IK )
+*
+ DO 80 ICT = 1, 2
+ TRANS = ICHT( ICT: ICT )
+ TRAN = TRANS.EQ.'C'
+* CSYRK has no conjugate-transpose case: map 'C' to 'T'.
+ IF( TRAN.AND..NOT.CONJ )
+ $ TRANS = 'T'
+ IF( TRAN )THEN
+ MA = K
+ NA = N
+ ELSE
+ MA = N
+ NA = K
+ END IF
+* Set LDA to 1 more than minimum value if room.
+ LDA = MA
+ IF( LDA.LT.NMAX )
+ $ LDA = LDA + 1
+* Skip tests if not enough room.
+ IF( LDA.GT.NMAX )
+ $ GO TO 80
+ LAA = LDA*NA
+*
+* Generate the matrix A.
+*
+ CALL CMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA,
+ $ RESET, ZERO )
+*
+ DO 70 ICU = 1, 2
+ UPLO = ICHU( ICU: ICU )
+ UPPER = UPLO.EQ.'U'
+*
+ DO 60 IA = 1, NALF
+ ALPHA = ALF( IA )
+* CHERK takes a real ALPHA; keep the complex copy in step.
+ IF( CONJ )THEN
+ RALPHA = REAL( ALPHA )
+ ALPHA = CMPLX( RALPHA, RZERO )
+ END IF
+*
+ DO 50 IB = 1, NBET
+ BETA = BET( IB )
+ IF( CONJ )THEN
+ RBETA = REAL( BETA )
+ BETA = CMPLX( RBETA, RZERO )
+ END IF
+* NULL: call is a no-op, so only "arguments unchanged"
+* is checked.
+ NULL = N.LE.0
+ IF( CONJ )
+ $ NULL = NULL.OR.( ( K.LE.0.OR.RALPHA.EQ.
+ $ RZERO ).AND.RBETA.EQ.RONE )
+*
+* Generate the matrix C.
+*
+ CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, C,
+ $ NMAX, CC, LDC, RESET, ZERO )
+*
+ NC = NC + 1
+*
+* Save every datum before calling the subroutine.
+*
+ UPLOS = UPLO
+ TRANSS = TRANS
+ NS = N
+ KS = K
+ IF( CONJ )THEN
+ RALS = RALPHA
+ ELSE
+ ALS = ALPHA
+ END IF
+ DO 10 I = 1, LAA
+ AS( I ) = AA( I )
+ 10 CONTINUE
+ LDAS = LDA
+ IF( CONJ )THEN
+ RBETS = RBETA
+ ELSE
+ BETS = BETA
+ END IF
+ DO 20 I = 1, LCC
+ CS( I ) = CC( I )
+ 20 CONTINUE
+ LDCS = LDC
+*
+* Call the subroutine.
+*
+ IF( CONJ )THEN
+ IF( TRACE )
+ $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO,
+ $ TRANS, N, K, RALPHA, LDA, RBETA, LDC
+ IF( REWI )
+ $ REWIND NTRA
+ CALL CHERK( UPLO, TRANS, N, K, RALPHA, AA,
+ $ LDA, RBETA, CC, LDC )
+ ELSE
+ IF( TRACE )
+ $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO,
+ $ TRANS, N, K, ALPHA, LDA, BETA, LDC
+ IF( REWI )
+ $ REWIND NTRA
+ CALL CSYRK( UPLO, TRANS, N, K, ALPHA, AA,
+ $ LDA, BETA, CC, LDC )
+ END IF
+*
+* Check if error-exit was taken incorrectly.
+*
+ IF( .NOT.OK )THEN
+ WRITE( NOUT, FMT = 9992 )
+ FATAL = .TRUE.
+ GO TO 120
+ END IF
+*
+* See what data changed inside subroutines.
+*
+ ISAME( 1 ) = UPLOS.EQ.UPLO
+ ISAME( 2 ) = TRANSS.EQ.TRANS
+ ISAME( 3 ) = NS.EQ.N
+ ISAME( 4 ) = KS.EQ.K
+ IF( CONJ )THEN
+ ISAME( 5 ) = RALS.EQ.RALPHA
+ ELSE
+ ISAME( 5 ) = ALS.EQ.ALPHA
+ END IF
+ ISAME( 6 ) = LCE( AS, AA, LAA )
+ ISAME( 7 ) = LDAS.EQ.LDA
+ IF( CONJ )THEN
+ ISAME( 8 ) = RBETS.EQ.RBETA
+ ELSE
+ ISAME( 8 ) = BETS.EQ.BETA
+ END IF
+* C is input/output: for a no-op call it must be unchanged,
+* otherwise only the opposite triangle of CC is checked.
+ IF( NULL )THEN
+ ISAME( 9 ) = LCE( CS, CC, LCC )
+ ELSE
+ ISAME( 9 ) = LCERES( SNAME( 2: 3 ), UPLO, N,
+ $ N, CS, CC, LDC )
+ END IF
+ ISAME( 10 ) = LDCS.EQ.LDC
+*
+* If data was incorrectly changed, report and
+* return.
+*
+ SAME = .TRUE.
+ DO 30 I = 1, NARGS
+ SAME = SAME.AND.ISAME( I )
+ IF( .NOT.ISAME( I ) )
+ $ WRITE( NOUT, FMT = 9998 )I
+ 30 CONTINUE
+ IF( .NOT.SAME )THEN
+ FATAL = .TRUE.
+ GO TO 120
+ END IF
+*
+ IF( .NOT.NULL )THEN
+*
+* Check the result column by column.
+*
+ IF( CONJ )THEN
+ TRANST = 'C'
+ ELSE
+ TRANST = 'T'
+ END IF
+* JC tracks the start of column J's stored triangle in CC;
+* JJ/LJ select the rows of the triangle held in column J.
+ JC = 1
+ DO 40 J = 1, N
+ IF( UPPER )THEN
+ JJ = 1
+ LJ = J
+ ELSE
+ JJ = J
+ LJ = N - J + 1
+ END IF
+ IF( TRAN )THEN
+ CALL CMMCH( TRANST, 'N', LJ, 1, K,
+ $ ALPHA, A( 1, JJ ), NMAX,
+ $ A( 1, J ), NMAX, BETA,
+ $ C( JJ, J ), NMAX, CT, G,
+ $ CC( JC ), LDC, EPS, ERR,
+ $ FATAL, NOUT, .TRUE. )
+ ELSE
+ CALL CMMCH( 'N', TRANST, LJ, 1, K,
+ $ ALPHA, A( JJ, 1 ), NMAX,
+ $ A( J, 1 ), NMAX, BETA,
+ $ C( JJ, J ), NMAX, CT, G,
+ $ CC( JC ), LDC, EPS, ERR,
+ $ FATAL, NOUT, .TRUE. )
+ END IF
+ IF( UPPER )THEN
+ JC = JC + LDC
+ ELSE
+ JC = JC + LDC + 1
+ END IF
+ ERRMAX = MAX( ERRMAX, ERR )
+* If got really bad answer, report and
+* return.
+ IF( FATAL )
+ $ GO TO 110
+ 40 CONTINUE
+ END IF
+*
+ 50 CONTINUE
+*
+ 60 CONTINUE
+*
+ 70 CONTINUE
+*
+ 80 CONTINUE
+*
+ 90 CONTINUE
+*
+ 100 CONTINUE
+*
+* Report result.
+*
+ IF( ERRMAX.LT.THRESH )THEN
+ WRITE( NOUT, FMT = 9999 )SNAME, NC
+ ELSE
+ WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+ END IF
+ GO TO 130
+*
+ 110 CONTINUE
+ IF( N.GT.1 )
+ $ WRITE( NOUT, FMT = 9995 )J
+*
+ 120 CONTINUE
+ WRITE( NOUT, FMT = 9996 )SNAME
+ IF( CONJ )THEN
+ WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, RALPHA,
+ $ LDA, RBETA, LDC
+ ELSE
+ WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, N, K, ALPHA,
+ $ LDA, BETA, LDC
+ END IF
+*
+ 130 CONTINUE
+ RETURN
+*
+ 9999 FORMAT( ' ', A8, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+ $ 'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+ $ 'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A8, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+ $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+ $ ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A8, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 )
+ 9994 FORMAT( 1X, I6, ': ', A8, '(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+ $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ',
+ $ ' .' )
+ 9993 FORMAT( 1X, I6, ': ', A8, '(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+ $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1,
+ $ '), C,', I3, ') .' )
+ 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+ $ '******' )
+*
+* End of CCHK4.
+*
+ END
+ SUBROUTINE CCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+ $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+ $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W )
+*
+* Tests CHER2K and CSYR2K.
+*
+* NOTE(review): AB holds both operands. For TRANS = 'N' matrix B
+* starts K columns after A (offset K*NMAX); for the transposed
+* case B starts K rows below A (offset K) and AB is addressed with
+* leading dimension 2*NMAX. W is 2*NMAX workspace for the
+* column-by-column result check.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Parameters ..
+ COMPLEX ZERO, ONE
+ PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) )
+ REAL RONE, RZERO
+ PARAMETER ( RONE = 1.0, RZERO = 0.0 )
+* .. Scalar Arguments ..
+ REAL EPS, THRESH
+ INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA
+ LOGICAL FATAL, REWI, TRACE
+ CHARACTER*8 SNAME
+* .. Array Arguments ..
+ COMPLEX AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ),
+ $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ),
+ $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ),
+ $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ),
+ $ W( 2*NMAX )
+ REAL G( NMAX )
+ INTEGER IDIM( NIDIM )
+* .. Local Scalars ..
+ COMPLEX ALPHA, ALS, BETA, BETS
+ REAL ERR, ERRMAX, RBETA, RBETS
+ INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB,
+ $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS,
+ $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS
+ LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER
+ CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS
+ CHARACTER*2 ICHT, ICHU
+* .. Local Arrays ..
+ LOGICAL ISAME( 13 )
+* .. External Functions ..
+ LOGICAL LCE, LCERES
+ EXTERNAL LCE, LCERES
+* .. External Subroutines ..
+ EXTERNAL CHER2K, CMAKE, CMMCH, CSYR2K
+* .. Intrinsic Functions ..
+ INTRINSIC CMPLX, CONJG, MAX, REAL
+* .. Scalars in Common ..
+ INTEGER INFOT, NOUTC
+ LOGICAL LERR, OK
+* .. Common blocks ..
+ COMMON /INFOC/INFOT, NOUTC, OK, LERR
+* .. Data statements ..
+ DATA ICHT/'NC'/, ICHU/'UL'/
+* .. Executable Statements ..
+* CONJ selects CHER2K (real BETA) versus CSYR2K (complex BETA).
+ CONJ = SNAME( 2: 3 ).EQ.'HE'
+*
+* Twelve arguments are saved before each call and later compared
+* in ISAME to detect accidental modification by the routine.
+ NARGS = 12
+ NC = 0
+ RESET = .TRUE.
+ ERRMAX = RZERO
+*
+ DO 130 IN = 1, NIDIM
+ N = IDIM( IN )
+* Set LDC to 1 more than minimum value if room.
+ LDC = N
+ IF( LDC.LT.NMAX )
+ $ LDC = LDC + 1
+* Skip tests if not enough room.
+ IF( LDC.GT.NMAX )
+ $ GO TO 130
+ LCC = LDC*N
+*
+ DO 120 IK = 1, NIDIM
+ K = IDIM( IK )
+*
+ DO 110 ICT = 1, 2
+ TRANS = ICHT( ICT: ICT )
+ TRAN = TRANS.EQ.'C'
+* CSYR2K has no conjugate-transpose case: map 'C' to 'T'.
+ IF( TRAN.AND..NOT.CONJ )
+ $ TRANS = 'T'
+ IF( TRAN )THEN
+ MA = K
+ NA = N
+ ELSE
+ MA = N
+ NA = K
+ END IF
+* Set LDA to 1 more than minimum value if room.
+ LDA = MA
+ IF( LDA.LT.NMAX )
+ $ LDA = LDA + 1
+* Skip tests if not enough room.
+ IF( LDA.GT.NMAX )
+ $ GO TO 110
+ LAA = LDA*NA
+*
+* Generate the matrix A.
+*
+ IF( TRAN )THEN
+ CALL CMAKE( 'GE', ' ', ' ', MA, NA, AB, 2*NMAX, AA,
+ $ LDA, RESET, ZERO )
+ ELSE
+ CALL CMAKE( 'GE', ' ', ' ', MA, NA, AB, NMAX, AA, LDA,
+ $ RESET, ZERO )
+ END IF
+*
+* Generate the matrix B.
+*
+ LDB = LDA
+ LBB = LAA
+ IF( TRAN )THEN
+ CALL CMAKE( 'GE', ' ', ' ', MA, NA, AB( K + 1 ),
+ $ 2*NMAX, BB, LDB, RESET, ZERO )
+ ELSE
+ CALL CMAKE( 'GE', ' ', ' ', MA, NA, AB( K*NMAX + 1 ),
+ $ NMAX, BB, LDB, RESET, ZERO )
+ END IF
+*
+ DO 100 ICU = 1, 2
+ UPLO = ICHU( ICU: ICU )
+ UPPER = UPLO.EQ.'U'
+*
+ DO 90 IA = 1, NALF
+ ALPHA = ALF( IA )
+*
+ DO 80 IB = 1, NBET
+ BETA = BET( IB )
+* CHER2K takes a real BETA; keep the complex copy in step.
+ IF( CONJ )THEN
+ RBETA = REAL( BETA )
+ BETA = CMPLX( RBETA, RZERO )
+ END IF
+* NULL: call is a no-op, so only "arguments unchanged"
+* is checked.
+ NULL = N.LE.0
+ IF( CONJ )
+ $ NULL = NULL.OR.( ( K.LE.0.OR.ALPHA.EQ.
+ $ ZERO ).AND.RBETA.EQ.RONE )
+*
+* Generate the matrix C.
+*
+ CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, C,
+ $ NMAX, CC, LDC, RESET, ZERO )
+*
+ NC = NC + 1
+*
+* Save every datum before calling the subroutine.
+*
+ UPLOS = UPLO
+ TRANSS = TRANS
+ NS = N
+ KS = K
+ ALS = ALPHA
+ DO 10 I = 1, LAA
+ AS( I ) = AA( I )
+ 10 CONTINUE
+ LDAS = LDA
+ DO 20 I = 1, LBB
+ BS( I ) = BB( I )
+ 20 CONTINUE
+ LDBS = LDB
+ IF( CONJ )THEN
+ RBETS = RBETA
+ ELSE
+ BETS = BETA
+ END IF
+ DO 30 I = 1, LCC
+ CS( I ) = CC( I )
+ 30 CONTINUE
+ LDCS = LDC
+*
+* Call the subroutine.
+*
+ IF( CONJ )THEN
+ IF( TRACE )
+ $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO,
+ $ TRANS, N, K, ALPHA, LDA, LDB, RBETA, LDC
+ IF( REWI )
+ $ REWIND NTRA
+ CALL CHER2K( UPLO, TRANS, N, K, ALPHA, AA,
+ $ LDA, BB, LDB, RBETA, CC, LDC )
+ ELSE
+ IF( TRACE )
+ $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO,
+ $ TRANS, N, K, ALPHA, LDA, LDB, BETA, LDC
+ IF( REWI )
+ $ REWIND NTRA
+ CALL CSYR2K( UPLO, TRANS, N, K, ALPHA, AA,
+ $ LDA, BB, LDB, BETA, CC, LDC )
+ END IF
+*
+* Check if error-exit was taken incorrectly.
+*
+ IF( .NOT.OK )THEN
+ WRITE( NOUT, FMT = 9992 )
+ FATAL = .TRUE.
+ GO TO 150
+ END IF
+*
+* See what data changed inside subroutines.
+*
+ ISAME( 1 ) = UPLOS.EQ.UPLO
+ ISAME( 2 ) = TRANSS.EQ.TRANS
+ ISAME( 3 ) = NS.EQ.N
+ ISAME( 4 ) = KS.EQ.K
+ ISAME( 5 ) = ALS.EQ.ALPHA
+ ISAME( 6 ) = LCE( AS, AA, LAA )
+ ISAME( 7 ) = LDAS.EQ.LDA
+ ISAME( 8 ) = LCE( BS, BB, LBB )
+ ISAME( 9 ) = LDBS.EQ.LDB
+ IF( CONJ )THEN
+ ISAME( 10 ) = RBETS.EQ.RBETA
+ ELSE
+ ISAME( 10 ) = BETS.EQ.BETA
+ END IF
+* C is input/output: for a no-op call it must be unchanged,
+* otherwise only the opposite triangle of CC is checked.
+ IF( NULL )THEN
+ ISAME( 11 ) = LCE( CS, CC, LCC )
+ ELSE
+ ISAME( 11 ) = LCERES( 'HE', UPLO, N, N, CS,
+ $ CC, LDC )
+ END IF
+ ISAME( 12 ) = LDCS.EQ.LDC
+*
+* If data was incorrectly changed, report and
+* return.
+*
+ SAME = .TRUE.
+ DO 40 I = 1, NARGS
+ SAME = SAME.AND.ISAME( I )
+ IF( .NOT.ISAME( I ) )
+ $ WRITE( NOUT, FMT = 9998 )I
+ 40 CONTINUE
+ IF( .NOT.SAME )THEN
+ FATAL = .TRUE.
+ GO TO 150
+ END IF
+*
+ IF( .NOT.NULL )THEN
+*
+* Check the result column by column.
+*
+ IF( CONJ )THEN
+ TRANST = 'C'
+ ELSE
+ TRANST = 'T'
+ END IF
+* W is filled so that one CMMCH call per column covers
+* both rank-K contributions (alpha*A*B^H + conj(alpha)*
+* B*A^H, or the symmetric analogue).
+ JJAB = 1
+ JC = 1
+ DO 70 J = 1, N
+ IF( UPPER )THEN
+ JJ = 1
+ LJ = J
+ ELSE
+ JJ = J
+ LJ = N - J + 1
+ END IF
+ IF( TRAN )THEN
+ DO 50 I = 1, K
+ W( I ) = ALPHA*AB( ( J - 1 )*2*
+ $ NMAX + K + I )
+ IF( CONJ )THEN
+ W( K + I ) = CONJG( ALPHA )*
+ $ AB( ( J - 1 )*2*
+ $ NMAX + I )
+ ELSE
+ W( K + I ) = ALPHA*
+ $ AB( ( J - 1 )*2*
+ $ NMAX + I )
+ END IF
+ 50 CONTINUE
+ CALL CMMCH( TRANST, 'N', LJ, 1, 2*K,
+ $ ONE, AB( JJAB ), 2*NMAX, W,
+ $ 2*NMAX, BETA, C( JJ, J ),
+ $ NMAX, CT, G, CC( JC ), LDC,
+ $ EPS, ERR, FATAL, NOUT,
+ $ .TRUE. )
+ ELSE
+ DO 60 I = 1, K
+ IF( CONJ )THEN
+ W( I ) = ALPHA*CONJG( AB( ( K +
+ $ I - 1 )*NMAX + J ) )
+ W( K + I ) = CONJG( ALPHA*
+ $ AB( ( I - 1 )*NMAX +
+ $ J ) )
+ ELSE
+ W( I ) = ALPHA*AB( ( K + I - 1 )*
+ $ NMAX + J )
+ W( K + I ) = ALPHA*
+ $ AB( ( I - 1 )*NMAX +
+ $ J )
+ END IF
+ 60 CONTINUE
+ CALL CMMCH( 'N', 'N', LJ, 1, 2*K, ONE,
+ $ AB( JJ ), NMAX, W, 2*NMAX,
+ $ BETA, C( JJ, J ), NMAX, CT,
+ $ G, CC( JC ), LDC, EPS, ERR,
+ $ FATAL, NOUT, .TRUE. )
+ END IF
+ IF( UPPER )THEN
+ JC = JC + LDC
+ ELSE
+ JC = JC + LDC + 1
+ IF( TRAN )
+ $ JJAB = JJAB + 2*NMAX
+ END IF
+ ERRMAX = MAX( ERRMAX, ERR )
+* If got really bad answer, report and
+* return.
+ IF( FATAL )
+ $ GO TO 140
+ 70 CONTINUE
+ END IF
+*
+ 80 CONTINUE
+*
+ 90 CONTINUE
+*
+ 100 CONTINUE
+*
+ 110 CONTINUE
+*
+ 120 CONTINUE
+*
+ 130 CONTINUE
+*
+* Report result.
+*
+ IF( ERRMAX.LT.THRESH )THEN
+ WRITE( NOUT, FMT = 9999 )SNAME, NC
+ ELSE
+ WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+ END IF
+ GO TO 160
+*
+ 140 CONTINUE
+ IF( N.GT.1 )
+ $ WRITE( NOUT, FMT = 9995 )J
+*
+ 150 CONTINUE
+ WRITE( NOUT, FMT = 9996 )SNAME
+ IF( CONJ )THEN
+ WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, ALPHA,
+ $ LDA, LDB, RBETA, LDC
+ ELSE
+ WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, N, K, ALPHA,
+ $ LDA, LDB, BETA, LDC
+ END IF
+*
+ 160 CONTINUE
+ RETURN
+*
+ 9999 FORMAT( ' ', A8, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+ $ 'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+ $ 'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A8, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+ $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+ $ ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A8, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 )
+ 9994 FORMAT( 1X, I6, ': ', A8, '(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+ $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1,
+ $ ', C,', I3, ') .' )
+ 9993 FORMAT( 1X, I6, ': ', A8, '(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+ $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1,
+ $ ',', F4.1, '), C,', I3, ') .' )
+ 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+ $ '******' )
+*
+* End of CCHK5.
+*
+ END
+ SUBROUTINE CCHKE( ISNUM, SRNAMT, NOUT )
+*
+* Tests the error exits from the Level 3 Blas.
+* Requires a special version of the error-handling routine XERBLA.
+* ALPHA, RALPHA, BETA, RBETA, A, B and C should not need to be defined.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Scalar Arguments ..
+ INTEGER ISNUM, NOUT
+ CHARACTER*8 SRNAMT
+* .. Scalars in Common ..
+ INTEGER INFOT, NOUTC
+ LOGICAL LERR, OK
+* .. Local Scalars ..
+ COMPLEX ALPHA, BETA
+ REAL RALPHA, RBETA
+* .. Local Arrays ..
+ COMPLEX A( 2, 1 ), B( 2, 1 ), C( 2, 1 )
+* .. External Subroutines ..
+ EXTERNAL CGEMM3M, CHEMM, CHER2K, CHERK, CHKXER, CSYMM,
+ $ CSYR2K, CSYRK, CTRMM, CTRSM
+* .. Common blocks ..
+ COMMON /INFOC/INFOT, NOUTC, OK, LERR
+* .. Executable Statements ..
+* OK is set to .FALSE. by the special version of XERBLA or by CHKXER
+* if anything is wrong.
+ OK = .TRUE.
+* LERR is set to .TRUE. by the special version of XERBLA each time
+* it is called, and is then tested and re-set by CHKXER.
+ LERR = .FALSE.
+ GO TO ( 10, 20, 30, 40, 50, 60, 70, 80,
+ $ 90 )ISNUM
+ 10 INFOT = 1
+ CALL CGEMM3M( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 1
+ CALL CGEMM3M( '/', 'C', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 1
+ CALL CGEMM3M( '/', 'T', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 2
+ CALL CGEMM3M( 'N', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 2
+ CALL CGEMM3M( 'C', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 2
+ CALL CGEMM3M( 'T', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CGEMM3M( 'N', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CGEMM3M( 'N', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CGEMM3M( 'N', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CGEMM3M( 'C', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CGEMM3M( 'C', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CGEMM3M( 'C', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CGEMM3M( 'T', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CGEMM3M( 'T', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CGEMM3M( 'T', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CGEMM3M( 'N', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CGEMM3M( 'N', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CGEMM3M( 'N', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CGEMM3M( 'C', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CGEMM3M( 'C', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CGEMM3M( 'C', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CGEMM3M( 'T', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CGEMM3M( 'T', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CGEMM3M( 'T', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CGEMM3M( 'N', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CGEMM3M( 'N', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CGEMM3M( 'N', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CGEMM3M( 'C', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CGEMM3M( 'C', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CGEMM3M( 'C', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CGEMM3M( 'T', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CGEMM3M( 'T', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CGEMM3M( 'T', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 8
+ CALL CGEMM3M( 'N', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 8
+ CALL CGEMM3M( 'N', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 8
+ CALL CGEMM3M( 'N', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 8
+ CALL CGEMM3M( 'C', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 8
+ CALL CGEMM3M( 'C', 'C', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 8
+ CALL CGEMM3M( 'C', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 8
+ CALL CGEMM3M( 'T', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 8
+ CALL CGEMM3M( 'T', 'C', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 8
+ CALL CGEMM3M( 'T', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL CGEMM3M( 'N', 'N', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL CGEMM3M( 'C', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL CGEMM3M( 'T', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL CGEMM3M( 'N', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL CGEMM3M( 'C', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL CGEMM3M( 'T', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL CGEMM3M( 'N', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL CGEMM3M( 'C', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL CGEMM3M( 'T', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 13
+ CALL CGEMM3M( 'N', 'N', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 13
+ CALL CGEMM3M( 'N', 'C', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 13
+ CALL CGEMM3M( 'N', 'T', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 13
+ CALL CGEMM3M( 'C', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 13
+ CALL CGEMM3M( 'C', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 13
+ CALL CGEMM3M( 'C', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 13
+ CALL CGEMM3M( 'T', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 13
+ CALL CGEMM3M( 'T', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 13
+ CALL CGEMM3M( 'T', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ GO TO 100
+ 20 INFOT = 1
+ CALL CHEMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 2
+ CALL CHEMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CHEMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CHEMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CHEMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CHEMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CHEMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CHEMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CHEMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CHEMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL CHEMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL CHEMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL CHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL CHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ GO TO 100
+ 30 INFOT = 1
+ CALL CSYMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 2
+ CALL CSYMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CSYMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CSYMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CSYMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CSYMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CSYMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CSYMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CSYMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CSYMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL CSYMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL CSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL CSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL CSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ GO TO 100
+ 40 INFOT = 1
+ CALL CTRMM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 2
+ CALL CTRMM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CTRMM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CTRMM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CTRMM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CTRMM( 'L', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CTRMM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CTRMM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CTRMM( 'R', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CTRMM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CTRMM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CTRMM( 'L', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CTRMM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CTRMM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CTRMM( 'R', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CTRMM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL CTRMM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL CTRMM( 'L', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL CTRMM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL CTRMM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL CTRMM( 'R', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL CTRMM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL CTRMM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL CTRMM( 'L', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL CTRMM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL CTRMM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL CTRMM( 'R', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL CTRMM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CTRMM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CTRMM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CTRMM( 'R', 'U', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CTRMM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CTRMM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CTRMM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CTRMM( 'R', 'L', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CTRMM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL CTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL CTRMM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL CTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL CTRMM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL CTRMM( 'R', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL CTRMM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL CTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL CTRMM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL CTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL CTRMM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL CTRMM( 'R', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL CTRMM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ GO TO 100
+ 50 INFOT = 1
+ CALL CTRSM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 2
+ CALL CTRSM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CTRSM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CTRSM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CTRSM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CTRSM( 'L', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CTRSM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CTRSM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CTRSM( 'R', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CTRSM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CTRSM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CTRSM( 'L', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CTRSM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CTRSM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CTRSM( 'R', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL CTRSM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL CTRSM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL CTRSM( 'L', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL CTRSM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL CTRSM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL CTRSM( 'R', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL CTRSM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL CTRSM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL CTRSM( 'L', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL CTRSM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL CTRSM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL CTRSM( 'R', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL CTRSM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CTRSM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CTRSM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CTRSM( 'R', 'U', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CTRSM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CTRSM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CTRSM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CTRSM( 'R', 'L', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CTRSM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL CTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL CTRSM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL CTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL CTRSM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL CTRSM( 'R', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL CTRSM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL CTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL CTRSM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL CTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL CTRSM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL CTRSM( 'R', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL CTRSM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ GO TO 100
+ 60 INFOT = 1
+ CALL CHERK( '/', 'N', 0, 0, RALPHA, A, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 2
+ CALL CHERK( 'U', 'T', 0, 0, RALPHA, A, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CHERK( 'U', 'N', -1, 0, RALPHA, A, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CHERK( 'U', 'C', -1, 0, RALPHA, A, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CHERK( 'L', 'N', -1, 0, RALPHA, A, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CHERK( 'L', 'C', -1, 0, RALPHA, A, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CHERK( 'U', 'N', 0, -1, RALPHA, A, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CHERK( 'U', 'C', 0, -1, RALPHA, A, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CHERK( 'L', 'N', 0, -1, RALPHA, A, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CHERK( 'L', 'C', 0, -1, RALPHA, A, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL CHERK( 'U', 'N', 2, 0, RALPHA, A, 1, RBETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL CHERK( 'U', 'C', 0, 2, RALPHA, A, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL CHERK( 'L', 'N', 2, 0, RALPHA, A, 1, RBETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL CHERK( 'L', 'C', 0, 2, RALPHA, A, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL CHERK( 'U', 'N', 2, 0, RALPHA, A, 2, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL CHERK( 'U', 'C', 2, 0, RALPHA, A, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL CHERK( 'L', 'N', 2, 0, RALPHA, A, 2, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL CHERK( 'L', 'C', 2, 0, RALPHA, A, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ GO TO 100
+ 70 INFOT = 1
+ CALL CSYRK( '/', 'N', 0, 0, ALPHA, A, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 2
+ CALL CSYRK( 'U', 'C', 0, 0, ALPHA, A, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CSYRK( 'U', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CSYRK( 'U', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CSYRK( 'L', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CSYRK( 'L', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CSYRK( 'U', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CSYRK( 'U', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CSYRK( 'L', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CSYRK( 'L', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL CSYRK( 'U', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL CSYRK( 'U', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL CSYRK( 'L', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL CSYRK( 'L', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL CSYRK( 'U', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL CSYRK( 'U', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL CSYRK( 'L', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL CSYRK( 'L', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ GO TO 100
+ 80 INFOT = 1
+ CALL CHER2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 2
+ CALL CHER2K( 'U', 'T', 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CHER2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CHER2K( 'U', 'C', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CHER2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CHER2K( 'L', 'C', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CHER2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CHER2K( 'U', 'C', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CHER2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CHER2K( 'L', 'C', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL CHER2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL CHER2K( 'U', 'C', 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL CHER2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL CHER2K( 'L', 'C', 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CHER2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CHER2K( 'U', 'C', 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CHER2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CHER2K( 'L', 'C', 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL CHER2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL CHER2K( 'U', 'C', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL CHER2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL CHER2K( 'L', 'C', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ GO TO 100
+ 90 INFOT = 1
+ CALL CSYR2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 2
+ CALL CSYR2K( 'U', 'C', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CSYR2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CSYR2K( 'U', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CSYR2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL CSYR2K( 'L', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CSYR2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CSYR2K( 'U', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CSYR2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL CSYR2K( 'L', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL CSYR2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL CSYR2K( 'U', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL CSYR2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL CSYR2K( 'L', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CSYR2K( 'U', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL CSYR2K( 'L', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL CSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL CSYR2K( 'U', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL CSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL CSYR2K( 'L', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+*
+ 100 IF( OK )THEN
+ WRITE( NOUT, FMT = 9999 )SRNAMT
+ ELSE
+ WRITE( NOUT, FMT = 9998 )SRNAMT
+ END IF
+ RETURN
+*
+ 9999 FORMAT( ' ', A8, ' PASSED THE TESTS OF ERROR-EXITS' )
+ 9998 FORMAT( ' ******* ', A8, ' FAILED THE TESTS OF ERROR-EXITS *****',
+ $ '**' )
+*
+* End of CCHKE.
+*
+ END
+ SUBROUTINE CMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET,
+ $ TRANSL )
+*
+* Generates values for an M by N matrix A.
+* Stores the values in the array AA in the data structure required
+* by the routine, with unwanted elements set to rogue value.
+*
+* TYPE is 'GE', 'HE', 'SY' or 'TR'.
+*
+* On entry, RESET and TRANSL control the pseudo-random generator
+* CBEG; AA is the packed copy with leading dimension LDA, padded
+* with ROGUE so that any out-of-triangle access is detectable.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Parameters ..
+ COMPLEX ZERO, ONE
+ PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) )
+ COMPLEX ROGUE
+ PARAMETER ( ROGUE = ( -1.0E10, 1.0E10 ) )
+ REAL RZERO
+ PARAMETER ( RZERO = 0.0 )
+ REAL RROGUE
+ PARAMETER ( RROGUE = -1.0E10 )
+* .. Scalar Arguments ..
+ COMPLEX TRANSL
+ INTEGER LDA, M, N, NMAX
+ LOGICAL RESET
+ CHARACTER*1 DIAG, UPLO
+ CHARACTER*2 TYPE
+* .. Array Arguments ..
+ COMPLEX A( NMAX, * ), AA( * )
+* .. Local Scalars ..
+ INTEGER I, IBEG, IEND, J, JJ
+ LOGICAL GEN, HER, LOWER, SYM, TRI, UNIT, UPPER
+* .. External Functions ..
+ COMPLEX CBEG
+ EXTERNAL CBEG
+* .. Intrinsic Functions ..
+ INTRINSIC CMPLX, CONJG, REAL
+* .. Executable Statements ..
+*
+* Classify the requested matrix type and which triangle is stored.
+*
+ GEN = TYPE.EQ.'GE'
+ HER = TYPE.EQ.'HE'
+ SYM = TYPE.EQ.'SY'
+ TRI = TYPE.EQ.'TR'
+ UPPER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'U'
+ LOWER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'L'
+ UNIT = TRI.AND.DIAG.EQ.'U'
+*
+* Generate data in array A.
+*
+ DO 20 J = 1, N
+ DO 10 I = 1, M
+ IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) )
+ $ THEN
+ A( I, J ) = CBEG( RESET ) + TRANSL
+ IF( I.NE.J )THEN
+* Set some elements to zero
+ IF( N.GT.3.AND.J.EQ.N/2 )
+ $ A( I, J ) = ZERO
+* Mirror the off-diagonal element into the opposite
+* triangle according to the matrix class.
+ IF( HER )THEN
+ A( J, I ) = CONJG( A( I, J ) )
+ ELSE IF( SYM )THEN
+ A( J, I ) = A( I, J )
+ ELSE IF( TRI )THEN
+ A( J, I ) = ZERO
+ END IF
+ END IF
+ END IF
+ 10 CONTINUE
+* Hermitian matrices must have a real diagonal.
+ IF( HER )
+ $ A( J, J ) = CMPLX( REAL( A( J, J ) ), RZERO )
+* Shift the diagonal of triangular matrices away from zero.
+ IF( TRI )
+ $ A( J, J ) = A( J, J ) + ONE
+ IF( UNIT )
+ $ A( J, J ) = ONE
+ 20 CONTINUE
+*
+* Store elements in array AS in data structure required by routine.
+*
+ IF( TYPE.EQ.'GE' )THEN
+ DO 50 J = 1, N
+ DO 30 I = 1, M
+ AA( I + ( J - 1 )*LDA ) = A( I, J )
+ 30 CONTINUE
+* Pad the unused tail of each column with rogue values.
+ DO 40 I = M + 1, LDA
+ AA( I + ( J - 1 )*LDA ) = ROGUE
+ 40 CONTINUE
+ 50 CONTINUE
+ ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN
+ DO 90 J = 1, N
+* Determine the referenced range [IBEG,IEND] of column J;
+* a unit-diagonal triangle excludes the diagonal itself.
+ IF( UPPER )THEN
+ IBEG = 1
+ IF( UNIT )THEN
+ IEND = J - 1
+ ELSE
+ IEND = J
+ END IF
+ ELSE
+ IF( UNIT )THEN
+ IBEG = J + 1
+ ELSE
+ IBEG = J
+ END IF
+ IEND = N
+ END IF
+ DO 60 I = 1, IBEG - 1
+ AA( I + ( J - 1 )*LDA ) = ROGUE
+ 60 CONTINUE
+ DO 70 I = IBEG, IEND
+ AA( I + ( J - 1 )*LDA ) = A( I, J )
+ 70 CONTINUE
+ DO 80 I = IEND + 1, LDA
+ AA( I + ( J - 1 )*LDA ) = ROGUE
+ 80 CONTINUE
+* For Hermitian storage poison the imaginary part of the
+* diagonal entry: conforming routines must never read it.
+ IF( HER )THEN
+ JJ = J + ( J - 1 )*LDA
+ AA( JJ ) = CMPLX( REAL( AA( JJ ) ), RROGUE )
+ END IF
+ 90 CONTINUE
+ END IF
+ RETURN
+*
+* End of CMAKE.
+*
+ END
+ SUBROUTINE CMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB,
+ $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL,
+ $ NOUT, MV )
+*
+* Checks the results of the computational tests.
+*
+* Recomputes C := alpha*op(A)*op(B) + beta*C one column at a time
+* into CT, with a magnitude gauge in G, and compares against the
+* computed result CC. ERR returns the largest error ratio found;
+* FATAL is set if a result is less than half accurate.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Parameters ..
+ COMPLEX ZERO
+ PARAMETER ( ZERO = ( 0.0, 0.0 ) )
+ REAL RZERO, RONE
+ PARAMETER ( RZERO = 0.0, RONE = 1.0 )
+* .. Scalar Arguments ..
+ COMPLEX ALPHA, BETA
+ REAL EPS, ERR
+ INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT
+ LOGICAL FATAL, MV
+ CHARACTER*1 TRANSA, TRANSB
+* .. Array Arguments ..
+ COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ),
+ $ CC( LDCC, * ), CT( * )
+ REAL G( * )
+* .. Local Scalars ..
+ COMPLEX CL
+ REAL ERRI
+ INTEGER I, J, K
+ LOGICAL CTRANA, CTRANB, TRANA, TRANB
+* .. Intrinsic Functions ..
+ INTRINSIC ABS, AIMAG, CONJG, MAX, REAL, SQRT
+* .. Statement Functions ..
+ REAL ABS1
+* .. Statement Function definitions ..
+* ABS1 is the 1-norm style magnitude |Re| + |Im| of a complex value.
+ ABS1( CL ) = ABS( REAL( CL ) ) + ABS( AIMAG( CL ) )
+* .. Executable Statements ..
+* 'T' and 'C' both transpose; 'C' additionally conjugates.
+ TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C'
+ TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C'
+ CTRANA = TRANSA.EQ.'C'
+ CTRANB = TRANSB.EQ.'C'
+*
+* Compute expected result, one column at a time, in CT using data
+* in A, B and C.
+* Compute gauges in G.
+*
+ DO 220 J = 1, N
+*
+ DO 10 I = 1, M
+ CT( I ) = ZERO
+ G( I ) = RZERO
+ 10 CONTINUE
+* One branch per op(A)/op(B) combination; G accumulates the
+* corresponding |A|*|B| magnitudes used to scale the error.
+ IF( .NOT.TRANA.AND..NOT.TRANB )THEN
+ DO 30 K = 1, KK
+ DO 20 I = 1, M
+ CT( I ) = CT( I ) + A( I, K )*B( K, J )
+ G( I ) = G( I ) + ABS1( A( I, K ) )*ABS1( B( K, J ) )
+ 20 CONTINUE
+ 30 CONTINUE
+ ELSE IF( TRANA.AND..NOT.TRANB )THEN
+ IF( CTRANA )THEN
+ DO 50 K = 1, KK
+ DO 40 I = 1, M
+ CT( I ) = CT( I ) + CONJG( A( K, I ) )*B( K, J )
+ G( I ) = G( I ) + ABS1( A( K, I ) )*
+ $ ABS1( B( K, J ) )
+ 40 CONTINUE
+ 50 CONTINUE
+ ELSE
+ DO 70 K = 1, KK
+ DO 60 I = 1, M
+ CT( I ) = CT( I ) + A( K, I )*B( K, J )
+ G( I ) = G( I ) + ABS1( A( K, I ) )*
+ $ ABS1( B( K, J ) )
+ 60 CONTINUE
+ 70 CONTINUE
+ END IF
+ ELSE IF( .NOT.TRANA.AND.TRANB )THEN
+ IF( CTRANB )THEN
+ DO 90 K = 1, KK
+ DO 80 I = 1, M
+ CT( I ) = CT( I ) + A( I, K )*CONJG( B( J, K ) )
+ G( I ) = G( I ) + ABS1( A( I, K ) )*
+ $ ABS1( B( J, K ) )
+ 80 CONTINUE
+ 90 CONTINUE
+ ELSE
+ DO 110 K = 1, KK
+ DO 100 I = 1, M
+ CT( I ) = CT( I ) + A( I, K )*B( J, K )
+ G( I ) = G( I ) + ABS1( A( I, K ) )*
+ $ ABS1( B( J, K ) )
+ 100 CONTINUE
+ 110 CONTINUE
+ END IF
+ ELSE IF( TRANA.AND.TRANB )THEN
+ IF( CTRANA )THEN
+ IF( CTRANB )THEN
+ DO 130 K = 1, KK
+ DO 120 I = 1, M
+ CT( I ) = CT( I ) + CONJG( A( K, I ) )*
+ $ CONJG( B( J, K ) )
+ G( I ) = G( I ) + ABS1( A( K, I ) )*
+ $ ABS1( B( J, K ) )
+ 120 CONTINUE
+ 130 CONTINUE
+ ELSE
+ DO 150 K = 1, KK
+ DO 140 I = 1, M
+ CT( I ) = CT( I ) + CONJG( A( K, I ) )*B( J, K )
+ G( I ) = G( I ) + ABS1( A( K, I ) )*
+ $ ABS1( B( J, K ) )
+ 140 CONTINUE
+ 150 CONTINUE
+ END IF
+ ELSE
+ IF( CTRANB )THEN
+ DO 170 K = 1, KK
+ DO 160 I = 1, M
+ CT( I ) = CT( I ) + A( K, I )*CONJG( B( J, K ) )
+ G( I ) = G( I ) + ABS1( A( K, I ) )*
+ $ ABS1( B( J, K ) )
+ 160 CONTINUE
+ 170 CONTINUE
+ ELSE
+ DO 190 K = 1, KK
+ DO 180 I = 1, M
+ CT( I ) = CT( I ) + A( K, I )*B( J, K )
+ G( I ) = G( I ) + ABS1( A( K, I ) )*
+ $ ABS1( B( J, K ) )
+ 180 CONTINUE
+ 190 CONTINUE
+ END IF
+ END IF
+ END IF
+* Fold in alpha and beta, extending the gauge accordingly.
+ DO 200 I = 1, M
+ CT( I ) = ALPHA*CT( I ) + BETA*C( I, J )
+ G( I ) = ABS1( ALPHA )*G( I ) +
+ $ ABS1( BETA )*ABS1( C( I, J ) )
+ 200 CONTINUE
+*
+* Compute the error ratio for this result.
+*
+ ERR = ZERO
+ DO 210 I = 1, M
+ ERRI = ABS1( CT( I ) - CC( I, J ) )/EPS
+ IF( G( I ).NE.RZERO )
+ $ ERRI = ERRI/G( I )
+ ERR = MAX( ERR, ERRI )
+* Abort early once a result is less than half accurate.
+ IF( ERR*SQRT( EPS ).GE.RONE )
+ $ GO TO 230
+ 210 CONTINUE
+*
+ 220 CONTINUE
+*
+* If the loop completes, all results are at least half accurate.
+ GO TO 250
+*
+* Report fatal error.
+*
+ 230 FATAL = .TRUE.
+ WRITE( NOUT, FMT = 9999 )
+ DO 240 I = 1, M
+* MV selects which operand is labelled as the expected result.
+ IF( MV )THEN
+ WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J )
+ ELSE
+ WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I )
+ END IF
+ 240 CONTINUE
+ IF( N.GT.1 )
+ $ WRITE( NOUT, FMT = 9997 )J
+*
+ 250 CONTINUE
+ RETURN
+*
+ 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL',
+ $ 'F ACCURATE *******', /' EXPECTED RE',
+ $ 'SULT COMPUTED RESULT' )
+ 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) )
+ 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 )
+*
+* End of CMMCH.
+*
+ END
+      LOGICAL FUNCTION LCE( RI, RJ, LR )
+*
+*  Tests if two arrays are identical.
+*
+*  Returns .TRUE. iff the first LR elements of RI and RJ compare
+*  exactly equal; the testers use this to confirm that read-only
+*  arguments were left unchanged by the routine under test.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Scalar Arguments ..
+      INTEGER            LR
+*     .. Array Arguments ..
+      COMPLEX            RI( * ), RJ( * )
+*     .. Local Scalars ..
+      INTEGER            I
+*     .. Executable Statements ..
+      DO 10 I = 1, LR
+         IF( RI( I ).NE.RJ( I ) )
+     $      GO TO 20
+   10 CONTINUE
+      LCE = .TRUE.
+      GO TO 30
+   20 CONTINUE
+      LCE = .FALSE.
+   30 RETURN
+*
+*  End of LCE.
+*
+      END
+      LOGICAL FUNCTION LCERES( TYPE, UPLO, M, N, AA, AS, LDA )
+*
+*  Tests if selected elements in two arrays are equal.
+*
+*  TYPE is 'GE' or 'HE' or 'SY'.
+*
+*  Checks only the parts of AA that the routine under test must NOT
+*  have touched: for 'GE', the rows beyond M in each of the N columns;
+*  for 'HE'/'SY', the triangle opposite UPLO plus the rows beyond the
+*  leading N-by-N part.  Returns .TRUE. when every such element of AA
+*  still matches the saved copy AS.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Scalar Arguments ..
+      INTEGER            LDA, M, N
+      CHARACTER*1        UPLO
+      CHARACTER*2        TYPE
+*     .. Array Arguments ..
+      COMPLEX            AA( LDA, * ), AS( LDA, * )
+*     .. Local Scalars ..
+      INTEGER            I, IBEG, IEND, J
+      LOGICAL            UPPER
+*     .. Executable Statements ..
+      UPPER = UPLO.EQ.'U'
+      IF( TYPE.EQ.'GE' )THEN
+         DO 20 J = 1, N
+            DO 10 I = M + 1, LDA
+               IF( AA( I, J ).NE.AS( I, J ) )
+     $            GO TO 70
+   10       CONTINUE
+   20    CONTINUE
+      ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'SY' )THEN
+         DO 50 J = 1, N
+*           Elements 1..IBEG-1 and IEND+1..LDA of column J lie outside
+*           the referenced triangle and must be unchanged.
+            IF( UPPER )THEN
+               IBEG = 1
+               IEND = J
+            ELSE
+               IBEG = J
+               IEND = N
+            END IF
+            DO 30 I = 1, IBEG - 1
+               IF( AA( I, J ).NE.AS( I, J ) )
+     $            GO TO 70
+   30       CONTINUE
+            DO 40 I = IEND + 1, LDA
+               IF( AA( I, J ).NE.AS( I, J ) )
+     $            GO TO 70
+   40       CONTINUE
+   50    CONTINUE
+      END IF
+*
+   60 CONTINUE
+      LCERES = .TRUE.
+      GO TO 80
+   70 CONTINUE
+      LCERES = .FALSE.
+   80 RETURN
+*
+*  End of LCERES.
+*
+      END
+      COMPLEX FUNCTION CBEG( RESET )
+*
+*  Generates complex numbers as pairs of random numbers uniformly
+*  distributed between -0.5 and 0.5.
+*
+*  Deterministic multiplicative-congruential generator (state kept in
+*  SAVEd locals I, J, IC); passing RESET = .TRUE. reseeds it and RESET
+*  is returned .FALSE., so repeated runs produce identical test data.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Scalar Arguments ..
+      LOGICAL            RESET
+*     .. Local Scalars ..
+      INTEGER            I, IC, J, MI, MJ
+*     .. Save statement ..
+      SAVE               I, IC, J, MI, MJ
+*     .. Intrinsic Functions ..
+      INTRINSIC          CMPLX
+*     .. Executable Statements ..
+      IF( RESET )THEN
+*        Initialize local variables.
+         MI = 891
+         MJ = 457
+         I = 7
+         J = 7
+         IC = 0
+         RESET = .FALSE.
+      END IF
+*
+*     The sequence of values of I or J is bounded between 1 and 999.
+*     If initial I or J = 1,2,3,6,7 or 9, the period will be 50.
+*     If initial I or J = 4 or 8, the period will be 25.
+*     If initial I or J = 5, the period will be 10.
+*     IC is used to break up the period by skipping 1 value of I or J
+*     in 6.
+*
+      IC = IC + 1
+   10 I = I*MI
+      J = J*MJ
+      I = I - 1000*( I/1000 )
+      J = J - 1000*( J/1000 )
+      IF( IC.GE.5 )THEN
+         IC = 0
+         GO TO 10
+      END IF
+      CBEG = CMPLX( ( I - 500 )/1001.0, ( J - 500 )/1001.0 )
+      RETURN
+*
+*  End of CBEG.
+*
+      END
+      REAL FUNCTION SDIFF( X, Y )
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  Returns X - Y.  NOTE(review): kept as a separately compiled
+*  function, presumably so the driver's machine-epsilon loop is not
+*  optimised away by the compiler -- confirm before inlining.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Scalar Arguments ..
+      REAL               X, Y
+*     .. Executable Statements ..
+      SDIFF = X - Y
+      RETURN
+*
+*  End of SDIFF.
+*
+      END
+      SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+*
+*  Tests whether XERBLA has detected an error when it should.
+*
+*  LERR is set .TRUE. by the test version of XERBLA when an error is
+*  reported; if it is still .FALSE. here, expected error INFOT of
+*  routine SRNAMT went undetected, so the failure is printed and OK
+*  is cleared.  LERR is reset .FALSE. for the next error-exit test.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFOT, NOUT
+      LOGICAL            LERR, OK
+      CHARACTER*8        SRNAMT
+*     .. Executable Statements ..
+      IF( .NOT.LERR )THEN
+         WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT
+         OK = .FALSE.
+      END IF
+      LERR = .FALSE.
+      RETURN
+*
+ 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D',
+     $      'ETECTED BY ', A8, ' *****' )
+*
+*  End of CHKXER.
+*
+      END
+      SUBROUTINE XERBLA( SRNAME, INFO )
+*
+*  This is a special version of XERBLA to be used only as part of
+*  the test program for testing error exits from the Level 3 BLAS
+*  routines.
+*
+*  XERBLA is an error handler for the Level 3 BLAS routines.
+*
+*  It is called by the Level 3 BLAS routines if an input parameter is
+*  invalid.
+*
+*  Records that an error was flagged (LERR) and compares the reported
+*  INFO and routine name against the values the error-exit tests
+*  expect (INFOT / SRNAMT in the INFOC and SRNAMC common blocks),
+*  clearing OK on any mismatch.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO
+      CHARACTER*8        SRNAME
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUT
+      LOGICAL            LERR, OK
+      CHARACTER*8        SRNAMT
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUT, OK, LERR
+      COMMON             /SRNAMC/SRNAMT
+*     .. Executable Statements ..
+      LERR = .TRUE.
+      IF( INFO.NE.INFOT )THEN
+         IF( INFOT.NE.0 )THEN
+            WRITE( NOUT, FMT = 9999 )INFO, INFOT
+         ELSE
+            WRITE( NOUT, FMT = 9997 )INFO
+         END IF
+         OK = .FALSE.
+      END IF
+      IF( SRNAME.NE.SRNAMT )THEN
+         WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT
+         OK = .FALSE.
+      END IF
+      RETURN
+*
+ 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD',
+     $      ' OF ', I2, ' *******' )
+ 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A8, ' INSTE',
+     $      'AD OF ', A8, ' *******' )
+ 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6,
+     $      ' *******' )
+*
+*  End of XERBLA
+*
+      END
+
diff --git a/test/zblat3_3m.dat b/test/zblat3_3m.dat
new file mode 100644
index 000000000..f48cc19db
--- /dev/null
+++ b/test/zblat3_3m.dat
@@ -0,0 +1,23 @@
+'ZBLAT3_3M.SUMM' NAME OF SUMMARY OUTPUT FILE
+6 UNIT NUMBER OF SUMMARY FILE
+'ZBLAT3_3M.SNAP' NAME OF SNAPSHOT OUTPUT FILE
+-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+F LOGICAL FLAG, T TO STOP ON FAILURES.
+F LOGICAL FLAG, T TO TEST ERROR EXITS.
+16.0 THRESHOLD VALUE OF TEST RATIO
+6 NUMBER OF VALUES OF N
+0 1 2 3 7 31 63 VALUES OF N
+3 NUMBER OF VALUES OF ALPHA
+(0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA
+3 NUMBER OF VALUES OF BETA
+(0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA
+ZGEMM3M T PUT F FOR NO TEST. SAME COLUMNS.
+ZHEMM F PUT F FOR NO TEST. SAME COLUMNS.
+ZSYMM F PUT F FOR NO TEST. SAME COLUMNS.
+ZTRMM F PUT F FOR NO TEST. SAME COLUMNS.
+ZTRSM F PUT F FOR NO TEST. SAME COLUMNS.
+ZHERK F PUT F FOR NO TEST. SAME COLUMNS.
+ZSYRK F PUT F FOR NO TEST. SAME COLUMNS.
+ZHER2K F PUT F FOR NO TEST. SAME COLUMNS.
+ZSYR2K F PUT F FOR NO TEST. SAME COLUMNS.
diff --git a/test/zblat3_3m.f b/test/zblat3_3m.f
new file mode 100644
index 000000000..bac23aa54
--- /dev/null
+++ b/test/zblat3_3m.f
@@ -0,0 +1,3448 @@
+      PROGRAM ZBLAT3
+*
+*  Test program for the COMPLEX*16 Level 3 Blas.
+*
+*  NOTE(review): this is the GEMM3M variant of the standard tester;
+*  subprogram 01 is ZGEMM3M rather than ZGEMM, and the supplied
+*  zblat3_3m.dat enables only the ZGEMM3M column.
+*
+*  The program must be driven by a short data file. The first 14 records
+*  of the file are read using list-directed input, the last 9 records
+*  are read using the format ( A8, L2 ). An annotated example of a data
+*  file can be obtained by deleting the first 3 characters from the
+*  following 23 lines:
+*  'ZBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE
+*  6 UNIT NUMBER OF SUMMARY FILE
+*  'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
+*  -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
+*  F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
+*  F LOGICAL FLAG, T TO STOP ON FAILURES.
+*  T LOGICAL FLAG, T TO TEST ERROR EXITS.
+*  16.0 THRESHOLD VALUE OF TEST RATIO
+*  6 NUMBER OF VALUES OF N
+*  0 1 2 3 5 9 VALUES OF N
+*  3 NUMBER OF VALUES OF ALPHA
+*  (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA
+*  3 NUMBER OF VALUES OF BETA
+*  (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA
+*  ZGEMM3M T PUT F FOR NO TEST. SAME COLUMNS.
+*  ZHEMM T PUT F FOR NO TEST. SAME COLUMNS.
+*  ZSYMM T PUT F FOR NO TEST. SAME COLUMNS.
+*  ZTRMM T PUT F FOR NO TEST. SAME COLUMNS.
+*  ZTRSM T PUT F FOR NO TEST. SAME COLUMNS.
+*  ZHERK T PUT F FOR NO TEST. SAME COLUMNS.
+*  ZSYRK T PUT F FOR NO TEST. SAME COLUMNS.
+*  ZHER2K T PUT F FOR NO TEST. SAME COLUMNS.
+*  ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS.
+*
+*  See:
+*
+*     Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S.
+*     A Set of Level 3 Basic Linear Algebra Subprograms.
+*
+*     Technical Memorandum No.88 (Revision 1), Mathematics and
+*     Computer Science Division, Argonne National Laboratory, 9700
+*     South Cass Avenue, Argonne, Illinois 60439, US.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      INTEGER            NIN
+      PARAMETER          ( NIN = 5 )
+      INTEGER            NSUBS
+      PARAMETER          ( NSUBS = 9 )
+      COMPLEX*16         ZERO, ONE
+      PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ),
+     $                   ONE = ( 1.0D0, 0.0D0 ) )
+      DOUBLE PRECISION   RZERO, RHALF, RONE
+      PARAMETER          ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 )
+      INTEGER            NMAX
+      PARAMETER          ( NMAX = 65 )
+      INTEGER            NIDMAX, NALMAX, NBEMAX
+      PARAMETER          ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 )
+*     .. Local Scalars ..
+      DOUBLE PRECISION   EPS, ERR, THRESH
+      INTEGER            I, ISNUM, J, N, NALF, NBET, NIDIM, NOUT, NTRA
+      LOGICAL            FATAL, LTESTT, REWI, SAME, SFATAL, TRACE,
+     $                   TSTERR
+      CHARACTER*1        TRANSA, TRANSB
+      CHARACTER*8        SNAMET
+      CHARACTER*32       SNAPS, SUMMRY
+*     .. Local Arrays ..
+      COMPLEX*16         AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ),
+     $                   ALF( NALMAX ), AS( NMAX*NMAX ),
+     $                   BB( NMAX*NMAX ), BET( NBEMAX ),
+     $                   BS( NMAX*NMAX ), C( NMAX, NMAX ),
+     $                   CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ),
+     $                   W( 2*NMAX )
+      DOUBLE PRECISION   G( NMAX )
+      INTEGER            IDIM( NIDMAX )
+      LOGICAL            LTEST( NSUBS )
+      CHARACTER*8        SNAMES( NSUBS )
+*     .. External Functions ..
+      DOUBLE PRECISION   DDIFF
+      LOGICAL            LZE
+      EXTERNAL           DDIFF, LZE
+*     .. External Subroutines ..
+      EXTERNAL           ZCHK1, ZCHK2, ZCHK3, ZCHK4, ZCHK5, ZCHKE, ZMMCH
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+      CHARACTER*8        SRNAMT
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+      COMMON             /SRNAMC/SRNAMT
+*     .. Data statements ..
+      DATA               SNAMES/'ZGEMM3M ', 'ZHEMM ', 'ZSYMM ',
+     $                   'ZTRMM ',
+     $                   'ZTRSM ', 'ZHERK ', 'ZSYRK ', 'ZHER2K',
+     $                   'ZSYR2K'/
+*     .. Executable Statements ..
+*
+*     Read name and unit number for summary output file and open file.
+*
+      READ( NIN, FMT = * )SUMMRY
+      READ( NIN, FMT = * )NOUT
+*     NOTE(review): STATUS = 'NEW' makes the run abort if a summary
+*     file from a previous run is still present.
+      OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' )
+      NOUTC = NOUT
+*
+*     Read name and unit number for snapshot output file and open file.
+*
+      READ( NIN, FMT = * )SNAPS
+      READ( NIN, FMT = * )NTRA
+      TRACE = NTRA.GE.0
+      IF( TRACE )THEN
+         OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' )
+      END IF
+*     Read the flag that directs rewinding of the snapshot file.
+      READ( NIN, FMT = * )REWI
+      REWI = REWI.AND.TRACE
+*     Read the flag that directs stopping on any failure.
+      READ( NIN, FMT = * )SFATAL
+*     Read the flag that indicates whether error exits are to be tested.
+      READ( NIN, FMT = * )TSTERR
+*     Read the threshold value of the test ratio
+      READ( NIN, FMT = * )THRESH
+*
+*     Read and check the parameter values for the tests.
+*
+*     Values of N
+      READ( NIN, FMT = * )NIDIM
+      IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'N', NIDMAX
+         GO TO 220
+      END IF
+      READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM )
+      DO 10 I = 1, NIDIM
+         IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN
+            WRITE( NOUT, FMT = 9996 )NMAX
+            GO TO 220
+         END IF
+   10 CONTINUE
+*     Values of ALPHA
+      READ( NIN, FMT = * )NALF
+      IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX
+         GO TO 220
+      END IF
+      READ( NIN, FMT = * )( ALF( I ), I = 1, NALF )
+*     Values of BETA
+      READ( NIN, FMT = * )NBET
+      IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN
+         WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX
+         GO TO 220
+      END IF
+      READ( NIN, FMT = * )( BET( I ), I = 1, NBET )
+*
+*     Report values of parameters.
+*
+      WRITE( NOUT, FMT = 9995 )
+      WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM )
+      WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF )
+      WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET )
+      IF( .NOT.TSTERR )THEN
+         WRITE( NOUT, FMT = * )
+         WRITE( NOUT, FMT = 9984 )
+      END IF
+      WRITE( NOUT, FMT = * )
+      WRITE( NOUT, FMT = 9999 )THRESH
+      WRITE( NOUT, FMT = * )
+*
+*     Read names of subroutines and flags which indicate
+*     whether they are to be tested.
+*
+      DO 20 I = 1, NSUBS
+         LTEST( I ) = .FALSE.
+   20 CONTINUE
+   30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT
+      DO 40 I = 1, NSUBS
+         IF( SNAMET.EQ.SNAMES( I ) )
+     $      GO TO 50
+   40 CONTINUE
+      WRITE( NOUT, FMT = 9990 )SNAMET
+      STOP
+   50 LTEST( I ) = LTESTT
+      GO TO 30
+*
+   60 CONTINUE
+      CLOSE ( NIN )
+*
+*     Compute EPS (the machine precision).
+*
+      EPS = RONE
+   70 CONTINUE
+      IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO )
+     $   GO TO 80
+      EPS = RHALF*EPS
+      GO TO 70
+   80 CONTINUE
+      EPS = EPS + EPS
+      WRITE( NOUT, FMT = 9998 )EPS
+*
+*     Check the reliability of ZMMCH using exact data.
+*
+      N = MIN( 32, NMAX )
+      DO 100 J = 1, N
+         DO 90 I = 1, N
+            AB( I, J ) = MAX( I - J + 1, 0 )
+   90    CONTINUE
+         AB( J, NMAX + 1 ) = J
+         AB( 1, NMAX + J ) = J
+         C( J, 1 ) = ZERO
+  100 CONTINUE
+      DO 110 J = 1, N
+         CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3
+  110 CONTINUE
+*     CC holds the exact result. On exit from ZMMCH CT holds
+*     the result computed by ZMMCH.
+      TRANSA = 'N'
+      TRANSB = 'N'
+      CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+     $            AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+     $            NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+      SAME = LZE( CC, CT, N )
+      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
+         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+         STOP
+      END IF
+      TRANSB = 'C'
+      CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+     $            AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+     $            NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+      SAME = LZE( CC, CT, N )
+      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
+         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+         STOP
+      END IF
+      DO 120 J = 1, N
+         AB( J, NMAX + 1 ) = N - J + 1
+         AB( 1, NMAX + J ) = N - J + 1
+  120 CONTINUE
+      DO 130 J = 1, N
+         CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 -
+     $                     ( ( J + 1 )*J*( J - 1 ) )/3
+  130 CONTINUE
+      TRANSA = 'C'
+      TRANSB = 'N'
+      CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+     $            AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+     $            NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+      SAME = LZE( CC, CT, N )
+      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
+         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+         STOP
+      END IF
+      TRANSB = 'C'
+      CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX,
+     $            AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC,
+     $            NMAX, EPS, ERR, FATAL, NOUT, .TRUE. )
+      SAME = LZE( CC, CT, N )
+      IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN
+         WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR
+         STOP
+      END IF
+*
+*     Test each subroutine in turn.
+*
+      DO 200 ISNUM = 1, NSUBS
+         WRITE( NOUT, FMT = * )
+         IF( .NOT.LTEST( ISNUM ) )THEN
+*           Subprogram is not to be tested.
+            WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM )
+         ELSE
+            SRNAMT = SNAMES( ISNUM )
+*           Test error exits.
+            IF( TSTERR )THEN
+               CALL ZCHKE( ISNUM, SNAMES( ISNUM ), NOUT )
+               WRITE( NOUT, FMT = * )
+            END IF
+*           Test computations.
+            INFOT = 0
+            OK = .TRUE.
+            FATAL = .FALSE.
+            GO TO ( 140, 150, 150, 160, 160, 170, 170,
+     $              180, 180 )ISNUM
+*           Test ZGEMM3M, 01.
+  140       CALL ZCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+     $                  NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C,
+     $                  CC, CS, CT, G )
+            GO TO 190
+*           Test ZHEMM, 02, ZSYMM, 03.
+  150       CALL ZCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+     $                  NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C,
+     $                  CC, CS, CT, G )
+            GO TO 190
+*           Test ZTRMM, 04, ZTRSM, 05.
+  160       CALL ZCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB,
+     $                  AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C )
+            GO TO 190
+*           Test ZHERK, 06, ZSYRK, 07.
+  170       CALL ZCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+     $                  NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C,
+     $                  CC, CS, CT, G )
+            GO TO 190
+*           Test ZHER2K, 08, ZSYR2K, 09.
+  180       CALL ZCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE,
+     $                  REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET,
+     $                  NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W )
+            GO TO 190
+*
+  190       IF( FATAL.AND.SFATAL )
+     $         GO TO 210
+         END IF
+  200 CONTINUE
+      WRITE( NOUT, FMT = 9986 )
+      GO TO 230
+*
+  210 CONTINUE
+      WRITE( NOUT, FMT = 9985 )
+      GO TO 230
+*
+  220 CONTINUE
+      WRITE( NOUT, FMT = 9991 )
+*
+  230 CONTINUE
+      IF( TRACE )
+     $   CLOSE ( NTRA )
+      CLOSE ( NOUT )
+      STOP
+*
+ 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES',
+     $      'S THAN', F8.2 )
+ 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, D9.1 )
+ 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ',
+     $      'THAN ', I2 )
+ 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 )
+ 9995 FORMAT( ' TESTS OF THE COMPLEX*16 LEVEL 3 BLAS', //' THE F',
+     $      'OLLOWING PARAMETER VALUES WILL BE USED:' )
+ 9994 FORMAT( ' FOR N ', 9I6 )
+ 9993 FORMAT( ' FOR ALPHA ',
+     $      7( '(', F4.1, ',', F4.1, ') ', : ) )
+ 9992 FORMAT( ' FOR BETA ',
+     $      7( '(', F4.1, ',', F4.1, ') ', : ) )
+ 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM',
+     $      /' ******* TESTS ABANDONED *******' )
+ 9990 FORMAT( ' SUBPROGRAM NAME ', A8, ' NOT RECOGNIZED', /' ******* T',
+     $      'ESTS ABANDONED *******' )
+ 9989 FORMAT( ' ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU',
+     $      'ATED WRONGLY.', /' ZMMCH WAS CALLED WITH TRANSA = ', A1,
+     $      ' AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ',
+     $      'ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ',
+     $      'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ',
+     $      '*******' )
+ 9988 FORMAT( A8, L2 )
+ 9987 FORMAT( 1X, A8, ' WAS NOT TESTED' )
+ 9986 FORMAT( /' END OF TESTS' )
+ 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' )
+ 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' )
+*
+*  End of ZBLAT3.
+*
+      END
+      SUBROUTINE ZCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+     $                  A, AA, AS, B, BB, BS, C, CC, CS, CT, G )
+*
+*  Tests ZGEMM3M.
+*
+*  Loops over all TRANSA/TRANSB combinations, all requested dimensions
+*  M, N, K and all ALPHA/BETA values, checking both that no read-only
+*  argument is modified (via LZE/LZERES against saved copies) and that
+*  the result agrees with the ZMMCH reference product to within the
+*  expected rounding bound.  FATAL is set on the first failure.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO
+      PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ) )
+      DOUBLE PRECISION   RZERO
+      PARAMETER          ( RZERO = 0.0D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   EPS, THRESH
+      INTEGER            NALF, NBET, NIDIM, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*8        SNAME
+*     .. Array Arguments ..
+      COMPLEX*16         A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), B( NMAX, NMAX ),
+     $                   BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ),
+     $                   C( NMAX, NMAX ), CC( NMAX*NMAX ),
+     $                   CS( NMAX*NMAX ), CT( NMAX )
+      DOUBLE PRECISION   G( NMAX )
+      INTEGER            IDIM( NIDIM )
+*     .. Local Scalars ..
+      COMPLEX*16         ALPHA, ALS, BETA, BLS
+      DOUBLE PRECISION   ERR, ERRMAX
+      INTEGER            I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA,
+     $                   LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M,
+     $                   MA, MB, MS, N, NA, NARGS, NB, NC, NS
+      LOGICAL            NULL, RESET, SAME, TRANA, TRANB
+      CHARACTER*1        TRANAS, TRANBS, TRANSA, TRANSB
+      CHARACTER*3        ICH
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LZE, LZERES
+      EXTERNAL           LZE, LZERES
+*     .. External Subroutines ..
+      EXTERNAL           ZGEMM3M, ZMAKE, ZMMCH
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICH/'NTC'/
+*     .. Executable Statements ..
+*
+      NARGS = 13
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = RZERO
+*
+      DO 110 IM = 1, NIDIM
+         M = IDIM( IM )
+*
+         DO 100 IN = 1, NIDIM
+            N = IDIM( IN )
+*           Set LDC to 1 more than minimum value if room.
+            LDC = M
+            IF( LDC.LT.NMAX )
+     $         LDC = LDC + 1
+*           Skip tests if not enough room.
+            IF( LDC.GT.NMAX )
+     $         GO TO 100
+            LCC = LDC*N
+            NULL = N.LE.0.OR.M.LE.0
+*
+            DO 90 IK = 1, NIDIM
+               K = IDIM( IK )
+*
+               DO 80 ICA = 1, 3
+                  TRANSA = ICH( ICA: ICA )
+                  TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C'
+*
+                  IF( TRANA )THEN
+                     MA = K
+                     NA = M
+                  ELSE
+                     MA = M
+                     NA = K
+                  END IF
+*                 Set LDA to 1 more than minimum value if room.
+                  LDA = MA
+                  IF( LDA.LT.NMAX )
+     $               LDA = LDA + 1
+*                 Skip tests if not enough room.
+                  IF( LDA.GT.NMAX )
+     $               GO TO 80
+                  LAA = LDA*NA
+*
+*                 Generate the matrix A.
+*
+                  CALL ZMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA,
+     $                        RESET, ZERO )
+*
+                  DO 70 ICB = 1, 3
+                     TRANSB = ICH( ICB: ICB )
+                     TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C'
+*
+                     IF( TRANB )THEN
+                        MB = N
+                        NB = K
+                     ELSE
+                        MB = K
+                        NB = N
+                     END IF
+*                    Set LDB to 1 more than minimum value if room.
+                     LDB = MB
+                     IF( LDB.LT.NMAX )
+     $                  LDB = LDB + 1
+*                    Skip tests if not enough room.
+                     IF( LDB.GT.NMAX )
+     $                  GO TO 70
+                     LBB = LDB*NB
+*
+*                    Generate the matrix B.
+*
+                     CALL ZMAKE( 'GE', ' ', ' ', MB, NB, B, NMAX, BB,
+     $                           LDB, RESET, ZERO )
+*
+                     DO 60 IA = 1, NALF
+                        ALPHA = ALF( IA )
+*
+                        DO 50 IB = 1, NBET
+                           BETA = BET( IB )
+*
+*                          Generate the matrix C.
+*
+                           CALL ZMAKE( 'GE', ' ', ' ', M, N, C, NMAX,
+     $                                 CC, LDC, RESET, ZERO )
+*
+                           NC = NC + 1
+*
+*                          Save every datum before calling the
+*                          subroutine.
+*
+                           TRANAS = TRANSA
+                           TRANBS = TRANSB
+                           MS = M
+                           NS = N
+                           KS = K
+                           ALS = ALPHA
+                           DO 10 I = 1, LAA
+                              AS( I ) = AA( I )
+   10                      CONTINUE
+                           LDAS = LDA
+                           DO 20 I = 1, LBB
+                              BS( I ) = BB( I )
+   20                      CONTINUE
+                           LDBS = LDB
+                           BLS = BETA
+                           DO 30 I = 1, LCC
+                              CS( I ) = CC( I )
+   30                      CONTINUE
+                           LDCS = LDC
+*
+*                          Call the subroutine.
+*
+                           IF( TRACE )
+     $                        WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                        TRANSA, TRANSB, M, N, K, ALPHA, LDA, LDB,
+     $                        BETA, LDC
+                           IF( REWI )
+     $                        REWIND NTRA
+                           CALL ZGEMM3M( TRANSA, TRANSB, M, N, K, ALPHA,
+     $                                 AA, LDA, BB, LDB, BETA, CC, LDC )
+*
+*                          Check if error-exit was taken incorrectly.
+*
+                           IF( .NOT.OK )THEN
+                              WRITE( NOUT, FMT = 9994 )
+                              FATAL = .TRUE.
+                              GO TO 120
+                           END IF
+*
+*                          See what data changed inside subroutines.
+*
+                           ISAME( 1 ) = TRANSA.EQ.TRANAS
+                           ISAME( 2 ) = TRANSB.EQ.TRANBS
+                           ISAME( 3 ) = MS.EQ.M
+                           ISAME( 4 ) = NS.EQ.N
+                           ISAME( 5 ) = KS.EQ.K
+                           ISAME( 6 ) = ALS.EQ.ALPHA
+                           ISAME( 7 ) = LZE( AS, AA, LAA )
+                           ISAME( 8 ) = LDAS.EQ.LDA
+                           ISAME( 9 ) = LZE( BS, BB, LBB )
+                           ISAME( 10 ) = LDBS.EQ.LDB
+                           ISAME( 11 ) = BLS.EQ.BETA
+                           IF( NULL )THEN
+                              ISAME( 12 ) = LZE( CS, CC, LCC )
+                           ELSE
+                              ISAME( 12 ) = LZERES( 'GE', ' ', M, N, CS,
+     $                                      CC, LDC )
+                           END IF
+                           ISAME( 13 ) = LDCS.EQ.LDC
+*
+*                          If data was incorrectly changed, report
+*                          and return.
+*
+                           SAME = .TRUE.
+                           DO 40 I = 1, NARGS
+                              SAME = SAME.AND.ISAME( I )
+                              IF( .NOT.ISAME( I ) )
+     $                           WRITE( NOUT, FMT = 9998 )I
+   40                      CONTINUE
+                           IF( .NOT.SAME )THEN
+                              FATAL = .TRUE.
+                              GO TO 120
+                           END IF
+*
+                           IF( .NOT.NULL )THEN
+*
+*                             Check the result.
+*
+                              CALL ZMMCH( TRANSA, TRANSB, M, N, K,
+     $                                    ALPHA, A, NMAX, B, NMAX, BETA,
+     $                                    C, NMAX, CT, G, CC, LDC, EPS,
+     $                                    ERR, FATAL, NOUT, .TRUE. )
+                              ERRMAX = MAX( ERRMAX, ERR )
+*                             If got really bad answer, report and
+*                             return.
+                              IF( FATAL )
+     $                           GO TO 120
+                           END IF
+*
+   50                   CONTINUE
+*
+   60                CONTINUE
+*
+   70             CONTINUE
+*
+   80          CONTINUE
+*
+   90       CONTINUE
+*
+  100    CONTINUE
+*
+  110 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 130
+*
+  120 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANSA, TRANSB, M, N, K,
+     $   ALPHA, LDA, LDB, BETA, LDC
+*
+  130 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A8, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A8, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A8, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A8, '(''', A1, ''',''', A1, ''',',
+     $      3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3,
+     $      ',(', F4.1, ',', F4.1, '), C,', I3, ').' )
+ 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*  End of ZCHK1.
+*
+      END
+      SUBROUTINE ZCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+     $                  A, AA, AS, B, BB, BS, C, CC, CS, CT, G )
+*
+*  Tests ZHEMM and ZSYMM.
+*
+*  SNAME( 2: 3 ) selects the routine: 'HE' calls ZHEMM, otherwise
+*  ZSYMM is called.  Loops over SIDE ('L'/'R'), UPLO ('U'/'L'), all
+*  requested M, N and all ALPHA/BETA values; verifies that no
+*  read-only argument changed and checks the result against ZMMCH,
+*  using the full matrix A built by ZMAKE as the reference operand.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO
+      PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ) )
+      DOUBLE PRECISION   RZERO
+      PARAMETER          ( RZERO = 0.0D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   EPS, THRESH
+      INTEGER            NALF, NBET, NIDIM, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*8        SNAME
+*     .. Array Arguments ..
+      COMPLEX*16         A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), B( NMAX, NMAX ),
+     $                   BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ),
+     $                   C( NMAX, NMAX ), CC( NMAX*NMAX ),
+     $                   CS( NMAX*NMAX ), CT( NMAX )
+      DOUBLE PRECISION   G( NMAX )
+      INTEGER            IDIM( NIDIM )
+*     .. Local Scalars ..
+      COMPLEX*16         ALPHA, ALS, BETA, BLS
+      DOUBLE PRECISION   ERR, ERRMAX
+      INTEGER            I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC,
+     $                   LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA,
+     $                   NARGS, NC, NS
+      LOGICAL            CONJ, LEFT, NULL, RESET, SAME
+      CHARACTER*1        SIDE, SIDES, UPLO, UPLOS
+      CHARACTER*2        ICHS, ICHU
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LZE, LZERES
+      EXTERNAL           LZE, LZERES
+*     .. External Subroutines ..
+      EXTERNAL           ZHEMM, ZMAKE, ZMMCH, ZSYMM
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICHS/'LR'/, ICHU/'UL'/
+*     .. Executable Statements ..
+      CONJ = SNAME( 2: 3 ).EQ.'HE'
+*
+      NARGS = 12
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = RZERO
+*
+      DO 100 IM = 1, NIDIM
+         M = IDIM( IM )
+*
+         DO 90 IN = 1, NIDIM
+            N = IDIM( IN )
+*           Set LDC to 1 more than minimum value if room.
+            LDC = M
+            IF( LDC.LT.NMAX )
+     $         LDC = LDC + 1
+*           Skip tests if not enough room.
+            IF( LDC.GT.NMAX )
+     $         GO TO 90
+            LCC = LDC*N
+            NULL = N.LE.0.OR.M.LE.0
+*           Set LDB to 1 more than minimum value if room.
+            LDB = M
+            IF( LDB.LT.NMAX )
+     $         LDB = LDB + 1
+*           Skip tests if not enough room.
+            IF( LDB.GT.NMAX )
+     $         GO TO 90
+            LBB = LDB*N
+*
+*           Generate the matrix B.
+*
+            CALL ZMAKE( 'GE', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET,
+     $                  ZERO )
+*
+            DO 80 ICS = 1, 2
+               SIDE = ICHS( ICS: ICS )
+               LEFT = SIDE.EQ.'L'
+*
+*              A is M-by-M when multiplying from the left, N-by-N when
+*              multiplying from the right.
+               IF( LEFT )THEN
+                  NA = M
+               ELSE
+                  NA = N
+               END IF
+*              Set LDA to 1 more than minimum value if room.
+               LDA = NA
+               IF( LDA.LT.NMAX )
+     $            LDA = LDA + 1
+*              Skip tests if not enough room.
+               IF( LDA.GT.NMAX )
+     $            GO TO 80
+               LAA = LDA*NA
+*
+               DO 70 ICU = 1, 2
+                  UPLO = ICHU( ICU: ICU )
+*
+*                 Generate the hermitian or symmetric matrix A.
+*
+                  CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', NA, NA, A, NMAX,
+     $                        AA, LDA, RESET, ZERO )
+*
+                  DO 60 IA = 1, NALF
+                     ALPHA = ALF( IA )
+*
+                     DO 50 IB = 1, NBET
+                        BETA = BET( IB )
+*
+*                       Generate the matrix C.
+*
+                        CALL ZMAKE( 'GE', ' ', ' ', M, N, C, NMAX, CC,
+     $                              LDC, RESET, ZERO )
+*
+                        NC = NC + 1
+*
+*                       Save every datum before calling the
+*                       subroutine.
+*
+                        SIDES = SIDE
+                        UPLOS = UPLO
+                        MS = M
+                        NS = N
+                        ALS = ALPHA
+                        DO 10 I = 1, LAA
+                           AS( I ) = AA( I )
+   10                   CONTINUE
+                        LDAS = LDA
+                        DO 20 I = 1, LBB
+                           BS( I ) = BB( I )
+   20                   CONTINUE
+                        LDBS = LDB
+                        BLS = BETA
+                        DO 30 I = 1, LCC
+                           CS( I ) = CC( I )
+   30                   CONTINUE
+                        LDCS = LDC
+*
+*                       Call the subroutine.
+*
+                        IF( TRACE )
+     $                     WRITE( NTRA, FMT = 9995 )NC, SNAME, SIDE,
+     $                     UPLO, M, N, ALPHA, LDA, LDB, BETA, LDC
+                        IF( REWI )
+     $                     REWIND NTRA
+                        IF( CONJ )THEN
+                           CALL ZHEMM( SIDE, UPLO, M, N, ALPHA, AA, LDA,
+     $                                 BB, LDB, BETA, CC, LDC )
+                        ELSE
+                           CALL ZSYMM( SIDE, UPLO, M, N, ALPHA, AA, LDA,
+     $                                 BB, LDB, BETA, CC, LDC )
+                        END IF
+*
+*                       Check if error-exit was taken incorrectly.
+*
+                        IF( .NOT.OK )THEN
+                           WRITE( NOUT, FMT = 9994 )
+                           FATAL = .TRUE.
+                           GO TO 110
+                        END IF
+*
+*                       See what data changed inside subroutines.
+*
+                        ISAME( 1 ) = SIDES.EQ.SIDE
+                        ISAME( 2 ) = UPLOS.EQ.UPLO
+                        ISAME( 3 ) = MS.EQ.M
+                        ISAME( 4 ) = NS.EQ.N
+                        ISAME( 5 ) = ALS.EQ.ALPHA
+                        ISAME( 6 ) = LZE( AS, AA, LAA )
+                        ISAME( 7 ) = LDAS.EQ.LDA
+                        ISAME( 8 ) = LZE( BS, BB, LBB )
+                        ISAME( 9 ) = LDBS.EQ.LDB
+                        ISAME( 10 ) = BLS.EQ.BETA
+                        IF( NULL )THEN
+                           ISAME( 11 ) = LZE( CS, CC, LCC )
+                        ELSE
+                           ISAME( 11 ) = LZERES( 'GE', ' ', M, N, CS,
+     $                                   CC, LDC )
+                        END IF
+                        ISAME( 12 ) = LDCS.EQ.LDC
+*
+*                       If data was incorrectly changed, report and
+*                       return.
+*
+                        SAME = .TRUE.
+                        DO 40 I = 1, NARGS
+                           SAME = SAME.AND.ISAME( I )
+                           IF( .NOT.ISAME( I ) )
+     $                        WRITE( NOUT, FMT = 9998 )I
+   40                   CONTINUE
+                        IF( .NOT.SAME )THEN
+                           FATAL = .TRUE.
+                           GO TO 110
+                        END IF
+*
+                        IF( .NOT.NULL )THEN
+*
+*                          Check the result.
+*
+                           IF( LEFT )THEN
+                              CALL ZMMCH( 'N', 'N', M, N, M, ALPHA, A,
+     $                                    NMAX, B, NMAX, BETA, C, NMAX,
+     $                                    CT, G, CC, LDC, EPS, ERR,
+     $                                    FATAL, NOUT, .TRUE. )
+                           ELSE
+                              CALL ZMMCH( 'N', 'N', M, N, N, ALPHA, B,
+     $                                    NMAX, A, NMAX, BETA, C, NMAX,
+     $                                    CT, G, CC, LDC, EPS, ERR,
+     $                                    FATAL, NOUT, .TRUE. )
+                           END IF
+                           ERRMAX = MAX( ERRMAX, ERR )
+*                          If got really bad answer, report and
+*                          return.
+                           IF( FATAL )
+     $                        GO TO 110
+                        END IF
+*
+   50                CONTINUE
+*
+   60             CONTINUE
+*
+   70          CONTINUE
+*
+   80       CONTINUE
+*
+   90    CONTINUE
+*
+  100 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 120
+*
+  110 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, M, N, ALPHA, LDA,
+     $   LDB, BETA, LDC
+*
+  120 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A8, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A8, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A8, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A8, '(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+     $      '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1,
+     $      ',', F4.1, '), C,', I3, ') .' )
+ 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*  End of ZCHK2.
+*
+      END
+      SUBROUTINE ZCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS,
+     $                  B, BB, BS, CT, G, C )
+*
+*  Tests ZTRMM and ZTRSM.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*  Notes (review):
+*     NARGS = 11 is the number of ZTRMM/ZTRSM arguments saved before
+*     each call and re-checked afterwards through ISAME.
+*     ERRMAX records the largest test ratio returned by ZMMCH; the
+*     routine reports a pass when ERRMAX stays below THRESH.
+*     When TRACE is .TRUE. each call is echoed to unit NTRA, and REWI
+*     rewinds NTRA before each call.
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO, ONE
+      PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ),
+     $                   ONE = ( 1.0D0, 0.0D0 ) )
+      DOUBLE PRECISION   RZERO
+      PARAMETER          ( RZERO = 0.0D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   EPS, THRESH
+      INTEGER            NALF, NIDIM, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*8        SNAME
+*     .. Array Arguments ..
+      COMPLEX*16         A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), B( NMAX, NMAX ),
+     $                   BB( NMAX*NMAX ), BS( NMAX*NMAX ),
+     $                   C( NMAX, NMAX ), CT( NMAX )
+      DOUBLE PRECISION   G( NMAX )
+      INTEGER            IDIM( NIDIM )
+*     .. Local Scalars ..
+      COMPLEX*16         ALPHA, ALS
+      DOUBLE PRECISION   ERR, ERRMAX
+      INTEGER            I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB,
+     $                   LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC,
+     $                   NS
+      LOGICAL            LEFT, NULL, RESET, SAME
+      CHARACTER*1        DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO,
+     $                   UPLOS
+      CHARACTER*2        ICHD, ICHS, ICHU
+      CHARACTER*3        ICHT
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LZE, LZERES
+      EXTERNAL           LZE, LZERES
+*     .. External Subroutines ..
+      EXTERNAL           ZMAKE, ZMMCH, ZTRMM, ZTRSM
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/
+*     .. Executable Statements ..
+*
+      NARGS = 11
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = RZERO
+*     Set up zero matrix for ZMMCH.
+      DO 20 J = 1, NMAX
+         DO 10 I = 1, NMAX
+            C( I, J ) = ZERO
+   10    CONTINUE
+   20 CONTINUE
+*
+      DO 140 IM = 1, NIDIM
+         M = IDIM( IM )
+*
+         DO 130 IN = 1, NIDIM
+            N = IDIM( IN )
+*           Set LDB to 1 more than minimum value if room.
+            LDB = M
+            IF( LDB.LT.NMAX )
+     $         LDB = LDB + 1
+*           Skip tests if not enough room.
+            IF( LDB.GT.NMAX )
+     $         GO TO 130
+            LBB = LDB*N
+            NULL = M.LE.0.OR.N.LE.0
+*
+            DO 120 ICS = 1, 2
+               SIDE = ICHS( ICS: ICS )
+               LEFT = SIDE.EQ.'L'
+               IF( LEFT )THEN
+                  NA = M
+               ELSE
+                  NA = N
+               END IF
+*              Set LDA to 1 more than minimum value if room.
+               LDA = NA
+               IF( LDA.LT.NMAX )
+     $            LDA = LDA + 1
+*              Skip tests if not enough room.
+               IF( LDA.GT.NMAX )
+     $            GO TO 130
+               LAA = LDA*NA
+*
+               DO 110 ICU = 1, 2
+                  UPLO = ICHU( ICU: ICU )
+*
+                  DO 100 ICT = 1, 3
+                     TRANSA = ICHT( ICT: ICT )
+*
+                     DO 90 ICD = 1, 2
+                        DIAG = ICHD( ICD: ICD )
+*
+                        DO 80 IA = 1, NALF
+                           ALPHA = ALF( IA )
+*
+*                          Generate the matrix A.
+*
+                           CALL ZMAKE( 'TR', UPLO, DIAG, NA, NA, A,
+     $                                 NMAX, AA, LDA, RESET, ZERO )
+*
+*                          Generate the matrix B.
+*
+                           CALL ZMAKE( 'GE', ' ', ' ', M, N, B, NMAX,
+     $                                 BB, LDB, RESET, ZERO )
+*
+                           NC = NC + 1
+*
+*                          Save every datum before calling the
+*                          subroutine.
+*
+                           SIDES = SIDE
+                           UPLOS = UPLO
+                           TRANAS = TRANSA
+                           DIAGS = DIAG
+                           MS = M
+                           NS = N
+                           ALS = ALPHA
+                           DO 30 I = 1, LAA
+                              AS( I ) = AA( I )
+   30                      CONTINUE
+                           LDAS = LDA
+                           DO 40 I = 1, LBB
+                              BS( I ) = BB( I )
+   40                      CONTINUE
+                           LDBS = LDB
+*
+*                          Call the subroutine.
+*
+                           IF( SNAME( 4: 5 ).EQ.'MM' )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                           SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA,
+     $                           LDA, LDB
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL ZTRMM( SIDE, UPLO, TRANSA, DIAG, M,
+     $                                    N, ALPHA, AA, LDA, BB, LDB )
+                           ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN
+                              IF( TRACE )
+     $                           WRITE( NTRA, FMT = 9995 )NC, SNAME,
+     $                           SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA,
+     $                           LDA, LDB
+                              IF( REWI )
+     $                           REWIND NTRA
+                              CALL ZTRSM( SIDE, UPLO, TRANSA, DIAG, M,
+     $                                    N, ALPHA, AA, LDA, BB, LDB )
+                           END IF
+*
+*                          Check if error-exit was taken incorrectly.
+*
+                           IF( .NOT.OK )THEN
+                              WRITE( NOUT, FMT = 9994 )
+                              FATAL = .TRUE.
+                              GO TO 150
+                           END IF
+*
+*                          See what data changed inside subroutines.
+*
+                           ISAME( 1 ) = SIDES.EQ.SIDE
+                           ISAME( 2 ) = UPLOS.EQ.UPLO
+                           ISAME( 3 ) = TRANAS.EQ.TRANSA
+                           ISAME( 4 ) = DIAGS.EQ.DIAG
+                           ISAME( 5 ) = MS.EQ.M
+                           ISAME( 6 ) = NS.EQ.N
+                           ISAME( 7 ) = ALS.EQ.ALPHA
+                           ISAME( 8 ) = LZE( AS, AA, LAA )
+                           ISAME( 9 ) = LDAS.EQ.LDA
+                           IF( NULL )THEN
+                              ISAME( 10 ) = LZE( BS, BB, LBB )
+                           ELSE
+                              ISAME( 10 ) = LZERES( 'GE', ' ', M, N, BS,
+     $                                      BB, LDB )
+                           END IF
+                           ISAME( 11 ) = LDBS.EQ.LDB
+*
+*                          If data was incorrectly changed, report and
+*                          return.
+*
+                           SAME = .TRUE.
+                           DO 50 I = 1, NARGS
+                              SAME = SAME.AND.ISAME( I )
+                              IF( .NOT.ISAME( I ) )
+     $                           WRITE( NOUT, FMT = 9998 )I
+   50                      CONTINUE
+                           IF( .NOT.SAME )THEN
+                              FATAL = .TRUE.
+                              GO TO 150
+                           END IF
+*
+                           IF( .NOT.NULL )THEN
+                              IF( SNAME( 4: 5 ).EQ.'MM' )THEN
+*
+*                                Check the result.
+*
+                                 IF( LEFT )THEN
+                                    CALL ZMMCH( TRANSA, 'N', M, N, M,
+     $                                          ALPHA, A, NMAX, B, NMAX,
+     $                                          ZERO, C, NMAX, CT, G,
+     $                                          BB, LDB, EPS, ERR,
+     $                                          FATAL, NOUT, .TRUE. )
+                                 ELSE
+                                    CALL ZMMCH( 'N', TRANSA, M, N, N,
+     $                                          ALPHA, B, NMAX, A, NMAX,
+     $                                          ZERO, C, NMAX, CT, G,
+     $                                          BB, LDB, EPS, ERR,
+     $                                          FATAL, NOUT, .TRUE. )
+                                 END IF
+                              ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN
+*
+*                                Compute approximation to original
+*                                matrix.
+*
+*                                C receives the computed solution X from
+*                                BB, and BB is reloaded with ALPHA*B so
+*                                that ZMMCH can verify op( A )*X against
+*                                ALPHA*B below.
+*
+                                 DO 70 J = 1, N
+                                    DO 60 I = 1, M
+                                       C( I, J ) = BB( I + ( J - 1 )*
+     $                                             LDB )
+                                       BB( I + ( J - 1 )*LDB ) = ALPHA*
+     $                                    B( I, J )
+   60                               CONTINUE
+   70                            CONTINUE
+*
+                                 IF( LEFT )THEN
+                                    CALL ZMMCH( TRANSA, 'N', M, N, M,
+     $                                          ONE, A, NMAX, C, NMAX,
+     $                                          ZERO, B, NMAX, CT, G,
+     $                                          BB, LDB, EPS, ERR,
+     $                                          FATAL, NOUT, .FALSE. )
+                                 ELSE
+                                    CALL ZMMCH( 'N', TRANSA, M, N, N,
+     $                                          ONE, C, NMAX, A, NMAX,
+     $                                          ZERO, B, NMAX, CT, G,
+     $                                          BB, LDB, EPS, ERR,
+     $                                          FATAL, NOUT, .FALSE. )
+                                 END IF
+                              END IF
+                              ERRMAX = MAX( ERRMAX, ERR )
+*                             If got really bad answer, report and
+*                             return.
+                              IF( FATAL )
+     $                           GO TO 150
+                           END IF
+*
+   80                   CONTINUE
+*
+   90                CONTINUE
+*
+  100             CONTINUE
+*
+  110          CONTINUE
+*
+  120       CONTINUE
+*
+  130    CONTINUE
+*
+  140 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 160
+*
+  150 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, TRANSA, DIAG, M,
+     $   N, ALPHA, LDA, LDB
+*
+  160 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A8, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A8, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A8, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ', A8, '(', 4( '''', A1, ''',' ), 2( I3, ',' ),
+     $      '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ',
+     $      ' .' )
+ 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of ZCHK3.
+*
+      END
+      SUBROUTINE ZCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+     $                  A, AA, AS, B, BB, BS, C, CC, CS, CT, G )
+*
+*  Tests ZHERK and ZSYRK.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*  Notes (review):
+*     CONJ is .TRUE. when the routine under test is ZHERK
+*     ( SNAME( 2: 3 ) = 'HE' ); in that case the real scalars RALPHA
+*     and RBETA are passed and saved in RALS/RBETS instead of ALS/BETS.
+*     The result is checked column by column: JC is the index of the
+*     start of column J of C within the packed array CC.
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO
+      PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ) )
+      DOUBLE PRECISION   RONE, RZERO
+      PARAMETER          ( RONE = 1.0D0, RZERO = 0.0D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   EPS, THRESH
+      INTEGER            NALF, NBET, NIDIM, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*8        SNAME
+*     .. Array Arguments ..
+      COMPLEX*16         A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ),
+     $                   AS( NMAX*NMAX ), B( NMAX, NMAX ),
+     $                   BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ),
+     $                   C( NMAX, NMAX ), CC( NMAX*NMAX ),
+     $                   CS( NMAX*NMAX ), CT( NMAX )
+      DOUBLE PRECISION   G( NMAX )
+      INTEGER            IDIM( NIDIM )
+*     .. Local Scalars ..
+      COMPLEX*16         ALPHA, ALS, BETA, BETS
+      DOUBLE PRECISION   ERR, ERRMAX, RALPHA, RALS, RBETA, RBETS
+      INTEGER            I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS,
+     $                   LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA,
+     $                   NARGS, NC, NS
+      LOGICAL            CONJ, NULL, RESET, SAME, TRAN, UPPER
+      CHARACTER*1        TRANS, TRANSS, TRANST, UPLO, UPLOS
+      CHARACTER*2        ICHT, ICHU
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LZE, LZERES
+      EXTERNAL           LZE, LZERES
+*     .. External Subroutines ..
+      EXTERNAL           ZHERK, ZMAKE, ZMMCH, ZSYRK
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCMPLX, MAX, DBLE
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICHT/'NC'/, ICHU/'UL'/
+*     .. Executable Statements ..
+      CONJ = SNAME( 2: 3 ).EQ.'HE'
+*
+      NARGS = 10
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = RZERO
+      RALS = RONE
+      RBETS = RONE
+*
+      DO 100 IN = 1, NIDIM
+         N = IDIM( IN )
+*        Set LDC to 1 more than minimum value if room.
+         LDC = N
+         IF( LDC.LT.NMAX )
+     $      LDC = LDC + 1
+*        Skip tests if not enough room.
+         IF( LDC.GT.NMAX )
+     $      GO TO 100
+         LCC = LDC*N
+*
+         DO 90 IK = 1, NIDIM
+            K = IDIM( IK )
+*
+            DO 80 ICT = 1, 2
+               TRANS = ICHT( ICT: ICT )
+               TRAN = TRANS.EQ.'C'
+               IF( TRAN.AND..NOT.CONJ )
+     $            TRANS = 'T'
+               IF( TRAN )THEN
+                  MA = K
+                  NA = N
+               ELSE
+                  MA = N
+                  NA = K
+               END IF
+*              Set LDA to 1 more than minimum value if room.
+               LDA = MA
+               IF( LDA.LT.NMAX )
+     $            LDA = LDA + 1
+*              Skip tests if not enough room.
+               IF( LDA.GT.NMAX )
+     $            GO TO 80
+               LAA = LDA*NA
+*
+*              Generate the matrix A.
+*
+               CALL ZMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA,
+     $                     RESET, ZERO )
+*
+               DO 70 ICU = 1, 2
+                  UPLO = ICHU( ICU: ICU )
+                  UPPER = UPLO.EQ.'U'
+*
+                  DO 60 IA = 1, NALF
+                     ALPHA = ALF( IA )
+                     IF( CONJ )THEN
+                        RALPHA = DBLE( ALPHA )
+                        ALPHA = DCMPLX( RALPHA, RZERO )
+                     END IF
+*
+                     DO 50 IB = 1, NBET
+                        BETA = BET( IB )
+                        IF( CONJ )THEN
+                           RBETA = DBLE( BETA )
+                           BETA = DCMPLX( RBETA, RZERO )
+                        END IF
+                        NULL = N.LE.0
+                        IF( CONJ )
+     $                     NULL = NULL.OR.( ( K.LE.0.OR.RALPHA.EQ.
+     $                            RZERO ).AND.RBETA.EQ.RONE )
+*
+*                       Generate the matrix C.
+*
+                        CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, C,
+     $                              NMAX, CC, LDC, RESET, ZERO )
+*
+                        NC = NC + 1
+*
+*                       Save every datum before calling the subroutine.
+*
+                        UPLOS = UPLO
+                        TRANSS = TRANS
+                        NS = N
+                        KS = K
+                        IF( CONJ )THEN
+                           RALS = RALPHA
+                        ELSE
+                           ALS = ALPHA
+                        END IF
+                        DO 10 I = 1, LAA
+                           AS( I ) = AA( I )
+   10                   CONTINUE
+                        LDAS = LDA
+                        IF( CONJ )THEN
+                           RBETS = RBETA
+                        ELSE
+                           BETS = BETA
+                        END IF
+                        DO 20 I = 1, LCC
+                           CS( I ) = CC( I )
+   20                   CONTINUE
+                        LDCS = LDC
+*
+*                       Call the subroutine.
+*
+                        IF( CONJ )THEN
+                           IF( TRACE )
+     $                        WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO,
+     $                        TRANS, N, K, RALPHA, LDA, RBETA, LDC
+                           IF( REWI )
+     $                        REWIND NTRA
+                           CALL ZHERK( UPLO, TRANS, N, K, RALPHA, AA,
+     $                                 LDA, RBETA, CC, LDC )
+                        ELSE
+                           IF( TRACE )
+     $                        WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO,
+     $                        TRANS, N, K, ALPHA, LDA, BETA, LDC
+                           IF( REWI )
+     $                        REWIND NTRA
+                           CALL ZSYRK( UPLO, TRANS, N, K, ALPHA, AA,
+     $                                 LDA, BETA, CC, LDC )
+                        END IF
+*
+*                       Check if error-exit was taken incorrectly.
+*
+                        IF( .NOT.OK )THEN
+                           WRITE( NOUT, FMT = 9992 )
+                           FATAL = .TRUE.
+                           GO TO 120
+                        END IF
+*
+*                       See what data changed inside subroutines.
+*
+                        ISAME( 1 ) = UPLOS.EQ.UPLO
+                        ISAME( 2 ) = TRANSS.EQ.TRANS
+                        ISAME( 3 ) = NS.EQ.N
+                        ISAME( 4 ) = KS.EQ.K
+                        IF( CONJ )THEN
+                           ISAME( 5 ) = RALS.EQ.RALPHA
+                        ELSE
+                           ISAME( 5 ) = ALS.EQ.ALPHA
+                        END IF
+                        ISAME( 6 ) = LZE( AS, AA, LAA )
+                        ISAME( 7 ) = LDAS.EQ.LDA
+                        IF( CONJ )THEN
+                           ISAME( 8 ) = RBETS.EQ.RBETA
+                        ELSE
+                           ISAME( 8 ) = BETS.EQ.BETA
+                        END IF
+                        IF( NULL )THEN
+                           ISAME( 9 ) = LZE( CS, CC, LCC )
+                        ELSE
+                           ISAME( 9 ) = LZERES( SNAME( 2: 3 ), UPLO, N,
+     $                                  N, CS, CC, LDC )
+                        END IF
+                        ISAME( 10 ) = LDCS.EQ.LDC
+*
+*                       If data was incorrectly changed, report and
+*                       return.
+*
+                        SAME = .TRUE.
+                        DO 30 I = 1, NARGS
+                           SAME = SAME.AND.ISAME( I )
+                           IF( .NOT.ISAME( I ) )
+     $                        WRITE( NOUT, FMT = 9998 )I
+   30                   CONTINUE
+                        IF( .NOT.SAME )THEN
+                           FATAL = .TRUE.
+                           GO TO 120
+                        END IF
+*
+                        IF( .NOT.NULL )THEN
+*
+*                          Check the result column by column.
+*
+                           IF( CONJ )THEN
+                              TRANST = 'C'
+                           ELSE
+                              TRANST = 'T'
+                           END IF
+                           JC = 1
+                           DO 40 J = 1, N
+                              IF( UPPER )THEN
+                                 JJ = 1
+                                 LJ = J
+                              ELSE
+                                 JJ = J
+                                 LJ = N - J + 1
+                              END IF
+                              IF( TRAN )THEN
+                                 CALL ZMMCH( TRANST, 'N', LJ, 1, K,
+     $                                       ALPHA, A( 1, JJ ), NMAX,
+     $                                       A( 1, J ), NMAX, BETA,
+     $                                       C( JJ, J ), NMAX, CT, G,
+     $                                       CC( JC ), LDC, EPS, ERR,
+     $                                       FATAL, NOUT, .TRUE. )
+                              ELSE
+                                 CALL ZMMCH( 'N', TRANST, LJ, 1, K,
+     $                                       ALPHA, A( JJ, 1 ), NMAX,
+     $                                       A( J, 1 ), NMAX, BETA,
+     $                                       C( JJ, J ), NMAX, CT, G,
+     $                                       CC( JC ), LDC, EPS, ERR,
+     $                                       FATAL, NOUT, .TRUE. )
+                              END IF
+                              IF( UPPER )THEN
+                                 JC = JC + LDC
+                              ELSE
+                                 JC = JC + LDC + 1
+                              END IF
+                              ERRMAX = MAX( ERRMAX, ERR )
+*                             If got really bad answer, report and
+*                             return.
+                              IF( FATAL )
+     $                           GO TO 110
+   40                      CONTINUE
+                        END IF
+*
+   50                CONTINUE
+*
+   60             CONTINUE
+*
+   70          CONTINUE
+*
+   80       CONTINUE
+*
+   90    CONTINUE
+*
+  100 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 130
+*
+  110 CONTINUE
+      IF( N.GT.1 )
+     $   WRITE( NOUT, FMT = 9995 )J
+*
+  120 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      IF( CONJ )THEN
+         WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, RALPHA,
+     $      LDA, RBETA, LDC
+      ELSE
+         WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, N, K, ALPHA,
+     $      LDA, BETA, LDC
+      END IF
+*
+  130 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A8, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A8, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A8, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 )
+ 9994 FORMAT( 1X, I6, ': ', A8, '(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+     $      F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ',
+     $      ' .' )
+ 9993 FORMAT( 1X, I6, ': ', A8, '(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+     $      '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1,
+     $      '), C,', I3, ') .' )
+ 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of ZCHK4.
+*
+      END
+      SUBROUTINE ZCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX,
+     $                  AB, AA, AS, BB, BS, C, CC, CS, CT, G, W )
+*
+*  Tests ZHER2K and ZSYR2K.
+*
+*  Auxiliary routine for test program for Level 3 Blas.
+*
+*  -- Written on 8-February-1989.
+*     Jack Dongarra, Argonne National Laboratory.
+*     Iain Duff, AERE Harwell.
+*     Jeremy Du Croz, Numerical Algorithms Group Ltd.
+*     Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+*  Notes (review):
+*     AB holds both operands: matrix A at its start and matrix B at
+*     offset AB( K + 1 ) when TRAN, or AB( K*NMAX + 1 ) otherwise,
+*     as set up by the two ZMAKE calls below.
+*     CONJ is .TRUE. when testing ZHER2K ( SNAME( 2: 3 ) = 'HE' ),
+*     whose BETA argument is the real scalar RBETA.
+*     The result is checked column by column; W is loaded with the
+*     scaled terms of A and B that ZMMCH needs for one column of C.
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO, ONE
+      PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ),
+     $                   ONE = ( 1.0D0, 0.0D0 ) )
+      DOUBLE PRECISION   RONE, RZERO
+      PARAMETER          ( RONE = 1.0D0, RZERO = 0.0D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   EPS, THRESH
+      INTEGER            NALF, NBET, NIDIM, NMAX, NOUT, NTRA
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*8        SNAME
+*     .. Array Arguments ..
+      COMPLEX*16         AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ),
+     $                   ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ),
+     $                   BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ),
+     $                   CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ),
+     $                   W( 2*NMAX )
+      DOUBLE PRECISION   G( NMAX )
+      INTEGER            IDIM( NIDIM )
+*     .. Local Scalars ..
+      COMPLEX*16         ALPHA, ALS, BETA, BETS
+      DOUBLE PRECISION   ERR, ERRMAX, RBETA, RBETS
+      INTEGER            I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB,
+     $                   K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS,
+     $                   LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS
+      LOGICAL            CONJ, NULL, RESET, SAME, TRAN, UPPER
+      CHARACTER*1        TRANS, TRANSS, TRANST, UPLO, UPLOS
+      CHARACTER*2        ICHT, ICHU
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LZE, LZERES
+      EXTERNAL           LZE, LZERES
+*     .. External Subroutines ..
+      EXTERNAL           ZHER2K, ZMAKE, ZMMCH, ZSYR2K
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCMPLX, DCONJG, MAX, DBLE
+*     .. Scalars in Common ..
+      INTEGER            INFOT, NOUTC
+      LOGICAL            LERR, OK
+*     .. Common blocks ..
+      COMMON             /INFOC/INFOT, NOUTC, OK, LERR
+*     .. Data statements ..
+      DATA               ICHT/'NC'/, ICHU/'UL'/
+*     .. Executable Statements ..
+      CONJ = SNAME( 2: 3 ).EQ.'HE'
+*
+      NARGS = 12
+      NC = 0
+      RESET = .TRUE.
+      ERRMAX = RZERO
+*
+      DO 130 IN = 1, NIDIM
+         N = IDIM( IN )
+*        Set LDC to 1 more than minimum value if room.
+         LDC = N
+         IF( LDC.LT.NMAX )
+     $      LDC = LDC + 1
+*        Skip tests if not enough room.
+         IF( LDC.GT.NMAX )
+     $      GO TO 130
+         LCC = LDC*N
+*
+         DO 120 IK = 1, NIDIM
+            K = IDIM( IK )
+*
+            DO 110 ICT = 1, 2
+               TRANS = ICHT( ICT: ICT )
+               TRAN = TRANS.EQ.'C'
+               IF( TRAN.AND..NOT.CONJ )
+     $            TRANS = 'T'
+               IF( TRAN )THEN
+                  MA = K
+                  NA = N
+               ELSE
+                  MA = N
+                  NA = K
+               END IF
+*              Set LDA to 1 more than minimum value if room.
+               LDA = MA
+               IF( LDA.LT.NMAX )
+     $            LDA = LDA + 1
+*              Skip tests if not enough room.
+               IF( LDA.GT.NMAX )
+     $            GO TO 110
+               LAA = LDA*NA
+*
+*              Generate the matrix A.
+*
+               IF( TRAN )THEN
+                  CALL ZMAKE( 'GE', ' ', ' ', MA, NA, AB, 2*NMAX, AA,
+     $                        LDA, RESET, ZERO )
+               ELSE
+                  CALL ZMAKE( 'GE', ' ', ' ', MA, NA, AB, NMAX, AA, LDA,
+     $                        RESET, ZERO )
+               END IF
+*
+*              Generate the matrix B.
+*
+               LDB = LDA
+               LBB = LAA
+               IF( TRAN )THEN
+                  CALL ZMAKE( 'GE', ' ', ' ', MA, NA, AB( K + 1 ),
+     $                        2*NMAX, BB, LDB, RESET, ZERO )
+               ELSE
+                  CALL ZMAKE( 'GE', ' ', ' ', MA, NA, AB( K*NMAX + 1 ),
+     $                        NMAX, BB, LDB, RESET, ZERO )
+               END IF
+*
+               DO 100 ICU = 1, 2
+                  UPLO = ICHU( ICU: ICU )
+                  UPPER = UPLO.EQ.'U'
+*
+                  DO 90 IA = 1, NALF
+                     ALPHA = ALF( IA )
+*
+                     DO 80 IB = 1, NBET
+                        BETA = BET( IB )
+                        IF( CONJ )THEN
+                           RBETA = DBLE( BETA )
+                           BETA = DCMPLX( RBETA, RZERO )
+                        END IF
+                        NULL = N.LE.0
+                        IF( CONJ )
+     $                     NULL = NULL.OR.( ( K.LE.0.OR.ALPHA.EQ.
+     $                            ZERO ).AND.RBETA.EQ.RONE )
+*
+*                       Generate the matrix C.
+*
+                        CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, C,
+     $                              NMAX, CC, LDC, RESET, ZERO )
+*
+                        NC = NC + 1
+*
+*                       Save every datum before calling the subroutine.
+*
+                        UPLOS = UPLO
+                        TRANSS = TRANS
+                        NS = N
+                        KS = K
+                        ALS = ALPHA
+                        DO 10 I = 1, LAA
+                           AS( I ) = AA( I )
+   10                   CONTINUE
+                        LDAS = LDA
+                        DO 20 I = 1, LBB
+                           BS( I ) = BB( I )
+   20                   CONTINUE
+                        LDBS = LDB
+                        IF( CONJ )THEN
+                           RBETS = RBETA
+                        ELSE
+                           BETS = BETA
+                        END IF
+                        DO 30 I = 1, LCC
+                           CS( I ) = CC( I )
+   30                   CONTINUE
+                        LDCS = LDC
+*
+*                       Call the subroutine.
+*
+                        IF( CONJ )THEN
+                           IF( TRACE )
+     $                        WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO,
+     $                        TRANS, N, K, ALPHA, LDA, LDB, RBETA, LDC
+                           IF( REWI )
+     $                        REWIND NTRA
+                           CALL ZHER2K( UPLO, TRANS, N, K, ALPHA, AA,
+     $                                  LDA, BB, LDB, RBETA, CC, LDC )
+                        ELSE
+                           IF( TRACE )
+     $                        WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO,
+     $                        TRANS, N, K, ALPHA, LDA, LDB, BETA, LDC
+                           IF( REWI )
+     $                        REWIND NTRA
+                           CALL ZSYR2K( UPLO, TRANS, N, K, ALPHA, AA,
+     $                                  LDA, BB, LDB, BETA, CC, LDC )
+                        END IF
+*
+*                       Check if error-exit was taken incorrectly.
+*
+                        IF( .NOT.OK )THEN
+                           WRITE( NOUT, FMT = 9992 )
+                           FATAL = .TRUE.
+                           GO TO 150
+                        END IF
+*
+*                       See what data changed inside subroutines.
+*
+                        ISAME( 1 ) = UPLOS.EQ.UPLO
+                        ISAME( 2 ) = TRANSS.EQ.TRANS
+                        ISAME( 3 ) = NS.EQ.N
+                        ISAME( 4 ) = KS.EQ.K
+                        ISAME( 5 ) = ALS.EQ.ALPHA
+                        ISAME( 6 ) = LZE( AS, AA, LAA )
+                        ISAME( 7 ) = LDAS.EQ.LDA
+                        ISAME( 8 ) = LZE( BS, BB, LBB )
+                        ISAME( 9 ) = LDBS.EQ.LDB
+                        IF( CONJ )THEN
+                           ISAME( 10 ) = RBETS.EQ.RBETA
+                        ELSE
+                           ISAME( 10 ) = BETS.EQ.BETA
+                        END IF
+                        IF( NULL )THEN
+                           ISAME( 11 ) = LZE( CS, CC, LCC )
+                        ELSE
+                           ISAME( 11 ) = LZERES( 'HE', UPLO, N, N, CS,
+     $                                   CC, LDC )
+                        END IF
+                        ISAME( 12 ) = LDCS.EQ.LDC
+*
+*                       If data was incorrectly changed, report and
+*                       return.
+*
+                        SAME = .TRUE.
+                        DO 40 I = 1, NARGS
+                           SAME = SAME.AND.ISAME( I )
+                           IF( .NOT.ISAME( I ) )
+     $                        WRITE( NOUT, FMT = 9998 )I
+   40                   CONTINUE
+                        IF( .NOT.SAME )THEN
+                           FATAL = .TRUE.
+                           GO TO 150
+                        END IF
+*
+                        IF( .NOT.NULL )THEN
+*
+*                          Check the result column by column.
+*
+                           IF( CONJ )THEN
+                              TRANST = 'C'
+                           ELSE
+                              TRANST = 'T'
+                           END IF
+                           JJAB = 1
+                           JC = 1
+                           DO 70 J = 1, N
+                              IF( UPPER )THEN
+                                 JJ = 1
+                                 LJ = J
+                              ELSE
+                                 JJ = J
+                                 LJ = N - J + 1
+                              END IF
+                              IF( TRAN )THEN
+                                 DO 50 I = 1, K
+                                    W( I ) = ALPHA*AB( ( J - 1 )*2*
+     $                                       NMAX + K + I )
+                                    IF( CONJ )THEN
+                                       W( K + I ) = DCONJG( ALPHA )*
+     $                                              AB( ( J - 1 )*2*
+     $                                              NMAX + I )
+                                    ELSE
+                                       W( K + I ) = ALPHA*
+     $                                              AB( ( J - 1 )*2*
+     $                                              NMAX + I )
+                                    END IF
+   50                            CONTINUE
+                                 CALL ZMMCH( TRANST, 'N', LJ, 1, 2*K,
+     $                                       ONE, AB( JJAB ), 2*NMAX, W,
+     $                                       2*NMAX, BETA, C( JJ, J ),
+     $                                       NMAX, CT, G, CC( JC ), LDC,
+     $                                       EPS, ERR, FATAL, NOUT,
+     $                                       .TRUE. )
+                              ELSE
+                                 DO 60 I = 1, K
+                                    IF( CONJ )THEN
+                                       W( I ) = ALPHA*DCONJG( AB( ( K +
+     $                                          I - 1 )*NMAX + J ) )
+                                       W( K + I ) = DCONJG( ALPHA*
+     $                                              AB( ( I - 1 )*NMAX +
+     $                                              J ) )
+                                    ELSE
+                                       W( I ) = ALPHA*AB( ( K + I - 1 )*
+     $                                          NMAX + J )
+                                       W( K + I ) = ALPHA*
+     $                                              AB( ( I - 1 )*NMAX +
+     $                                              J )
+                                    END IF
+   60                            CONTINUE
+                                 CALL ZMMCH( 'N', 'N', LJ, 1, 2*K, ONE,
+     $                                       AB( JJ ), NMAX, W, 2*NMAX,
+     $                                       BETA, C( JJ, J ), NMAX, CT,
+     $                                       G, CC( JC ), LDC, EPS, ERR,
+     $                                       FATAL, NOUT, .TRUE. )
+                              END IF
+                              IF( UPPER )THEN
+                                 JC = JC + LDC
+                              ELSE
+                                 JC = JC + LDC + 1
+                                 IF( TRAN )
+     $                              JJAB = JJAB + 2*NMAX
+                              END IF
+                              ERRMAX = MAX( ERRMAX, ERR )
+*                             If got really bad answer, report and
+*                             return.
+                              IF( FATAL )
+     $                           GO TO 140
+   70                      CONTINUE
+                        END IF
+*
+   80                CONTINUE
+*
+   90             CONTINUE
+*
+  100          CONTINUE
+*
+  110       CONTINUE
+*
+  120    CONTINUE
+*
+  130 CONTINUE
+*
+*     Report result.
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 160
+*
+  140 CONTINUE
+      IF( N.GT.1 )
+     $   WRITE( NOUT, FMT = 9995 )J
+*
+  150 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      IF( CONJ )THEN
+         WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, ALPHA,
+     $      LDA, LDB, RBETA, LDC
+      ELSE
+         WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, N, K, ALPHA,
+     $      LDA, LDB, BETA, LDC
+      END IF
+*
+  160 CONTINUE
+      RETURN
+*
+ 9999 FORMAT( ' ', A8, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT( ' ', A8, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ', A8, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 )
+ 9994 FORMAT( 1X, I6, ': ', A8, '(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+     $      '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1,
+     $      ', C,', I3, ') .' )
+ 9993 FORMAT( 1X, I6, ': ', A8, '(', 2( '''', A1, ''',' ), 2( I3, ',' ),
+     $      '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1,
+     $      ',', F4.1, '), C,', I3, ') .' )
+ 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*     End of ZCHK5.
+*
+      END
+ SUBROUTINE ZCHKE( ISNUM, SRNAMT, NOUT )
+*
+* Tests the error exits from the Level 3 Blas.
+* Requires a special version of the error-handling routine XERBLA.
+* ALPHA, RALPHA, BETA, RBETA, A, B and C should not need to be defined.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Scalar Arguments ..
+ INTEGER ISNUM, NOUT
+ CHARACTER*8 SRNAMT
+* .. Scalars in Common ..
+ INTEGER INFOT, NOUTC
+ LOGICAL LERR, OK
+* .. Local Scalars ..
+ COMPLEX*16 ALPHA, BETA
+ DOUBLE PRECISION RALPHA, RBETA
+* .. Local Arrays ..
+ COMPLEX*16 A( 2, 1 ), B( 2, 1 ), C( 2, 1 )
+* .. External Subroutines ..
+ EXTERNAL ZGEMM3M, ZHEMM, ZHER2K, ZHERK, CHKXER, ZSYMM,
+ $ ZSYR2K, ZSYRK, ZTRMM, ZTRSM
+* .. Common blocks ..
+ COMMON /INFOC/INFOT, NOUTC, OK, LERR
+* .. Executable Statements ..
+* OK is set to .FALSE. by the special version of XERBLA or by CHKXER
+* if anything is wrong.
+ OK = .TRUE.
+* LERR is set to .TRUE. by the special version of XERBLA each time
+* it is called, and is then tested and re-set by CHKXER.
+ LERR = .FALSE.
+ GO TO ( 10, 20, 30, 40, 50, 60, 70, 80,
+ $ 90 )ISNUM
+ 10 INFOT = 1
+ CALL ZGEMM3M( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 1
+ CALL ZGEMM3M( '/', 'C', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 1
+ CALL ZGEMM3M( '/', 'T', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 2
+ CALL ZGEMM3M( 'N', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 2
+ CALL ZGEMM3M( 'C', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 2
+ CALL ZGEMM3M( 'T', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZGEMM3M( 'N', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZGEMM3M( 'N', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZGEMM3M( 'N', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZGEMM3M( 'C', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZGEMM3M( 'C', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZGEMM3M( 'C', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZGEMM3M( 'T', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZGEMM3M( 'T', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZGEMM3M( 'T', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZGEMM3M( 'N', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZGEMM3M( 'N', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZGEMM3M( 'N', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZGEMM3M( 'C', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZGEMM3M( 'C', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZGEMM3M( 'C', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZGEMM3M( 'T', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZGEMM3M( 'T', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZGEMM3M( 'T', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZGEMM3M( 'N', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZGEMM3M( 'N', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZGEMM3M( 'N', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZGEMM3M( 'C', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZGEMM3M( 'C', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZGEMM3M( 'C', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZGEMM3M( 'T', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZGEMM3M( 'T', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZGEMM3M( 'T', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 8
+ CALL ZGEMM3M( 'N', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 8
+ CALL ZGEMM3M( 'N', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 8
+ CALL ZGEMM3M( 'N', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 8
+ CALL ZGEMM3M( 'C', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 8
+ CALL ZGEMM3M( 'C', 'C', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 8
+ CALL ZGEMM3M( 'C', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 8
+ CALL ZGEMM3M( 'T', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 8
+ CALL ZGEMM3M( 'T', 'C', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 8
+ CALL ZGEMM3M( 'T', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL ZGEMM3M( 'N', 'N', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL ZGEMM3M( 'C', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL ZGEMM3M( 'T', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL ZGEMM3M( 'N', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL ZGEMM3M( 'C', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL ZGEMM3M( 'T', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL ZGEMM3M( 'N', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL ZGEMM3M( 'C', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL ZGEMM3M( 'T', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 13
+ CALL ZGEMM3M( 'N', 'N', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 13
+ CALL ZGEMM3M( 'N', 'C', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 13
+ CALL ZGEMM3M( 'N', 'T', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 13
+ CALL ZGEMM3M( 'C', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 13
+ CALL ZGEMM3M( 'C', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 13
+ CALL ZGEMM3M( 'C', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 13
+ CALL ZGEMM3M( 'T', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 13
+ CALL ZGEMM3M( 'T', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 13
+ CALL ZGEMM3M( 'T', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ GO TO 100
+ 20 INFOT = 1
+ CALL ZHEMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 2
+ CALL ZHEMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZHEMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZHEMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZHEMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZHEMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZHEMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZHEMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZHEMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZHEMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL ZHEMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL ZHEMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL ZHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL ZHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ GO TO 100
+ 30 INFOT = 1
+ CALL ZSYMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 2
+ CALL ZSYMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZSYMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZSYMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZSYMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZSYMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZSYMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZSYMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZSYMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZSYMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL ZSYMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL ZSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL ZSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL ZSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ GO TO 100
+ 40 INFOT = 1
+ CALL ZTRMM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 2
+ CALL ZTRMM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZTRMM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZTRMM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZTRMM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZTRMM( 'L', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZTRMM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZTRMM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZTRMM( 'R', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZTRMM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZTRMM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZTRMM( 'L', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZTRMM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZTRMM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZTRMM( 'R', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZTRMM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL ZTRMM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL ZTRMM( 'L', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL ZTRMM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL ZTRMM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL ZTRMM( 'R', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL ZTRMM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL ZTRMM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL ZTRMM( 'L', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL ZTRMM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL ZTRMM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL ZTRMM( 'R', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL ZTRMM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZTRMM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZTRMM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZTRMM( 'R', 'U', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZTRMM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZTRMM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZTRMM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZTRMM( 'R', 'L', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZTRMM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL ZTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL ZTRMM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL ZTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL ZTRMM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL ZTRMM( 'R', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL ZTRMM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL ZTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL ZTRMM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL ZTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL ZTRMM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL ZTRMM( 'R', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL ZTRMM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ GO TO 100
+ 50 INFOT = 1
+ CALL ZTRSM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 2
+ CALL ZTRSM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZTRSM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZTRSM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZTRSM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZTRSM( 'L', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZTRSM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZTRSM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZTRSM( 'R', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZTRSM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZTRSM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZTRSM( 'L', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZTRSM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZTRSM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZTRSM( 'R', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 5
+ CALL ZTRSM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL ZTRSM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL ZTRSM( 'L', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL ZTRSM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL ZTRSM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL ZTRSM( 'R', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL ZTRSM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL ZTRSM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL ZTRSM( 'L', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL ZTRSM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL ZTRSM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL ZTRSM( 'R', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 6
+ CALL ZTRSM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZTRSM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZTRSM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZTRSM( 'R', 'U', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZTRSM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZTRSM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZTRSM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZTRSM( 'R', 'L', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZTRSM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL ZTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL ZTRSM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL ZTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL ZTRSM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL ZTRSM( 'R', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL ZTRSM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL ZTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL ZTRSM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL ZTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL ZTRSM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL ZTRSM( 'R', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 11
+ CALL ZTRSM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ GO TO 100
+ 60 INFOT = 1
+ CALL ZHERK( '/', 'N', 0, 0, RALPHA, A, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 2
+ CALL ZHERK( 'U', 'T', 0, 0, RALPHA, A, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZHERK( 'U', 'N', -1, 0, RALPHA, A, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZHERK( 'U', 'C', -1, 0, RALPHA, A, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZHERK( 'L', 'N', -1, 0, RALPHA, A, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZHERK( 'L', 'C', -1, 0, RALPHA, A, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZHERK( 'U', 'N', 0, -1, RALPHA, A, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZHERK( 'U', 'C', 0, -1, RALPHA, A, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZHERK( 'L', 'N', 0, -1, RALPHA, A, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZHERK( 'L', 'C', 0, -1, RALPHA, A, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL ZHERK( 'U', 'N', 2, 0, RALPHA, A, 1, RBETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL ZHERK( 'U', 'C', 0, 2, RALPHA, A, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL ZHERK( 'L', 'N', 2, 0, RALPHA, A, 1, RBETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL ZHERK( 'L', 'C', 0, 2, RALPHA, A, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL ZHERK( 'U', 'N', 2, 0, RALPHA, A, 2, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL ZHERK( 'U', 'C', 2, 0, RALPHA, A, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL ZHERK( 'L', 'N', 2, 0, RALPHA, A, 2, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL ZHERK( 'L', 'C', 2, 0, RALPHA, A, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ GO TO 100
+ 70 INFOT = 1
+ CALL ZSYRK( '/', 'N', 0, 0, ALPHA, A, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 2
+ CALL ZSYRK( 'U', 'C', 0, 0, ALPHA, A, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZSYRK( 'U', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZSYRK( 'U', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZSYRK( 'L', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZSYRK( 'L', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZSYRK( 'U', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZSYRK( 'U', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZSYRK( 'L', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZSYRK( 'L', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL ZSYRK( 'U', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL ZSYRK( 'U', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL ZSYRK( 'L', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL ZSYRK( 'L', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL ZSYRK( 'U', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL ZSYRK( 'U', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL ZSYRK( 'L', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 10
+ CALL ZSYRK( 'L', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ GO TO 100
+ 80 INFOT = 1
+ CALL ZHER2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 2
+ CALL ZHER2K( 'U', 'T', 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZHER2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZHER2K( 'U', 'C', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZHER2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZHER2K( 'L', 'C', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZHER2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZHER2K( 'U', 'C', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZHER2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZHER2K( 'L', 'C', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL ZHER2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL ZHER2K( 'U', 'C', 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL ZHER2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL ZHER2K( 'L', 'C', 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZHER2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZHER2K( 'U', 'C', 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZHER2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZHER2K( 'L', 'C', 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL ZHER2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL ZHER2K( 'U', 'C', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL ZHER2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL ZHER2K( 'L', 'C', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ GO TO 100
+ 90 INFOT = 1
+ CALL ZSYR2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 2
+ CALL ZSYR2K( 'U', 'C', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZSYR2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZSYR2K( 'U', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZSYR2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 3
+ CALL ZSYR2K( 'L', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZSYR2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZSYR2K( 'U', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZSYR2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 4
+ CALL ZSYR2K( 'L', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL ZSYR2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL ZSYR2K( 'U', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL ZSYR2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 7
+ CALL ZSYR2K( 'L', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZSYR2K( 'U', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 9
+ CALL ZSYR2K( 'L', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL ZSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL ZSYR2K( 'U', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL ZSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+ INFOT = 12
+ CALL ZSYR2K( 'L', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 )
+ CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+*
+ 100 IF( OK )THEN
+ WRITE( NOUT, FMT = 9999 )SRNAMT
+ ELSE
+ WRITE( NOUT, FMT = 9998 )SRNAMT
+ END IF
+ RETURN
+*
+ 9999 FORMAT( ' ', A8, ' PASSED THE TESTS OF ERROR-EXITS' )
+ 9998 FORMAT( ' ******* ', A8, ' FAILED THE TESTS OF ERROR-EXITS *****',
+ $ '**' )
+*
+* End of ZCHKE.
+*
+ END
+ SUBROUTINE ZMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET,
+ $ TRANSL )
+*
+* Generates values for an M by N matrix A.
+* Stores the values in the array AA in the data structure required
+* by the routine, with unwanted elements set to rogue value.
+*
+* TYPE is 'GE', 'HE', 'SY' or 'TR'.
+*
+* Arguments:
+*   TYPE   - 'GE', 'HE', 'SY' or 'TR': storage class of the matrix.
+*   UPLO   - 'U' or 'L'; selects the referenced triangle for the
+*            HE/SY/TR types (ignored for 'GE').
+*   DIAG   - 'U' marks a unit-diagonal TR matrix; its diagonal is
+*            forced to ONE below.
+*   M, N   - row and column counts of the generated matrix.
+*   A      - (NMAX,*) array receiving the full generated matrix.
+*   AA     - output array holding A packed with leading dimension
+*            LDA; entries outside the referenced part are set to
+*            ROGUE so stray accesses by the tested routine show up.
+*   LDA    - leading dimension used when packing AA.
+*   RESET  - logical passed through to ZBEG; presumably restarts
+*            ZBEG's generated value sequence (ZBEG not visible here).
+*   TRANSL - complex offset added to every generated element.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Parameters ..
+ COMPLEX*16 ZERO, ONE
+ PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ),
+ $ ONE = ( 1.0D0, 0.0D0 ) )
+ COMPLEX*16 ROGUE
+ PARAMETER ( ROGUE = ( -1.0D10, 1.0D10 ) )
+ DOUBLE PRECISION RZERO
+ PARAMETER ( RZERO = 0.0D0 )
+ DOUBLE PRECISION RROGUE
+ PARAMETER ( RROGUE = -1.0D10 )
+* .. Scalar Arguments ..
+ COMPLEX*16 TRANSL
+ INTEGER LDA, M, N, NMAX
+ LOGICAL RESET
+ CHARACTER*1 DIAG, UPLO
+ CHARACTER*2 TYPE
+* .. Array Arguments ..
+ COMPLEX*16 A( NMAX, * ), AA( * )
+* .. Local Scalars ..
+ INTEGER I, IBEG, IEND, J, JJ
+ LOGICAL GEN, HER, LOWER, SYM, TRI, UNIT, UPPER
+* .. External Functions ..
+ COMPLEX*16 ZBEG
+ EXTERNAL ZBEG
+* .. Intrinsic Functions ..
+ INTRINSIC DCMPLX, DCONJG, DBLE
+* .. Executable Statements ..
+* Decode TYPE/UPLO/DIAG once into logical flags used throughout.
+ GEN = TYPE.EQ.'GE'
+ HER = TYPE.EQ.'HE'
+ SYM = TYPE.EQ.'SY'
+ TRI = TYPE.EQ.'TR'
+ UPPER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'U'
+ LOWER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'L'
+ UNIT = TRI.AND.DIAG.EQ.'U'
+*
+* Generate data in array A.
+*
+* Only the referenced part (full matrix for GE, the selected
+* triangle otherwise) is drawn from ZBEG; the opposite triangle is
+* then filled by symmetry/conjugation or zeroed, per TYPE.
+ DO 20 J = 1, N
+ DO 10 I = 1, M
+ IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) )
+ $ THEN
+ A( I, J ) = ZBEG( RESET ) + TRANSL
+ IF( I.NE.J )THEN
+* Set some elements to zero
+ IF( N.GT.3.AND.J.EQ.N/2 )
+ $ A( I, J ) = ZERO
+* Mirror the off-diagonal entry into the opposite
+* triangle: conjugated for HE, copied for SY,
+* zeroed for TR.
+ IF( HER )THEN
+ A( J, I ) = DCONJG( A( I, J ) )
+ ELSE IF( SYM )THEN
+ A( J, I ) = A( I, J )
+ ELSE IF( TRI )THEN
+ A( J, I ) = ZERO
+ END IF
+ END IF
+ END IF
+ 10 CONTINUE
+* Hermitian matrices need a real diagonal.
+ IF( HER )
+ $ A( J, J ) = DCMPLX( DBLE( A( J, J ) ), RZERO )
+* Shift triangular diagonals by ONE (keeps them nonzero).
+ IF( TRI )
+ $ A( J, J ) = A( J, J ) + ONE
+* Unit-diagonal triangular matrices get an exact unit diagonal.
+ IF( UNIT )
+ $ A( J, J ) = ONE
+ 20 CONTINUE
+*
+* Store elements in array AS in data structure required by routine.
+*
+ IF( TYPE.EQ.'GE' )THEN
+ DO 50 J = 1, N
+ DO 30 I = 1, M
+ AA( I + ( J - 1 )*LDA ) = A( I, J )
+ 30 CONTINUE
+* Pad rows M+1..LDA of each column with the rogue value.
+ DO 40 I = M + 1, LDA
+ AA( I + ( J - 1 )*LDA ) = ROGUE
+ 40 CONTINUE
+ 50 CONTINUE
+ ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN
+ DO 90 J = 1, N
+* [IBEG,IEND] is the stored row range of column J; a unit
+* diagonal is excluded since the routine must not read it.
+ IF( UPPER )THEN
+ IBEG = 1
+ IF( UNIT )THEN
+ IEND = J - 1
+ ELSE
+ IEND = J
+ END IF
+ ELSE
+ IF( UNIT )THEN
+ IBEG = J + 1
+ ELSE
+ IBEG = J
+ END IF
+ IEND = N
+ END IF
+* Rogue-fill above, copy the stored range, rogue-fill below.
+ DO 60 I = 1, IBEG - 1
+ AA( I + ( J - 1 )*LDA ) = ROGUE
+ 60 CONTINUE
+ DO 70 I = IBEG, IEND
+ AA( I + ( J - 1 )*LDA ) = A( I, J )
+ 70 CONTINUE
+ DO 80 I = IEND + 1, LDA
+ AA( I + ( J - 1 )*LDA ) = ROGUE
+ 80 CONTINUE
+* For HE, plant a rogue imaginary part on the stored
+* diagonal so a routine that reads it is caught.
+ IF( HER )THEN
+ JJ = J + ( J - 1 )*LDA
+ AA( JJ ) = DCMPLX( DBLE( AA( JJ ) ), RROGUE )
+ END IF
+ 90 CONTINUE
+ END IF
+ RETURN
+*
+* End of ZMAKE.
+*
+ END
+ SUBROUTINE ZMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB,
+ $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL,
+ $ NOUT, MV )
+*
+* Checks the results of the computational tests.
+*
+* CC holds the result computed by the routine under test. The
+* expected result alpha*op(A)*op(B) + beta*C is accumulated one
+* column at a time in CT, with gauge values built in G to normalize
+* the rounding error. On exit ERR is the largest error ratio found
+* and FATAL is set .TRUE. if any element is less than half accurate.
+* MV selects the order in which the expected and computed values are
+* printed when a failure is reported.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Parameters ..
+ COMPLEX*16 ZERO
+ PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) )
+ DOUBLE PRECISION RZERO, RONE
+ PARAMETER ( RZERO = 0.0D0, RONE = 1.0D0 )
+* .. Scalar Arguments ..
+ COMPLEX*16 ALPHA, BETA
+ DOUBLE PRECISION EPS, ERR
+ INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT
+ LOGICAL FATAL, MV
+ CHARACTER*1 TRANSA, TRANSB
+* .. Array Arguments ..
+ COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ),
+ $ CC( LDCC, * ), CT( * )
+ DOUBLE PRECISION G( * )
+* .. Local Scalars ..
+ COMPLEX*16 CL
+ DOUBLE PRECISION ERRI
+ INTEGER I, J, K
+ LOGICAL CTRANA, CTRANB, TRANA, TRANB
+* .. Intrinsic Functions ..
+ INTRINSIC ABS, DIMAG, DCONJG, MAX, DBLE, SQRT
+* .. Statement Functions ..
+ DOUBLE PRECISION ABS1
+* .. Statement Function definitions ..
+* ABS1 is a cheap modulus gauge for a complex scalar: |Re| + |Im|.
+ ABS1( CL ) = ABS( DBLE( CL ) ) + ABS( DIMAG( CL ) )
+* .. Executable Statements ..
+ TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C'
+ TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C'
+ CTRANA = TRANSA.EQ.'C'
+ CTRANB = TRANSB.EQ.'C'
+*
+* Compute expected result, one column at a time, in CT using data
+* in A, B and C.
+* Compute gauges in G.
+*
+ DO 220 J = 1, N
+*
+ DO 10 I = 1, M
+ CT( I ) = ZERO
+ G( I ) = RZERO
+ 10 CONTINUE
+ IF( .NOT.TRANA.AND..NOT.TRANB )THEN
+ DO 30 K = 1, KK
+ DO 20 I = 1, M
+ CT( I ) = CT( I ) + A( I, K )*B( K, J )
+ G( I ) = G( I ) + ABS1( A( I, K ) )*ABS1( B( K, J ) )
+ 20 CONTINUE
+ 30 CONTINUE
+ ELSE IF( TRANA.AND..NOT.TRANB )THEN
+ IF( CTRANA )THEN
+ DO 50 K = 1, KK
+ DO 40 I = 1, M
+ CT( I ) = CT( I ) + DCONJG( A( K, I ) )*B( K, J )
+ G( I ) = G( I ) + ABS1( A( K, I ) )*
+ $ ABS1( B( K, J ) )
+ 40 CONTINUE
+ 50 CONTINUE
+ ELSE
+ DO 70 K = 1, KK
+ DO 60 I = 1, M
+ CT( I ) = CT( I ) + A( K, I )*B( K, J )
+ G( I ) = G( I ) + ABS1( A( K, I ) )*
+ $ ABS1( B( K, J ) )
+ 60 CONTINUE
+ 70 CONTINUE
+ END IF
+ ELSE IF( .NOT.TRANA.AND.TRANB )THEN
+ IF( CTRANB )THEN
+ DO 90 K = 1, KK
+ DO 80 I = 1, M
+ CT( I ) = CT( I ) + A( I, K )*DCONJG( B( J, K ) )
+ G( I ) = G( I ) + ABS1( A( I, K ) )*
+ $ ABS1( B( J, K ) )
+ 80 CONTINUE
+ 90 CONTINUE
+ ELSE
+ DO 110 K = 1, KK
+ DO 100 I = 1, M
+ CT( I ) = CT( I ) + A( I, K )*B( J, K )
+ G( I ) = G( I ) + ABS1( A( I, K ) )*
+ $ ABS1( B( J, K ) )
+ 100 CONTINUE
+ 110 CONTINUE
+ END IF
+ ELSE IF( TRANA.AND.TRANB )THEN
+ IF( CTRANA )THEN
+ IF( CTRANB )THEN
+ DO 130 K = 1, KK
+ DO 120 I = 1, M
+ CT( I ) = CT( I ) + DCONJG( A( K, I ) )*
+ $ DCONJG( B( J, K ) )
+ G( I ) = G( I ) + ABS1( A( K, I ) )*
+ $ ABS1( B( J, K ) )
+ 120 CONTINUE
+ 130 CONTINUE
+ ELSE
+ DO 150 K = 1, KK
+ DO 140 I = 1, M
+ CT( I ) = CT( I ) + DCONJG( A( K, I ) )*
+ $ B( J, K )
+ G( I ) = G( I ) + ABS1( A( K, I ) )*
+ $ ABS1( B( J, K ) )
+ 140 CONTINUE
+ 150 CONTINUE
+ END IF
+ ELSE
+ IF( CTRANB )THEN
+ DO 170 K = 1, KK
+ DO 160 I = 1, M
+ CT( I ) = CT( I ) + A( K, I )*
+ $ DCONJG( B( J, K ) )
+ G( I ) = G( I ) + ABS1( A( K, I ) )*
+ $ ABS1( B( J, K ) )
+ 160 CONTINUE
+ 170 CONTINUE
+ ELSE
+ DO 190 K = 1, KK
+ DO 180 I = 1, M
+ CT( I ) = CT( I ) + A( K, I )*B( J, K )
+ G( I ) = G( I ) + ABS1( A( K, I ) )*
+ $ ABS1( B( J, K ) )
+ 180 CONTINUE
+ 190 CONTINUE
+ END IF
+ END IF
+ END IF
+ DO 200 I = 1, M
+ CT( I ) = ALPHA*CT( I ) + BETA*C( I, J )
+ G( I ) = ABS1( ALPHA )*G( I ) +
+ $ ABS1( BETA )*ABS1( C( I, J ) )
+ 200 CONTINUE
+*
+* Compute the error ratio for this result.
+*
+* Initialize with the DOUBLE PRECISION RZERO: ERR is real, so using
+* the COMPLEX*16 constant ZERO here would rely on an implicit
+* complex-to-real conversion.
+*
+ ERR = RZERO
+ DO 210 I = 1, M
+ ERRI = ABS1( CT( I ) - CC( I, J ) )/EPS
+ IF( G( I ).NE.RZERO )
+ $ ERRI = ERRI/G( I )
+ ERR = MAX( ERR, ERRI )
+ IF( ERR*SQRT( EPS ).GE.RONE )
+ $ GO TO 230
+ 210 CONTINUE
+*
+ 220 CONTINUE
+*
+* If the loop completes, all results are at least half accurate.
+ GO TO 250
+*
+* Report fatal error.
+*
+ 230 FATAL = .TRUE.
+ WRITE( NOUT, FMT = 9999 )
+ DO 240 I = 1, M
+ IF( MV )THEN
+ WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J )
+ ELSE
+ WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I )
+ END IF
+ 240 CONTINUE
+ IF( N.GT.1 )
+ $ WRITE( NOUT, FMT = 9997 )J
+*
+ 250 CONTINUE
+ RETURN
+*
+ 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL',
+ $ 'F ACCURATE *******', /' EXPECTED RE',
+ $ 'SULT COMPUTED RESULT' )
+ 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) )
+ 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 )
+*
+* End of ZMMCH.
+*
+ END
+ LOGICAL FUNCTION LZE( RI, RJ, LR )
+*
+* Tests if two arrays are identical.
+*
+* Returns .TRUE. when the first LR elements of RI and RJ agree
+* exactly, and .FALSE. as soon as a mismatch is found.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Scalar Arguments ..
+ INTEGER LR
+* .. Array Arguments ..
+ COMPLEX*16 RI( * ), RJ( * )
+* .. Local Scalars ..
+ INTEGER K
+* .. Executable Statements ..
+* Assume equality until a differing element is seen.
+ LZE = .TRUE.
+ DO 10 K = 1, LR
+ IF( RI( K ).NE.RJ( K ) )THEN
+ LZE = .FALSE.
+ RETURN
+ END IF
+ 10 CONTINUE
+ RETURN
+*
+* End of LZE.
+*
+ END
+ LOGICAL FUNCTION LZERES( TYPE, UPLO, M, N, AA, AS, LDA )
+*
+* Tests if selected elements in two arrays are equal.
+*
+* TYPE is 'GE' or 'HE' or 'SY'.
+*
+* Checks that the entries of AA lying OUTSIDE the part a routine is
+* allowed to write (rows beyond M for 'GE'; outside the UPLO
+* triangle for 'HE'/'SY') still match the saved copy AS, i.e. that
+* the routine under test has not overwritten them. Returns .TRUE.
+* when all such padding elements are untouched.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Scalar Arguments ..
+ INTEGER LDA, M, N
+ CHARACTER*1 UPLO
+ CHARACTER*2 TYPE
+* .. Array Arguments ..
+ COMPLEX*16 AA( LDA, * ), AS( LDA, * )
+* .. Local Scalars ..
+ INTEGER I, IBEG, IEND, J
+ LOGICAL UPPER
+* .. Executable Statements ..
+ UPPER = UPLO.EQ.'U'
+ IF( TYPE.EQ.'GE' )THEN
+* General case: only rows M+1..LDA of each column are padding.
+ DO 20 J = 1, N
+ DO 10 I = M + 1, LDA
+ IF( AA( I, J ).NE.AS( I, J ) )
+ $ GO TO 70
+ 10 CONTINUE
+ 20 CONTINUE
+ ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'SY' )THEN
+* Hermitian/symmetric: rows IBEG..IEND of column J are the
+* referenced triangle; everything outside it must be unchanged.
+ DO 50 J = 1, N
+ IF( UPPER )THEN
+ IBEG = 1
+ IEND = J
+ ELSE
+ IBEG = J
+ IEND = N
+ END IF
+ DO 30 I = 1, IBEG - 1
+ IF( AA( I, J ).NE.AS( I, J ) )
+ $ GO TO 70
+ 30 CONTINUE
+ DO 40 I = IEND + 1, LDA
+ IF( AA( I, J ).NE.AS( I, J ) )
+ $ GO TO 70
+ 40 CONTINUE
+ 50 CONTINUE
+ END IF
+*
+* Label 60 is reached only by falling through: no mismatch found.
+ 60 CONTINUE
+ LZERES = .TRUE.
+ GO TO 80
+ 70 CONTINUE
+ LZERES = .FALSE.
+ 80 RETURN
+*
+* End of LZERES.
+*
+ END
+ COMPLEX*16 FUNCTION ZBEG( RESET )
+*
+* Generates complex numbers as pairs of random numbers uniformly
+* distributed between -0.5 and 0.5.
+*
+* The generator state (I, J, IC and the multipliers MI, MJ) is kept
+* across calls via SAVE; passing RESET = .TRUE. reseeds it and
+* clears RESET, so repeated test runs see the same sequence.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Scalar Arguments ..
+ LOGICAL RESET
+* .. Local Scalars ..
+ INTEGER I, IC, J, MI, MJ
+* .. Save statement ..
+ SAVE I, IC, J, MI, MJ
+* .. Intrinsic Functions ..
+ INTRINSIC DCMPLX
+* .. Executable Statements ..
+ IF( RESET )THEN
+* Initialize local variables.
+ MI = 891
+ MJ = 457
+ I = 7
+ J = 7
+ IC = 0
+ RESET = .FALSE.
+ END IF
+*
+* The sequence of values of I or J is bounded between 1 and 999.
+* If initial I or J = 1,2,3,6,7 or 9, the period will be 50.
+* If initial I or J = 4 or 8, the period will be 25.
+* If initial I or J = 5, the period will be 10.
+* IC is used to break up the period by skipping 1 value of I or J
+* in 6.
+*
+ IC = IC + 1
+* Multiplicative step modulo 1000 (the subtraction implements MOD).
+ 10 I = I*MI
+ J = J*MJ
+ I = I - 1000*( I/1000 )
+ J = J - 1000*( J/1000 )
+ IF( IC.GE.5 )THEN
+ IC = 0
+ GO TO 10
+ END IF
+* Map I, J from 1..999 onto roughly (-0.5, 0.5) for each part.
+ ZBEG = DCMPLX( ( I - 500 )/1001.0D0, ( J - 500 )/1001.0D0 )
+ RETURN
+*
+* End of ZBEG.
+*
+ END
+ DOUBLE PRECISION FUNCTION DDIFF( X, Y )
+*
+* Returns the difference X - Y.
+*
+* NOTE(review): routing the subtraction through an external function
+* presumably keeps the compiler from folding or keeping the operands
+* in extended precision (e.g. when the caller estimates machine
+* epsilon) - confirm against the call sites.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Scalar Arguments ..
+ DOUBLE PRECISION X, Y
+* .. Executable Statements ..
+ DDIFF = X - Y
+ RETURN
+*
+* End of DDIFF.
+*
+ END
+ SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK )
+*
+* Tests whether XERBLA has detected an error when it should.
+*
+* LERR is set by the test version of XERBLA when it is invoked; if
+* it is still .FALSE. here, the expected illegal-argument report for
+* parameter INFOT of routine SRNAMT never happened, so the failure
+* is logged on unit NOUT and OK is cleared. LERR is always reset
+* for the next check.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Scalar Arguments ..
+ INTEGER INFOT, NOUT
+ LOGICAL LERR, OK
+ CHARACTER*8 SRNAMT
+* .. Executable Statements ..
+* Guard clause: the error was caught, just rearm the flag.
+ IF( LERR )THEN
+ LERR = .FALSE.
+ RETURN
+ END IF
+* XERBLA was never called: report and record the failure.
+ WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT
+ OK = .FALSE.
+ LERR = .FALSE.
+ RETURN
+*
+ 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D',
+ $ 'ETECTED BY ', A8, ' *****' )
+*
+* End of CHKXER.
+*
+ END
+ SUBROUTINE XERBLA( SRNAME, INFO )
+*
+* This is a special version of XERBLA to be used only as part of
+* the test program for testing error exits from the Level 3 BLAS
+* routines.
+*
+* XERBLA is an error handler for the Level 3 BLAS routines.
+*
+* It is called by the Level 3 BLAS routines if an input parameter is
+* invalid.
+*
+* The expected routine name (SRNAMT) and parameter number (INFOT)
+* are supplied by the test driver through the common blocks below;
+* this version records that it was called (LERR) and clears OK when
+* the actual INFO/SRNAME disagree with the expected values.
+*
+* Auxiliary routine for test program for Level 3 Blas.
+*
+* -- Written on 8-February-1989.
+* Jack Dongarra, Argonne National Laboratory.
+* Iain Duff, AERE Harwell.
+* Jeremy Du Croz, Numerical Algorithms Group Ltd.
+* Sven Hammarling, Numerical Algorithms Group Ltd.
+*
+* .. Scalar Arguments ..
+ INTEGER INFO
+ CHARACTER*8 SRNAME
+* .. Scalars in Common ..
+ INTEGER INFOT, NOUT
+ LOGICAL LERR, OK
+ CHARACTER*8 SRNAMT
+* .. Common blocks ..
+ COMMON /INFOC/INFOT, NOUT, OK, LERR
+ COMMON /SRNAMC/SRNAMT
+* .. Executable Statements ..
+* Record that XERBLA was reached; CHKXER inspects this flag.
+ LERR = .TRUE.
+ IF( INFO.NE.INFOT )THEN
+ IF( INFOT.NE.0 )THEN
+ WRITE( NOUT, FMT = 9999 )INFO, INFOT
+ ELSE
+* INFOT = 0 means no specific parameter number was expected.
+ WRITE( NOUT, FMT = 9997 )INFO
+ END IF
+ OK = .FALSE.
+ END IF
+ IF( SRNAME.NE.SRNAMT )THEN
+ WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT
+ OK = .FALSE.
+ END IF
+ RETURN
+*
+ 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD',
+ $ ' OF ', I2, ' *******' )
+ 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A8, ' INSTE',
+ $ 'AD OF ', A8, ' *******' )
+ 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6,
+ $ ' *******' )
+*
+* End of XERBLA
+*
+ END
+