summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMartin Kroeker <martin@ruby.chemie.uni-freiburg.de>2017-07-15 10:40:42 +0200
committerGitHub <noreply@github.com>2017-07-15 10:40:42 +0200
commit7294fb1d9db59bc8e7b58bd4dd758003092f0886 (patch)
treeb70627a4f172eb32fd4f7aed97172b4836be95dd
parent88249ca5f793f9d18584e5388a88054651b9bb7b (diff)
parent31e086d6a66658e7b04390e69884fe569e0a1e9d (diff)
downloadopenblas-7294fb1d9db59bc8e7b58bd4dd758003092f0886.tar.gz
openblas-7294fb1d9db59bc8e7b58bd4dd758003092f0886.tar.bz2
openblas-7294fb1d9db59bc8e7b58bd4dd758003092f0886.zip
Merge branch 'develop' into cgroups
-rw-r--r--CMakeLists.txt9
-rw-r--r--Makefile25
-rw-r--r--Makefile.arm23
-rw-r--r--Makefile.arm644
-rw-r--r--Makefile.rule5
-rw-r--r--Makefile.system21
-rw-r--r--cmake/c_check.cmake5
-rw-r--r--cmake/os.cmake2
-rw-r--r--cmake/prebuild.cmake68
-rw-r--r--common.h13
-rw-r--r--common_arm.h5
-rw-r--r--driver/level2/gbmv_thread.c2
-rw-r--r--driver/level2/sbmv_thread.c2
-rw-r--r--driver/level2/spmv_thread.c2
-rw-r--r--driver/level2/tbmv_thread.c2
-rw-r--r--driver/level2/tpmv_thread.c2
-rw-r--r--driver/level2/trmv_thread.c2
-rw-r--r--driver/level3/syrk_thread.c4
-rw-r--r--driver/others/CMakeLists.txt2
-rw-r--r--driver/others/blas_server_win32.c7
-rw-r--r--driver/others/init.c5
-rw-r--r--kernel/arm/KERNEL.ARMV663
-rw-r--r--kernel/arm/KERNEL.ARMV7114
-rw-r--r--kernel/arm/asum_vfp.S8
-rw-r--r--kernel/arm/axpy_vfp.S79
-rw-r--r--kernel/arm/cdot_vfp.S32
-rw-r--r--kernel/arm/cgemm_kernel_2x2_vfp.S30
-rw-r--r--kernel/arm/cgemm_kernel_2x2_vfpv3.S30
-rw-r--r--kernel/arm/cgemv_n_vfp.S44
-rw-r--r--kernel/arm/cgemv_t_vfp.S44
-rw-r--r--kernel/arm/ctrmm_kernel_2x2_vfp.S31
-rw-r--r--kernel/arm/ctrmm_kernel_2x2_vfpv3.S31
-rw-r--r--kernel/arm/ddot_vfp.S3
-rw-r--r--kernel/arm/dgemm_kernel_4x2_vfp.S13
-rw-r--r--kernel/arm/dgemm_kernel_4x4_vfpv3.S12
-rw-r--r--kernel/arm/dtrmm_kernel_4x2_vfp.S13
-rw-r--r--kernel/arm/dtrmm_kernel_4x4_vfpv3.S13
-rw-r--r--kernel/arm/gemv_n_vfp.S44
-rw-r--r--kernel/arm/gemv_n_vfpv3.S58
-rw-r--r--kernel/arm/gemv_t_vfp.S54
-rw-r--r--kernel/arm/gemv_t_vfpv3.S44
-rw-r--r--kernel/arm/nrm2_vfp.S7
-rw-r--r--kernel/arm/nrm2_vfpv3.S9
-rw-r--r--kernel/arm/rot_vfp.S91
-rw-r--r--kernel/arm/scal_vfp.S24
-rw-r--r--kernel/arm/sdot_vfp.S13
-rw-r--r--kernel/arm/sgemm_kernel_4x2_vfp.S12
-rw-r--r--kernel/arm/sgemm_kernel_4x4_vfpv3.S29
-rw-r--r--kernel/arm/strmm_kernel_4x2_vfp.S13
-rw-r--r--kernel/arm/strmm_kernel_4x4_vfpv3.S13
-rw-r--r--kernel/arm/swap_vfp.S37
-rw-r--r--kernel/arm/zdot_vfp.S32
-rw-r--r--kernel/arm/zgemm_kernel_2x2_vfp.S30
-rw-r--r--kernel/arm/zgemm_kernel_2x2_vfpv3.S30
-rw-r--r--kernel/arm/zgemv_n_vfp.S44
-rw-r--r--kernel/arm/zgemv_t_vfp.S44
-rw-r--r--kernel/arm/ztrmm_kernel_2x2_vfp.S31
-rw-r--r--kernel/arm/ztrmm_kernel_2x2_vfpv3.S31
-rw-r--r--kernel/power/casum_microk_power8.c32
-rw-r--r--kernel/power/ccopy_microk_power8.c128
-rw-r--r--kernel/power/cswap_microk_power8.c128
-rw-r--r--kernel/power/sasum_microk_power8.c32
-rw-r--r--kernel/power/scopy_microk_power8.c64
-rw-r--r--kernel/power/sdot_microk_power8.c64
-rw-r--r--kernel/power/srot_microk_power8.c64
-rw-r--r--kernel/power/sscal_microk_power8.c80
-rw-r--r--kernel/power/sswap_microk_power8.c64
-rw-r--r--relapack/LICENSE22
-rw-r--r--relapack/Makefile98
-rw-r--r--relapack/README.md68
-rw-r--r--relapack/config.h208
-rw-r--r--relapack/config.md87
-rw-r--r--relapack/coverage.md212
-rw-r--r--relapack/inc/relapack.h67
-rw-r--r--relapack/src/blas.h61
-rw-r--r--relapack/src/cgbtrf.c230
-rw-r--r--relapack/src/cgemmt.c167
-rw-r--r--relapack/src/cgetrf.c117
-rw-r--r--relapack/src/chegst.c212
-rw-r--r--relapack/src/chetrf.c236
-rw-r--r--relapack/src/chetrf_rec2.c520
-rw-r--r--relapack/src/chetrf_rook.c236
-rw-r--r--relapack/src/chetrf_rook_rec2.c661
-rw-r--r--relapack/src/clauum.c87
-rw-r--r--relapack/src/cpbtrf.c157
-rw-r--r--relapack/src/cpotrf.c92
-rw-r--r--relapack/src/csytrf.c238
-rw-r--r--relapack/src/csytrf_rec2.c451
-rw-r--r--relapack/src/csytrf_rook.c236
-rw-r--r--relapack/src/csytrf_rook_rec2.c565
-rw-r--r--relapack/src/ctgsyl.c268
-rw-r--r--relapack/src/ctrsyl.c163
-rw-r--r--relapack/src/ctrsyl_rec2.c392
-rw-r--r--relapack/src/ctrtri.c107
-rw-r--r--relapack/src/dgbtrf.c227
-rw-r--r--relapack/src/dgemmt.c165
-rw-r--r--relapack/src/dgetrf.c117
-rw-r--r--relapack/src/dlauum.c87
-rw-r--r--relapack/src/dpbtrf.c157
-rw-r--r--relapack/src/dpotrf.c92
-rw-r--r--relapack/src/dsygst.c212
-rw-r--r--relapack/src/dsytrf.c238
-rw-r--r--relapack/src/dsytrf_rec2.c352
-rw-r--r--relapack/src/dsytrf_rook.c236
-rw-r--r--relapack/src/dsytrf_rook_rec2.c451
-rw-r--r--relapack/src/dtgsyl.c274
-rw-r--r--relapack/src/dtrsyl.c169
-rw-r--r--relapack/src/dtrsyl_rec2.c1034
-rw-r--r--relapack/src/dtrtri.c107
-rw-r--r--relapack/src/f2c.c109
-rw-r--r--relapack/src/f2c.h223
-rw-r--r--relapack/src/lapack.h80
-rw-r--r--relapack/src/lapack_wrappers.c607
-rw-r--r--relapack/src/lapack_wrappers.c.orig607
-rw-r--r--relapack/src/relapack.h60
-rw-r--r--relapack/src/sgbtrf.c227
-rw-r--r--relapack/src/sgemmt.c165
-rw-r--r--relapack/src/sgetrf.c117
-rw-r--r--relapack/src/slauum.c87
-rw-r--r--relapack/src/spbtrf.c157
-rw-r--r--relapack/src/spotrf.c92
-rw-r--r--relapack/src/ssygst.c212
-rw-r--r--relapack/src/ssytrf.c238
-rw-r--r--relapack/src/ssytrf_rec2.c351
-rw-r--r--relapack/src/ssytrf_rook.c236
-rw-r--r--relapack/src/ssytrf_rook_rec2.c451
-rw-r--r--relapack/src/stgsyl.c274
-rw-r--r--relapack/src/strsyl.c169
-rw-r--r--relapack/src/strsyl_rec2.c1029
-rw-r--r--relapack/src/strtri.c107
-rw-r--r--relapack/src/zgbtrf.c230
-rw-r--r--relapack/src/zgemmt.c167
-rw-r--r--relapack/src/zgetrf.c117
-rw-r--r--relapack/src/zhegst.c212
-rw-r--r--relapack/src/zhetrf.c236
-rw-r--r--relapack/src/zhetrf_rec2.c524
-rw-r--r--relapack/src/zhetrf_rook.c236
-rw-r--r--relapack/src/zhetrf_rook_rec2.c662
-rw-r--r--relapack/src/zlauum.c87
-rw-r--r--relapack/src/zpbtrf.c157
-rw-r--r--relapack/src/zpotrf.c92
-rw-r--r--relapack/src/zsytrf.c238
-rw-r--r--relapack/src/zsytrf_rec2.c452
-rw-r--r--relapack/src/zsytrf_rook.c236
-rw-r--r--relapack/src/zsytrf_rook_rec2.c561
-rw-r--r--relapack/src/ztgsyl.c268
-rw-r--r--relapack/src/ztrsyl.c163
-rw-r--r--relapack/src/ztrsyl_rec2.c394
-rw-r--r--relapack/src/ztrtri.c107
-rw-r--r--relapack/test/README.md48
-rw-r--r--relapack/test/config.h13
-rw-r--r--relapack/test/lapack.h64
-rw-r--r--relapack/test/test.h136
-rw-r--r--relapack/test/util.c116
-rw-r--r--relapack/test/util.h15
-rw-r--r--relapack/test/xgbtrf.c43
-rw-r--r--relapack/test/xgemmt.c65
-rw-r--r--relapack/test/xgetrf.c32
-rw-r--r--relapack/test/xhegst.c32
-rw-r--r--relapack/test/xhetrf.c40
-rw-r--r--relapack/test/xlauum.c25
-rw-r--r--relapack/test/xpbtrf.c40
-rw-r--r--relapack/test/xpotrf.c25
-rw-r--r--relapack/test/xsygst.c32
-rw-r--r--relapack/test/xsytrf.c40
-rw-r--r--relapack/test/xtgsyl.c94
-rw-r--r--relapack/test/xtrsyl.c65
-rw-r--r--relapack/test/xtrtri.c25
-rw-r--r--utest/CMakeLists.txt4
169 files changed, 22840 insertions, 843 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c20a57eac..e6ae891b6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -236,7 +236,11 @@ install(TARGETS ${OpenBLAS_LIBNAME}
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h
COMMAND ${GENCONFIG_BIN} ${CMAKE_CURRENT_SOURCE_DIR}/config.h ${CMAKE_CURRENT_SOURCE_DIR}/openblas_config_template.h > ${CMAKE_BINARY_DIR}/openblas_config.h
)
- ADD_CUSTOM_TARGET(genconfig DEPENDS openblas_config.h)
+
+ ADD_CUSTOM_TARGET(genconfig
+ ALL
+ DEPENDS openblas_config.h
+ )
add_dependencies(genconfig ${OpenBLAS_LIBNAME})
install (FILES ${CMAKE_BINARY_DIR}/openblas_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
@@ -244,6 +248,7 @@ install(TARGETS ${OpenBLAS_LIBNAME}
message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
ADD_CUSTOM_TARGET(genf77blas
+ ALL
COMMAND ${AWK} 'BEGIN{print \"\#ifndef OPENBLAS_F77BLAS_H\" \; print \"\#define OPENBLAS_F77BLAS_H\" \; print \"\#include \\"openblas_config.h\\" \"}; NF {print}; END{print \"\#endif\"}' ${CMAKE_CURRENT_SOURCE_DIR}/common_interface.h > ${CMAKE_BINARY_DIR}/f77blas.h
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h
)
@@ -255,11 +260,11 @@ if(NOT NO_CBLAS)
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
ADD_CUSTOM_TARGET(gencblas
+ ALL
COMMAND ${SED} 's/common/openblas_config/g' ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h > "${CMAKE_BINARY_DIR}/cblas.tmp"
COMMAND cp "${CMAKE_BINARY_DIR}/cblas.tmp" "${CMAKE_BINARY_DIR}/cblas.h"
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h
)
-
add_dependencies(gencblas ${OpenBLAS_LIBNAME})
install (FILES ${CMAKE_BINARY_DIR}/cblas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
diff --git a/Makefile b/Makefile
index 27923aa9b..1b9bcb118 100644
--- a/Makefile
+++ b/Makefile
@@ -16,14 +16,19 @@ ifneq ($(NO_LAPACK), 1)
SUBDIRS += lapack
endif
+RELA =
+ifeq ($(BUILD_RELAPACK), 1)
+RELA = re_lapack
+endif
+
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
-.PHONY : all libs netlib test ctest shared install
-.NOTPARALLEL : all libs prof lapack-test install blas-test
+.PHONY : all libs netlib $(RELA) test ctest shared install
+.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
-all :: libs netlib tests shared
+all :: libs netlib $(RELA) tests shared
@echo
@echo " OpenBLAS build complete. ($(LIB_COMPONENTS))"
@echo
@@ -215,6 +220,14 @@ ifndef NO_LAPACKE
endif
endif
+ifeq ($(NO_LAPACK), 1)
+re_lapack :
+
+else
+re_lapack :
+ @$(MAKE) -C relapack
+endif
+
prof_lapack : lapack_prebuild
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
@@ -326,11 +339,7 @@ endif
@touch $(NETLIB_LAPACK_DIR)/make.inc
@$(MAKE) -C $(NETLIB_LAPACK_DIR) clean
@rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h
+ @$(MAKE) -C relapack clean
@rm -f *.grd Makefile.conf_last config_last.h
@(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt)
@echo Done.
-
-# Makefile debugging trick:
-# call print-VARIABLE to see the runtime value of any variable
-print-%:
- @echo '$*=$($*)'
diff --git a/Makefile.arm b/Makefile.arm
index c189b0c47..eedd39b73 100644
--- a/Makefile.arm
+++ b/Makefile.arm
@@ -1,5 +1,4 @@
-#ifeq logical or
-ifeq ($(CORE), $(filter $(CORE),CORTEXA9 CORTEXA15))
+ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15))
ifeq ($(OSNAME), Android)
CCOMMON_OPT += -mfpu=neon -march=armv7-a
FCOMMON_OPT += -mfpu=neon -march=armv7-a
@@ -9,28 +8,12 @@ FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
endif
endif
-ifeq ($(CORE), ARMV7)
-ifeq ($(OSNAME), Android)
-ifeq ($(ARM_SOFTFP_ABI), 1)
-CCOMMON_OPT += -mfpu=neon -march=armv7-a
-FCOMMON_OPT += -mfpu=neon -march=armv7-a
-else
-CCOMMON_OPT += -mfpu=neon -march=armv7-a -Wl,--no-warn-mismatch
-FCOMMON_OPT += -mfpu=neon -march=armv7-a -Wl,--no-warn-mismatch
-endif
-else
-CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
-FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
-endif
-endif
-
ifeq ($(CORE), ARMV6)
CCOMMON_OPT += -mfpu=vfp -march=armv6
FCOMMON_OPT += -mfpu=vfp -march=armv6
endif
-
ifeq ($(CORE), ARMV5)
-CCOMMON_OPT += -marm -march=armv5
-FCOMMON_OPT += -marm -march=armv5
+CCOMMON_OPT += -march=armv5
+FCOMMON_OPT += -march=armv5
endif
diff --git a/Makefile.arm64 b/Makefile.arm64
index 7e9df2f4b..d19e796a5 100644
--- a/Makefile.arm64
+++ b/Makefile.arm64
@@ -20,6 +20,6 @@ FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
endif
ifeq ($(CORE), THUNDERX2T99)
-CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
-FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
+CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
+FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
endif
diff --git a/Makefile.rule b/Makefile.rule
index b6c22f798..8d8aecdc9 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -83,6 +83,9 @@ VERSION = 0.2.20.dev
# Build LAPACK Deprecated functions since LAPACK 3.6.0
BUILD_LAPACK_DEPRECATED = 1
+# Build RecursiveLAPACK on top of LAPACK
+# BUILD_RELAPACK = 1
+
# If you want to use legacy threaded Level 3 implementation.
# USE_SIMPLE_THREADED_LEVEL3 = 1
@@ -97,7 +100,7 @@ BUILD_LAPACK_DEPRECATED = 1
NO_WARMUP = 1
# If you want to disable CPU/Memory affinity on Linux.
-NO_AFFINITY = 1
+#NO_AFFINITY = 1
# if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
# BIGNUMA = 1
diff --git a/Makefile.system b/Makefile.system
index 29d3efd53..bd361a1a2 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -242,6 +242,10 @@ EXTRALIB += -lm
NO_EXPRECISION = 1
endif
+ifeq ($(OSNAME), Android)
+EXTRALIB += -lm
+endif
+
ifeq ($(OSNAME), AIX)
EXTRALIB += -lm
endif
@@ -486,12 +490,18 @@ BINARY_DEFINED = 1
CCOMMON_OPT += -marm
FCOMMON_OPT += -marm
+# If softfp abi is mentioned on the command line, force it.
ifeq ($(ARM_SOFTFP_ABI), 1)
-CCOMMON_OPT += -mfloat-abi=softfp -DARM_SOFTFP_ABI
-FCOMMON_OPT += -mfloat-abi=softfp -DARM_SOFTFP_ABI
+CCOMMON_OPT += -mfloat-abi=softfp
+FCOMMON_OPT += -mfloat-abi=softfp
+endif
+
+ifeq ($(OSNAME), Android)
+ifeq ($(ARM_SOFTFP_ABI), 1)
+EXTRALIB += -lm
else
-CCOMMON_OPT += -mfloat-abi=hard
-FCOMMON_OPT += -mfloat-abi=hard
+EXTRALIB += -Wl,-lm_hard
+endif
endif
endif
@@ -1119,6 +1129,9 @@ LIB_COMPONENTS += LAPACK
ifneq ($(NO_LAPACKE), 1)
LIB_COMPONENTS += LAPACKE
endif
+ifeq ($(BUILD_RELAPACK), 1)
+LIB_COMPONENTS += ReLAPACK
+endif
endif
ifeq ($(ONLY_CBLAS), 1)
diff --git a/cmake/c_check.cmake b/cmake/c_check.cmake
index 115bdaf4e..56ae612ea 100644
--- a/cmake/c_check.cmake
+++ b/cmake/c_check.cmake
@@ -91,3 +91,8 @@ file(WRITE ${TARGET_CONF}
"#define __${BINARY}BIT__\t1\n"
"#define FUNDERSCORE\t${FU}\n")
+if (${HOST_OS} STREQUAL "WINDOWSSTORE")
+ file(APPEND ${TARGET_CONF}
+ "#define OS_WINNT\t1\n")
+endif ()
+
diff --git a/cmake/os.cmake b/cmake/os.cmake
index f5a75027c..e9df68d7f 100644
--- a/cmake/os.cmake
+++ b/cmake/os.cmake
@@ -77,7 +77,7 @@ if (CYGWIN)
set(NO_EXPRECISION 1)
endif ()
-if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix")
+if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Android")
if (SMP)
set(EXTRALIB "${EXTRALIB} -lpthread")
endif ()
diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake
index 6a21c0bcc..a7f98bfb8 100644
--- a/cmake/prebuild.cmake
+++ b/cmake/prebuild.cmake
@@ -72,20 +72,26 @@ if (MSVC)
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC)
endif()
+if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
+ # disable WindowsStore strict CRT checks
+ set(GETARCH_FLAGS ${GETARCH_FLAGS} -D_CRT_SECURE_NO_WARNINGS)
+endif ()
+
set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build")
set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}")
file(MAKE_DIRECTORY ${GETARCH_DIR})
-try_compile(GETARCH_RESULT ${GETARCH_DIR}
- SOURCES ${GETARCH_SRC}
- COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR}
- OUTPUT_VARIABLE GETARCH_LOG
- COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
-)
-
-if (NOT ${GETARCH_RESULT})
- MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}")
+if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
+ try_compile(GETARCH_RESULT ${GETARCH_DIR}
+ SOURCES ${GETARCH_SRC}
+ COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR}
+ OUTPUT_VARIABLE GETARCH_LOG
+ COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
+ )
+
+ if (NOT ${GETARCH_RESULT})
+ MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}")
+ endif ()
endif ()
-
message(STATUS "Running getarch")
# use the cmake binary w/ the -E param to run a shell command in a cross-platform way
@@ -101,15 +107,17 @@ ParseGetArchVars(${GETARCH_MAKE_OUT})
set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build")
set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}")
file(MAKE_DIRECTORY ${GETARCH2_DIR})
-try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
- SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c
- COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR}
- OUTPUT_VARIABLE GETARCH2_LOG
- COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
-)
-
-if (NOT ${GETARCH2_RESULT})
- MESSAGE(FATAL_ERROR "Compiling getarch_2nd failed ${GETARCH2_LOG}")
+if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
+ try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
+ SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c
+ COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR}
+ OUTPUT_VARIABLE GETARCH2_LOG
+ COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
+ )
+
+ if (NOT ${GETARCH2_RESULT})
+ MESSAGE(FATAL_ERROR "Compiling getarch_2nd failed ${GETARCH2_LOG}")
+ endif ()
endif ()
# use the cmake binary w/ the -E param to run a shell command in a cross-platform way
@@ -126,13 +134,15 @@ set(GEN_CONFIG_H_BIN "gen_config_h${CMAKE_EXECUTABLE_SUFFIX}")
set(GEN_CONFIG_H_FLAGS "-DVERSION=\"${OpenBLAS_VERSION}\"")
file(MAKE_DIRECTORY ${GEN_CONFIG_H_DIR})
-try_compile(GEN_CONFIG_H_RESULT ${GEN_CONFIG_H_DIR}
- SOURCES ${PROJECT_SOURCE_DIR}/gen_config_h.c
- COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GEN_CONFIG_H_FLAGS} -I${PROJECT_SOURCE_DIR}
- OUTPUT_VARIABLE GEN_CONFIG_H_LOG
- COPY_FILE ${PROJECT_BINARY_DIR}/${GEN_CONFIG_H_BIN}
-)
-
-if (NOT ${GEN_CONFIG_H_RESULT})
- MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}")
-endif ()
+if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
+ try_compile(GEN_CONFIG_H_RESULT ${GEN_CONFIG_H_DIR}
+ SOURCES ${PROJECT_SOURCE_DIR}/gen_config_h.c
+ COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GEN_CONFIG_H_FLAGS} -I${PROJECT_SOURCE_DIR}
+ OUTPUT_VARIABLE GEN_CONFIG_H_LOG
+ COPY_FILE ${PROJECT_BINARY_DIR}/${GEN_CONFIG_H_BIN}
+ )
+
+ if (NOT ${GEN_CONFIG_H_RESULT})
+ MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}")
+ endif ()
+endif () \ No newline at end of file
diff --git a/common.h b/common.h
index c9cc2f0f2..4463141c8 100644
--- a/common.h
+++ b/common.h
@@ -425,6 +425,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246
#endif
#ifndef ASSEMBLER
+#ifdef OS_WINDOWSSTORE
+typedef char env_var_t[MAX_PATH];
+#define readenv(p, n) 0
+#else
#ifdef OS_WINDOWS
typedef char env_var_t[MAX_PATH];
#define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p))
@@ -432,6 +436,7 @@ typedef char env_var_t[MAX_PATH];
typedef char* env_var_t;
#define readenv(p, n) ((p)=getenv(n))
#endif
+#endif
#if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS)
#ifdef _POSIX_MONOTONIC_CLOCK
@@ -654,7 +659,11 @@ static __inline void blas_unlock(volatile BLASULONG *address){
*address = 0;
}
-
+#ifdef OS_WINDOWSSTORE
+static __inline int readenv_atoi(char *env) {
+ return 0;
+}
+#else
#ifdef OS_WINDOWS
static __inline int readenv_atoi(char *env) {
env_var_t p;
@@ -669,7 +678,7 @@ static __inline int readenv_atoi(char *env) {
return(0);
}
#endif
-
+#endif
#if !defined(XDOUBLE) || !defined(QUAD_PRECISION)
diff --git a/common_arm.h b/common_arm.h
index a17acb448..27fa76b76 100644
--- a/common_arm.h
+++ b/common_arm.h
@@ -111,11 +111,6 @@ REALNAME:
#define PROFCODE
-#ifdef __ARM_PCS
-//-mfloat-abi=softfp
-#define SOFT_FLOAT_ABI
-#endif
-
#endif
diff --git a/driver/level2/gbmv_thread.c b/driver/level2/gbmv_thread.c
index ef9d58d76..e86b565f8 100644
--- a/driver/level2/gbmv_thread.c
+++ b/driver/level2/gbmv_thread.c
@@ -177,7 +177,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT
blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER];
- BLASLONG range_m[MAX_CPU_NUMBER];
+ BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER + 1];
BLASLONG width, i, num_cpu;
diff --git a/driver/level2/sbmv_thread.c b/driver/level2/sbmv_thread.c
index a0377d638..5718c0ec9 100644
--- a/driver/level2/sbmv_thread.c
+++ b/driver/level2/sbmv_thread.c
@@ -177,7 +177,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
#endif
blas_arg_t args;
- blas_queue_t queue[MAX_CPU_NUMBER];
+ blas_queue_t queue[MAX_CPU_NUMBER + 1];
BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER];
diff --git a/driver/level2/spmv_thread.c b/driver/level2/spmv_thread.c
index f8ae3cdcd..035300841 100644
--- a/driver/level2/spmv_thread.c
+++ b/driver/level2/spmv_thread.c
@@ -182,7 +182,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y,
blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER + 1];
- BLASLONG range_n[MAX_CPU_NUMBER];
+ BLASLONG range_n[MAX_CPU_NUMBER + 1];
BLASLONG width, i, num_cpu;
diff --git a/driver/level2/tbmv_thread.c b/driver/level2/tbmv_thread.c
index bbb1c50eb..226a922e9 100644
--- a/driver/level2/tbmv_thread.c
+++ b/driver/level2/tbmv_thread.c
@@ -221,7 +221,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc
blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER + 1];
- BLASLONG range_n[MAX_CPU_NUMBER];
+ BLASLONG range_n[MAX_CPU_NUMBER + 1];
BLASLONG width, i, num_cpu;
diff --git a/driver/level2/tpmv_thread.c b/driver/level2/tpmv_thread.c
index 47dc1daf9..c91b52775 100644
--- a/driver/level2/tpmv_thread.c
+++ b/driver/level2/tpmv_thread.c
@@ -243,7 +243,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr
blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER + 1];
- BLASLONG range_n[MAX_CPU_NUMBER];
+ BLASLONG range_n[MAX_CPU_NUMBER + 1];
BLASLONG width, i, num_cpu;
diff --git a/driver/level2/trmv_thread.c b/driver/level2/trmv_thread.c
index 42edb83cb..0a155366c 100644
--- a/driver/level2/trmv_thread.c
+++ b/driver/level2/trmv_thread.c
@@ -281,7 +281,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER + 1];
- BLASLONG range_n[MAX_CPU_NUMBER];
+ BLASLONG range_n[MAX_CPU_NUMBER + 1];
BLASLONG width, i, num_cpu;
diff --git a/driver/level3/syrk_thread.c b/driver/level3/syrk_thread.c
index 94274be72..5f40853dc 100644
--- a/driver/level3/syrk_thread.c
+++ b/driver/level3/syrk_thread.c
@@ -109,7 +109,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
if (nthreads - num_cpu > 1) {
di = (double)i;
- width = ((BLASLONG)( sqrt(di * di + dnum) - di) + mask) & ~mask;
+ width = (BLASLONG)(( sqrt(di * di + dnum) - di + mask)/(mask+1)) * (mask+1);
if ((width <= 0) || (width > n_to - i)) width = n_to - i;
@@ -149,7 +149,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
if (nthreads - num_cpu > 1) {
di = (double)(arg -> n - i);
- width = ((BLASLONG)(-sqrt(di * di + dnum) + di) + mask) & ~mask;
+ width = ((BLASLONG)((-sqrt(di * di + dnum) + di) + mask)/(mask+1)) * (mask+1);
if ((width <= 0) || (width > n_to - i)) width = n_to - i;
diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt
index 489d40c76..8e0be1e0e 100644
--- a/driver/others/CMakeLists.txt
+++ b/driver/others/CMakeLists.txt
@@ -12,6 +12,8 @@ if (SMP)
set(BLAS_SERVER blas_server_omp.c)
elseif (${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
set(BLAS_SERVER blas_server_win32.c)
+ elseif (${CMAKE_SYSTEM_NAME} STREQUAL "WindowsStore")
+ set(BLAS_SERVER blas_server_win32.c)
endif ()
if (NOT DEFINED BLAS_SERVER)
diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c
index 081bdd7d4..cde8ca793 100644
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@@ -443,8 +443,11 @@ int BLASFUNC(blas_thread_shutdown)(void){
SetEvent(pool.killed);
for(i = 0; i < blas_num_threads - 1; i++){
- WaitForSingleObject(blas_threads[i], 5); //INFINITE);
- TerminateThread(blas_threads[i],0);
+ WaitForSingleObject(blas_threads[i], 5); //INFINITE);
+#ifndef OS_WINDOWSSTORE
+// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
+ TerminateThread(blas_threads[i],0);
+#endif
}
blas_server_avail = 0;
diff --git a/driver/others/init.c b/driver/others/init.c
index 4093776db..1f07a24ac 100644
--- a/driver/others/init.c
+++ b/driver/others/init.c
@@ -825,10 +825,11 @@ void gotoblas_affinity_init(void) {
common -> shmid = pshmid;
- if (common -> magic != SH_MAGIC) {
+ if (common -> magic != SH_MAGIC)
cpu_set_t *cpusetp;
int nums;
int ret;
+
#ifdef DEBUG
fprintf(stderr, "Shared Memory Initialization.\n");
#endif
@@ -883,7 +884,7 @@ void gotoblas_affinity_init(void) {
if (common -> num_nodes > 1) numa_mapping();
common -> final_num_procs = 0;
- for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1; //Make the max cpu number.
+ for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1; //Make the max cpu number.
for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0;
diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6
index 16bde105b..960dae67b 100644
--- a/kernel/arm/KERNEL.ARMV6
+++ b/kernel/arm/KERNEL.ARMV6
@@ -1,7 +1,5 @@
+include $(KERNELDIR)/KERNEL.ARMV5
-
-
-###############################################################################
SAMAXKERNEL = iamax_vfp.S
DAMAXKERNEL = iamax_vfp.S
CAMAXKERNEL = iamax_vfp.S
@@ -44,10 +42,10 @@ DAXPYKERNEL = axpy_vfp.S
CAXPYKERNEL = axpy_vfp.S
ZAXPYKERNEL = axpy_vfp.S
-SCOPYKERNEL = copy.c
-DCOPYKERNEL = copy.c
-CCOPYKERNEL = zcopy.c
-ZCOPYKERNEL = zcopy.c
+SROTKERNEL = rot_vfp.S
+DROTKERNEL = rot_vfp.S
+CROTKERNEL = rot_vfp.S
+ZROTKERNEL = rot_vfp.S
SDOTKERNEL = sdot_vfp.S
DDOTKERNEL = ddot_vfp.S
@@ -59,16 +57,6 @@ DNRM2KERNEL = nrm2_vfp.S
CNRM2KERNEL = nrm2_vfp.S
ZNRM2KERNEL = nrm2_vfp.S
-SROTKERNEL = rot_vfp.S
-DROTKERNEL = rot_vfp.S
-CROTKERNEL = rot_vfp.S
-ZROTKERNEL = rot_vfp.S
-
-SSCALKERNEL = scal.c
-DSCALKERNEL = scal.c
-CSCALKERNEL = zscal.c
-ZSCALKERNEL = zscal.c
-
SSWAPKERNEL = swap_vfp.S
DSWAPKERNEL = swap_vfp.S
CSWAPKERNEL = swap_vfp.S
@@ -84,26 +72,25 @@ DGEMVTKERNEL = gemv_t_vfp.S
CGEMVTKERNEL = cgemv_t_vfp.S
ZGEMVTKERNEL = zgemv_t_vfp.S
-STRMMKERNEL = strmm_kernel_4x2_vfp.S
-DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S
-CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S
-ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S
-
SGEMMKERNEL = sgemm_kernel_4x2_vfp.S
+ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
SGEMMINCOPY = sgemm_ncopy_4_vfp.S
SGEMMITCOPY = sgemm_tcopy_4_vfp.S
SGEMMINCOPYOBJ = sgemm_incopy.o
SGEMMITCOPYOBJ = sgemm_itcopy.o
+endif
SGEMMONCOPY = sgemm_ncopy_2_vfp.S
-SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
-SGEMMONCOPYOBJ = sgemm_oncopy.o
-SGEMMOTCOPYOBJ = sgemm_otcopy.o
+SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
+SGEMMONCOPYOBJ = sgemm_oncopy.o
+SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = dgemm_kernel_4x2_vfp.S
+ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
DGEMMINCOPY = dgemm_ncopy_4_vfp.S
DGEMMITCOPY = dgemm_tcopy_4_vfp.S
DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o
+endif
DGEMMONCOPY = dgemm_ncopy_2_vfp.S
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
@@ -121,26 +108,8 @@ ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
-STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-
-
+STRMMKERNEL = strmm_kernel_4x2_vfp.S
+DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S
+CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S
+ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S
diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7
index d5cd94fbd..5e0b4cfb8 100644
--- a/kernel/arm/KERNEL.ARMV7
+++ b/kernel/arm/KERNEL.ARMV7
@@ -1,91 +1,12 @@
-
-#################################################################################
-SAMAXKERNEL = iamax_vfp.S
-DAMAXKERNEL = iamax_vfp.S
-CAMAXKERNEL = iamax_vfp.S
-ZAMAXKERNEL = iamax_vfp.S
-
-SAMINKERNEL = iamax_vfp.S
-DAMINKERNEL = iamax_vfp.S
-CAMINKERNEL = iamax_vfp.S
-ZAMINKERNEL = iamax_vfp.S
-
-SMAXKERNEL = iamax_vfp.S
-DMAXKERNEL = iamax_vfp.S
-
-SMINKERNEL = iamax_vfp.S
-DMINKERNEL = iamax_vfp.S
-
-ISAMAXKERNEL = iamax_vfp.S
-IDAMAXKERNEL = iamax_vfp.S
-ICAMAXKERNEL = iamax_vfp.S
-IZAMAXKERNEL = iamax_vfp.S
-
-ISAMINKERNEL = iamax_vfp.S
-IDAMINKERNEL = iamax_vfp.S
-ICAMINKERNEL = iamax_vfp.S
-IZAMINKERNEL = iamax_vfp.S
-
-ISMAXKERNEL = iamax_vfp.S
-IDMAXKERNEL = iamax_vfp.S
-
-ISMINKERNEL = iamax_vfp.S
-IDMINKERNEL = iamax_vfp.S
-
-SSWAPKERNEL = swap_vfp.S
-DSWAPKERNEL = swap_vfp.S
-CSWAPKERNEL = swap_vfp.S
-ZSWAPKERNEL = swap_vfp.S
-
-SASUMKERNEL = asum_vfp.S
-DASUMKERNEL = asum_vfp.S
-CASUMKERNEL = asum_vfp.S
-ZASUMKERNEL = asum_vfp.S
-
-SAXPYKERNEL = axpy_vfp.S
-DAXPYKERNEL = axpy_vfp.S
-CAXPYKERNEL = axpy_vfp.S
-ZAXPYKERNEL = axpy_vfp.S
-
-SCOPYKERNEL = copy.c
-DCOPYKERNEL = copy.c
-CCOPYKERNEL = zcopy.c
-ZCOPYKERNEL = zcopy.c
-
-SDOTKERNEL = sdot_vfp.S
-DDOTKERNEL = ddot_vfp.S
-CDOTKERNEL = cdot_vfp.S
-ZDOTKERNEL = zdot_vfp.S
+include $(KERNELDIR)/KERNEL.ARMV6
SNRM2KERNEL = nrm2_vfpv3.S
DNRM2KERNEL = nrm2_vfpv3.S
CNRM2KERNEL = nrm2_vfpv3.S
ZNRM2KERNEL = nrm2_vfpv3.S
-SROTKERNEL = rot_vfp.S
-DROTKERNEL = rot_vfp.S
-CROTKERNEL = rot_vfp.S
-ZROTKERNEL = rot_vfp.S
-
-SSCALKERNEL = scal.c
-DSCALKERNEL = scal.c
-CSCALKERNEL = zscal.c
-ZSCALKERNEL = zscal.c
-
SGEMVNKERNEL = gemv_n_vfpv3.S
DGEMVNKERNEL = gemv_n_vfpv3.S
-CGEMVNKERNEL = cgemv_n_vfp.S
-ZGEMVNKERNEL = zgemv_n_vfp.S
-
-SGEMVTKERNEL = gemv_t_vfp.S
-DGEMVTKERNEL = gemv_t_vfp.S
-CGEMVTKERNEL = cgemv_t_vfp.S
-ZGEMVTKERNEL = zgemv_t_vfp.S
-
-STRMMKERNEL = strmm_kernel_4x4_vfpv3.S
-DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S
-CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S
-ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S
SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S
SGEMMONCOPY = sgemm_ncopy_4_vfp.S
@@ -100,35 +21,10 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S
-CGEMMONCOPY = cgemm_ncopy_2_vfp.S
-CGEMMOTCOPY = cgemm_tcopy_2_vfp.S
-CGEMMONCOPYOBJ = cgemm_oncopy.o
-CGEMMOTCOPYOBJ = cgemm_otcopy.o
-
ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S
-ZGEMMONCOPY = zgemm_ncopy_2_vfp.S
-ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S
-ZGEMMONCOPYOBJ = zgemm_oncopy.o
-ZGEMMOTCOPYOBJ = zgemm_otcopy.o
-
-STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+STRMMKERNEL = strmm_kernel_4x4_vfpv3.S
+DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S
+CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S
+ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S
diff --git a/kernel/arm/asum_vfp.S b/kernel/arm/asum_vfp.S
index fe6242a5b..5b08e5028 100644
--- a/kernel/arm/asum_vfp.S
+++ b/kernel/arm/asum_vfp.S
@@ -475,6 +475,14 @@ asum_kernel_L999:
vadd.f32 s0 , s0, s1 // set return value
#endif
+#if !defined(__ARM_PCS_VFP)
+#if !defined(DOUBLE)
+ vmov r0, s0
+#else
+ vmov r0, r1, d0
+#endif
+#endif
+
bx lr
EPILOGUE
diff --git a/kernel/arm/axpy_vfp.S b/kernel/arm/axpy_vfp.S
index 8e5334f62..37515f399 100644
--- a/kernel/arm/axpy_vfp.S
+++ b/kernel/arm/axpy_vfp.S
@@ -38,18 +38,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
-#ifndef ARM_SOFTFP_ABI
-//hard
-#define OLD_INC_X [fp, #0 ]
-#define OLD_Y [fp, #4 ]
-#define OLD_INC_Y [fp, #8 ]
-#else
+#if !defined(__ARM_PCS_VFP)
+
+#if !defined(COMPLEX)
+
+#if !defined(DOUBLE)
+#define OLD_ALPHA r3
#define OLD_X [fp, #0 ]
#define OLD_INC_X [fp, #4 ]
#define OLD_Y [fp, #8 ]
#define OLD_INC_Y [fp, #12 ]
+#else
+#define OLD_ALPHA [fp, #0]
+#define OLD_X [fp, #8 ]
+#define OLD_INC_X [fp, #12 ]
+#define OLD_Y [fp, #16 ]
+#define OLD_INC_Y [fp, #20 ]
#endif
-
+
+#else //COMPLEX
+
+#if !defined(DOUBLE)
+#define OLD_ALPHAR r3
+#define OLD_ALPHAI [fp, #0 ]
+#define OLD_X [fp, #4 ]
+#define OLD_INC_X [fp, #8 ]
+#define OLD_Y [fp, #12 ]
+#define OLD_INC_Y [fp, #16 ]
+#else
+#define OLD_ALPHAR [fp, #0]
+#define OLD_ALPHAI [fp, #8]
+#define OLD_X [fp, #16 ]
+#define OLD_INC_X [fp, #20 ]
+#define OLD_Y [fp, #24 ]
+#define OLD_INC_Y [fp, #28 ]
+#endif
+
+#endif //!defined(COMPLEX)
+
+#else //__ARM_PCS_VFP
+
+#define OLD_INC_X [fp, #0 ]
+#define OLD_Y [fp, #4 ]
+#define OLD_INC_Y [fp, #8 ]
+
+#endif //!defined(__ARM_PCS_VFP)
+
#define N r0
#define Y r1
#define INC_X r2
@@ -71,14 +105,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(DOUBLE)
#define FMAC_R1 fmacd
-#define FMAC_R2 fnmacd
+#define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#else
#define FMAC_R1 fmacs
-#define FMAC_R2 fnmacs
+#define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
@@ -90,14 +124,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
-#define FMAC_I1 fnmacd
+#define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#else
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
-#define FMAC_I1 fnmacs
+#define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#endif
@@ -370,13 +404,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #8
sub sp, sp, #STACKSIZE // reserve stack
-#ifdef ARM_SOFTFP_ABI
-#ifndef DOUBLE
- vmov s0, r3 //move alpha to s0
+#if !defined(__ARM_PCS_VFP)
+#if !defined(COMPLEX)
+#if !defined(DOUBLE)
+ vmov s0, OLD_ALPHA
ldr X, OLD_X
+#else
+ vldr d0, OLD_ALPHA
+ ldr X, OLD_X
+#endif
+#else //COMPLEX
+#if !defined(DOUBLE)
+ vmov s0, OLD_ALPHAR
+ vldr s1, OLD_ALPHAI
+ ldr X, OLD_X
+#else
+ vldr d0, OLD_ALPHAR
+ vldr d1, OLD_ALPHAI
+ ldr X, OLD_X
+#endif
#endif
#endif
-
+
ldr INC_X , OLD_INC_X
ldr Y, OLD_Y
ldr INC_Y , OLD_INC_Y
diff --git a/kernel/arm/cdot_vfp.S b/kernel/arm/cdot_vfp.S
index 0497b6d83..e5a6e4d35 100644
--- a/kernel/arm/cdot_vfp.S
+++ b/kernel/arm/cdot_vfp.S
@@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r0
#define X r1
#define INC_X r2
-#define OLD_Y r3
-
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
@@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* registers
*******************************************************/
-#define OLD_INC_Y [fp, #4 ]
+#if !defined(__ARM_PCS_VFP)
+#define OLD_RETURN_ADDR r0
+#define OLD_N r1
+#define OLD_X r2
+#define OLD_INC_X r3
+#define OLD_Y [fp, #0 ]
+#define OLD_INC_Y [fp, #4 ]
+#define RETURN_ADDR r8
+#else
+#define OLD_Y r3
+#define OLD_INC_Y [fp, #0 ]
+#endif
#define I r5
#define Y r6
@@ -179,7 +188,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.align 5
push {r4 - r9, fp}
- add fp, sp, #24
+ add fp, sp, #28
sub sp, sp, #STACKSIZE // reserve stack
sub r4, fp, #128
@@ -191,8 +200,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmov s2, s0
vmov s3, s0
+#if !defined(__ARM_PCS_VFP)
+ mov RETURN_ADDR, OLD_RETURN_ADDR
+ mov N, OLD_N
+ mov X, OLD_X
+ mov INC_X, OLD_INC_X
+ ldr Y, OLD_Y
+ ldr INC_Y, OLD_INC_Y
+#else
mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y
+#endif
cmp N, #0
ble cdot_kernel_L999
@@ -265,7 +283,6 @@ cdot_kernel_S10:
cdot_kernel_L999:
-
sub r3, fp, #128
vldm r3, { s8 - s15} // restore floating point registers
@@ -276,8 +293,11 @@ cdot_kernel_L999:
vadd.f32 s0 , s0, s2
vsub.f32 s1 , s1, s3
#endif
+#if !defined(__ARM_PCS_VFP)
+ vstm RETURN_ADDR, {s0 - s1}
+#endif
- sub sp, fp, #24
+ sub sp, fp, #28
pop {r4 - r9, fp}
bx lr
diff --git a/kernel/arm/cgemm_kernel_2x2_vfp.S b/kernel/arm/cgemm_kernel_2x2_vfp.S
index f0517cb47..71bc50efd 100644
--- a/kernel/arm/cgemm_kernel_2x2_vfp.S
+++ b/kernel/arm/cgemm_kernel_2x2_vfp.S
@@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHAR_SOFTFP r3
+#define OLD_ALPHAI_SOFTFP [fp, #4]
+#define OLD_A_SOFTFP [fp, #8 ]
+#define B [fp, #12 ]
+#define C [fp, #16 ]
+#define OLD_LDC [fp, #20 ]
+#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
+#endif
#define I r0
#define J r1
@@ -94,42 +103,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
- #define KMAC_R fnmacs
+ #define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
- #define FMAC_R2 fnmacs
+ #define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif defined(CN) || defined(CT)
#define KMAC_R fmacs
- #define KMAC_I fnmacs
+ #define KMAC_I vmls.f32
#define FMAC_R1 fmacs
- #define FMAC_R2 fnmacs
+ #define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif defined(NC) || defined(TC)
#define KMAC_R fmacs
- #define KMAC_I fnmacs
+ #define KMAC_I vmls.f32
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
- #define FMAC_I1 fnmacs
+ #define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#else
- #define KMAC_R fnmacs
+ #define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
- #define FMAC_I1 fnmacs
+ #define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#endif
@@ -816,6 +825,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
+#if !defined(__ARM_PCS_VFP)
+ vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
+ vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
+ ldr OLD_A, OLD_A_SOFTFP
+#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K
diff --git a/kernel/arm/cgemm_kernel_2x2_vfpv3.S b/kernel/arm/cgemm_kernel_2x2_vfpv3.S
index cf132a184..9d473ad78 100644
--- a/kernel/arm/cgemm_kernel_2x2_vfpv3.S
+++ b/kernel/arm/cgemm_kernel_2x2_vfpv3.S
@@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHAR_SOFTFP r3
+#define OLD_ALPHAI_SOFTFP [fp, #4]
+#define OLD_A_SOFTFP [fp, #8 ]
+#define B [fp, #12 ]
+#define C [fp, #16 ]
+#define OLD_LDC [fp, #20 ]
+#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
+#endif
#define I r0
#define J r1
@@ -106,10 +115,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubs
#define FADD_I fadds
- #define FMAC_R1 fnmacs
- #define FMAC_R2 fnmacs
+ #define FMAC_R1 vmls.f32
+ #define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
- #define FMAC_I2 fnmacs
+ #define FMAC_I2 vmls.f32
#elif defined(CN) || defined(CT)
@@ -118,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
- #define FMAC_I1 fnmacs
+ #define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#elif defined(NC) || defined(TC)
@@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_I fsubs
#define FMAC_R1 fmacs
- #define FMAC_R2 fnmacs
+ #define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
@@ -136,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubs
#define FADD_I fadds
- #define FMAC_R1 fnmacs
+ #define FMAC_R1 vmls.f32
#define FMAC_R2 fmacs
- #define FMAC_I1 fnmacs
- #define FMAC_I2 fnmacs
+ #define FMAC_I1 vmls.f32
+ #define FMAC_I2 vmls.f32
#endif
@@ -873,6 +882,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
+#if !defined(__ARM_PCS_VFP)
+ vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
+ vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
+ ldr OLD_A, OLD_A_SOFTFP
+#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K
diff --git a/kernel/arm/cgemv_n_vfp.S b/kernel/arm/cgemv_n_vfp.S
index 5d2748644..62ee33bb9 100644
--- a/kernel/arm/cgemv_n_vfp.S
+++ b/kernel/arm/cgemv_n_vfp.S
@@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
-#define OLD_LDA [fp, #0 ]
-#define X [fp, #4 ]
-#define OLD_INC_X [fp, #8 ]
-#define Y [fp, #12 ]
-#define OLD_INC_Y [fp, #16 ]
+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHAR r3
+#define OLD_ALPHAI [fp, #0 ]
+#define OLD_A_SOFTFP [fp, #4 ]
+#define OLD_LDA [fp, #8 ]
+#define X [fp, #12 ]
+#define OLD_INC_X [fp, #16 ]
+#define Y [fp, #20 ]
+#define OLD_INC_Y [fp, #24 ]
+#else
+#define OLD_LDA [fp, #0 ]
+#define X [fp, #4 ]
+#define OLD_INC_X [fp, #8 ]
+#define Y [fp, #12 ]
+#define OLD_INC_Y [fp, #16 ]
+#endif
+
#define OLD_A r3
#define OLD_M r0
@@ -78,42 +90,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(CONJ) && !defined(XCONJ)
- #define KMAC_R fnmacs
+ #define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
- #define FMAC_R2 fnmacs
+ #define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif defined(CONJ) && !defined(XCONJ)
#define KMAC_R fmacs
- #define KMAC_I fnmacs
+ #define KMAC_I vmls.f32
#define FMAC_R1 fmacs
- #define FMAC_R2 fnmacs
+ #define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif !defined(CONJ) && defined(XCONJ)
#define KMAC_R fmacs
- #define KMAC_I fnmacs
+ #define KMAC_I vmls.f32
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
- #define FMAC_I1 fnmacs
+ #define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#else
- #define KMAC_R fnmacs
+ #define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
- #define FMAC_I1 fnmacs
+ #define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#endif
@@ -462,6 +474,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp N, #0
ble cgemvn_kernel_L999
+#if !defined(__ARM_PCS_VFP)
+ vmov s0, OLD_ALPHAR
+ vldr s1, OLD_ALPHAI
+ ldr OLD_A, OLD_A_SOFTFP
+#endif
+
str OLD_A, A
str OLD_M, M
vstr s0 , ALPHA_R
diff --git a/kernel/arm/cgemv_t_vfp.S b/kernel/arm/cgemv_t_vfp.S
index 76c8a8f18..c07b6d6f8 100644
--- a/kernel/arm/cgemv_t_vfp.S
+++ b/kernel/arm/cgemv_t_vfp.S
@@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
-#define OLD_LDA [fp, #0 ]
-#define X [fp, #4 ]
-#define OLD_INC_X [fp, #8 ]
-#define Y [fp, #12 ]
-#define OLD_INC_Y [fp, #16 ]
+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHAR r3
+#define OLD_ALPHAI [fp, #0 ]
+#define OLD_A_SOFTFP [fp, #4 ]
+#define OLD_LDA [fp, #8 ]
+#define X [fp, #12 ]
+#define OLD_INC_X [fp, #16 ]
+#define Y [fp, #20 ]
+#define OLD_INC_Y [fp, #24 ]
+#else
+#define OLD_LDA [fp, #0 ]
+#define X [fp, #4 ]
+#define OLD_INC_X [fp, #8 ]
+#define Y [fp, #12 ]
+#define OLD_INC_Y [fp, #16 ]
+#endif
+
#define OLD_A r3
#define OLD_N r1
@@ -76,42 +88,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(CONJ) && !defined(XCONJ)
- #define KMAC_R fnmacs
+ #define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
- #define FMAC_R2 fnmacs
+ #define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif defined(CONJ) && !defined(XCONJ)
#define KMAC_R fmacs
- #define KMAC_I fnmacs
+ #define KMAC_I vmls.f32
#define FMAC_R1 fmacs
- #define FMAC_R2 fnmacs
+ #define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif !defined(CONJ) && defined(XCONJ)
#define KMAC_R fmacs
- #define KMAC_I fnmacs
+ #define KMAC_I vmls.f32
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
- #define FMAC_I1 fnmacs
+ #define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#else
- #define KMAC_R fnmacs
+ #define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
- #define FMAC_I1 fnmacs
+ #define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#endif
@@ -359,6 +371,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp OLD_N, #0
ble cgemvt_kernel_L999
+#if !defined(__ARM_PCS_VFP)
+ vmov s0, OLD_ALPHAR
+ vldr s1, OLD_ALPHAI
+ ldr OLD_A, OLD_A_SOFTFP
+#endif
+
str OLD_A, A
str OLD_N, N
diff --git a/kernel/arm/ctrmm_kernel_2x2_vfp.S b/kernel/arm/ctrmm_kernel_2x2_vfp.S
index 8cb7ede9d..aae890ea9 100644
--- a/kernel/arm/ctrmm_kernel_2x2_vfp.S
+++ b/kernel/arm/ctrmm_kernel_2x2_vfp.S
@@ -67,10 +67,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHAR_SOFTFP r3
+#define OLD_ALPHAI_SOFTFP [fp, #4]
+#define OLD_A_SOFTFP [fp, #8 ]
+#define B [fp, #12 ]
+#define C [fp, #16 ]
+#define OLD_LDC [fp, #20 ]
+#define OFFSET [fp, #24 ]
+#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ]
+#endif
#define I r0
#define J r1
@@ -98,42 +108,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
- #define KMAC_R fnmacs
+ #define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
- #define FMAC_R2 fnmacs
+ #define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif defined(CN) || defined(CT)
#define KMAC_R fmacs
- #define KMAC_I fnmacs
+ #define KMAC_I vmls.f32
#define FMAC_R1 fmacs
- #define FMAC_R2 fnmacs
+ #define FMAC_R2 vmls.f32
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif defined(NC) || defined(TC)
#define KMAC_R fmacs
- #define KMAC_I fnmacs
+ #define KMAC_I vmls.f32
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
- #define FMAC_I1 fnmacs
+ #define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#else
- #define KMAC_R fnmacs
+ #define KMAC_R vmls.f32
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
- #define FMAC_I1 fnmacs
+ #define FMAC_I1 vmls.f32
#define FMAC_I2 fmacs
#endif
@@ -826,6 +836,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
+#if !defined(__ARM_PCS_VFP)
+ vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
+ vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
+ ldr OLD_A, OLD_A_SOFTFP
+#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K
diff --git a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S
index 97bd88c69..79e7ed07f 100644
--- a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S
+++ b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S
@@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHAR_SOFTFP r3
+#define OLD_ALPHAI_SOFTFP [fp, #4]
+#define OLD_A_SOFTFP [fp, #8 ]
+#define B [fp, #12 ]
+#define C [fp, #16 ]
+#define OLD_LDC [fp, #20 ]
+#define OFFSET [fp, #24 ]
+#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ]
+#endif
#define I r0
#define J r1
@@ -93,10 +103,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubs
#define FADD_I fadds
- #define FMAC_R1 fnmuls
- #define FMAC_R2 fnmacs
+ #define FMAC_R1 vnmul.f32
+ #define FMAC_R2 vmls.f32
#define FMAC_I1 fmuls
- #define FMAC_I2 fnmacs
+ #define FMAC_I2 vmls.f32
#elif defined(CN) || defined(CT)
@@ -105,7 +115,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FMAC_R1 fmuls
#define FMAC_R2 fmacs
- #define FMAC_I1 fnmuls
+ #define FMAC_I1 vnmul.f32
#define FMAC_I2 fmacs
#elif defined(NC) || defined(TC)
@@ -114,7 +124,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_I fsubs
#define FMAC_R1 fmuls
- #define FMAC_R2 fnmacs
+ #define FMAC_R2 vmls.f32
#define FMAC_I1 fmuls
#define FMAC_I2 fmacs
@@ -123,10 +133,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubs
#define FADD_I fadds
- #define FMAC_R1 fnmuls
+ #define FMAC_R1 vnmul.f32
#define FMAC_R2 fmacs
- #define FMAC_I1 fnmuls
- #define FMAC_I2 fnmacs
+ #define FMAC_I1 vnmul.f32
+ #define FMAC_I2 vmls.f32
#endif
@@ -846,6 +856,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
+#if !defined(__ARM_PCS_VFP)
+ vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
+ vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
+ ldr OLD_A, OLD_A_SOFTFP
+#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K
diff --git a/kernel/arm/ddot_vfp.S b/kernel/arm/ddot_vfp.S
index f28acbae3..fb294d8b4 100644
--- a/kernel/arm/ddot_vfp.S
+++ b/kernel/arm/ddot_vfp.S
@@ -246,6 +246,9 @@ ddot_kernel_L999:
vldm r3, { d8 - d15} // restore floating point registers
vadd.f64 d0 , d0, d1 // set return value
+#if !defined(__ARM_PCS_VFP)
+ vmov r0, r1, d0
+#endif
sub sp, fp, #24
pop {r4 - r9, fp}
bx lr
diff --git a/kernel/arm/dgemm_kernel_4x2_vfp.S b/kernel/arm/dgemm_kernel_4x2_vfp.S
index 183269d1b..001a6050c 100644
--- a/kernel/arm/dgemm_kernel_4x2_vfp.S
+++ b/kernel/arm/dgemm_kernel_4x2_vfp.S
@@ -62,10 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-280]
-
+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHA_SOFTFP [fp, #4]
+#define OLD_A_SOFTFP [fp, #12 ]
+#define B [fp, #16 ]
+#define C [fp, #20 ]
+#define OLD_LDC [fp, #24 ]
+#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
+#endif
#define I r0
#define J r1
@@ -429,6 +436,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
+#if !defined(__ARM_PCS_VFP)
+ vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
+ ldr OLD_A, OLD_A_SOFTFP
+#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K
diff --git a/kernel/arm/dgemm_kernel_4x4_vfpv3.S b/kernel/arm/dgemm_kernel_4x4_vfpv3.S
index b14052e06..1744b54d8 100644
--- a/kernel/arm/dgemm_kernel_4x4_vfpv3.S
+++ b/kernel/arm/dgemm_kernel_4x4_vfpv3.S
@@ -79,9 +79,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-280]
+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHA_SOFTFP [fp, #4]
+#define OLD_A_SOFTFP [fp, #12 ]
+#define B [fp, #16 ]
+#define C [fp, #20 ]
+#define OLD_LDC [fp, #24 ]
+#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
+#endif
#define I r0
#define J r1
@@ -878,6 +886,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
+#if !defined(__ARM_PCS_VFP)
+ vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
+ ldr OLD_A, OLD_A_SOFTFP
+#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K
diff --git a/kernel/arm/dtrmm_kernel_4x2_vfp.S b/kernel/arm/dtrmm_kernel_4x2_vfp.S
index c578d2b1e..3d6fbf8e9 100644
--- a/kernel/arm/dtrmm_kernel_4x2_vfp.S
+++ b/kernel/arm/dtrmm_kernel_4x2_vfp.S
@@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-276 ]
+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHA_SOFTFP [fp, #4]
+#define OLD_A_SOFTFP [fp, #12 ]
+#define B [fp, #16 ]
+#define OLD_C [fp, #20 ]
+#define OLD_LDC [fp, #24 ]
+#define OFFSET [fp, #28 ]
+#else
#define B [fp, #4 ]
#define OLD_C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ]
+#endif
#define I r0
#define J r1
@@ -404,6 +413,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
+#if !defined(__ARM_PCS_VFP)
+ vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
+ ldr OLD_A, OLD_A_SOFTFP
+#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K
diff --git a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S
index c7e455f16..c0c6a1677 100644
--- a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S
+++ b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S
@@ -66,10 +66,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-276 ]
+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHA_SOFTFP [fp, #4]
+#define OLD_A_SOFTFP [fp, #12 ]
+#define B [fp, #16 ]
+#define OLD_C [fp, #20 ]
+#define OLD_LDC [fp, #24 ]
+#define OFFSET [fp, #28 ]
+#else
#define B [fp, #4 ]
#define OLD_C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ]
+#endif
#define I r0
#define J r1
@@ -846,6 +855,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
+#if !defined(__ARM_PCS_VFP)
+ vldr OLD_ALPHA, OLD_ALPHA_SOFTFP
+ ldr OLD_A, OLD_A_SOFTFP
+#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K
diff --git a/kernel/arm/gemv_n_vfp.S b/kernel/arm/gemv_n_vfp.S
index 385370b7f..7c154d741 100644
--- a/kernel/arm/gemv_n_vfp.S
+++ b/kernel/arm/gemv_n_vfp.S
@@ -38,11 +38,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
-#define OLD_LDA [fp, #0 ]
-#define X [fp, #4 ]
-#define OLD_INC_X [fp, #8 ]
-#define Y [fp, #12 ]
-#define OLD_INC_Y [fp, #16 ]
+#if !defined(__ARM_PCS_VFP)
+
+#if !defined(DOUBLE)
+#define OLD_ALPHA r3
+#define OLD_A_SOFTFP [fp, #0 ]
+#define OLD_LDA [fp, #4 ]
+#define X [fp, #8 ]
+#define OLD_INC_X [fp, #12 ]
+#define Y [fp, #16 ]
+#define OLD_INC_Y [fp, #20 ]
+#else
+#define OLD_ALPHA [fp, #0 ]
+#define OLD_A_SOFTFP [fp, #8 ]
+#define OLD_LDA [fp, #12]
+#define X [fp, #16]
+#define OLD_INC_X [fp, #20]
+#define Y [fp, #24]
+#define OLD_INC_Y [fp, #28]
+#endif
+
+#else
+
+#define OLD_LDA [fp, #0 ]
+#define X [fp, #4 ]
+#define OLD_INC_X [fp, #8 ]
+#define Y [fp, #12 ]
+#define OLD_INC_Y [fp, #16 ]
+
+#endif
+
#define OLD_A r3
#define OLD_M r0
@@ -508,6 +533,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp N, #0
ble gemvn_kernel_L999
+#if !defined(__ARM_PCS_VFP)
+#if !defined(DOUBLE)
+ vmov s0, OLD_ALPHA
+#else
+ vldr d0, OLD_ALPHA
+#endif
+ ldr OLD_A, OLD_A_SOFTFP
+#endif
+
str OLD_A, A
str OLD_M, M
diff --git a/kernel/arm/gemv_n_vfpv3.S b/kernel/arm/gemv_n_vfpv3.S
index 93bf23e49..54f958b7b 100644
--- a/kernel/arm/gemv_n_vfpv3.S
+++ b/kernel/arm/gemv_n_vfpv3.S
@@ -38,25 +38,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
-#ifndef ARM_SOFTFP_ABI
-//hard
-#define OLD_LDA [fp, #0 ]
-#define X [fp, #4 ]
-#define OLD_INC_X [fp, #8 ]
-#define Y [fp, #12 ]
-#define OLD_INC_Y [fp, #16 ]
-#define OLD_A r3
-#else
-#define OLD_A_SOFTFP [fp, #0 ]
-#define OLD_LDA [fp, #4 ]
-#define X [fp, #8 ]
-#define OLD_INC_X [fp, #12 ]
-#define Y [fp, #16 ]
-#define OLD_INC_Y [fp, #20 ]
+#if !defined(__ARM_PCS_VFP)
+
+#if !defined(DOUBLE)
#define OLD_ALPHA r3
-#define OLD_A r3
+#define OLD_A_SOFTFP [fp, #0 ]
+#define OLD_LDA [fp, #4 ]
+#define X [fp, #8 ]
+#define OLD_INC_X [fp, #12 ]
+#define Y [fp, #16 ]
+#define OLD_INC_Y [fp, #20 ]
+#else
+#define OLD_ALPHA [fp, #0 ]
+#define OLD_A_SOFTFP [fp, #8 ]
+#define OLD_LDA [fp, #12]
+#define X [fp, #16]
+#define OLD_INC_X [fp, #20]
+#define Y [fp, #24]
+#define OLD_INC_Y [fp, #28]
+#endif
+
+#else
+
+#define OLD_LDA [fp, #0 ]
+#define X [fp, #4 ]
+#define OLD_INC_X [fp, #8 ]
+#define Y [fp, #12 ]
+#define OLD_INC_Y [fp, #16 ]
+
#endif
+#define OLD_A r3
#define OLD_M r0
#define AO1 r0
@@ -565,18 +577,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp N, #0
ble gemvn_kernel_L999
-#ifndef DOUBLE
-#ifdef ARM_SOFTFP_ABI
-
- vmov s0, OLD_ALPHA
- ldr OLD_A, OLD_A_SOFTFP
+#if !defined(__ARM_PCS_VFP)
+#if !defined(DOUBLE)
+ vmov s0, OLD_ALPHA
+#else
+ vldr d0, OLD_ALPHA
#endif
+ ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_A, A
str OLD_M, M
-
-
+
ldr INC_X , OLD_INC_X
ldr INC_Y , OLD_INC_Y
diff --git a/kernel/arm/gemv_t_vfp.S b/kernel/arm/gemv_t_vfp.S
index 816be54ff..9559d1829 100644
--- a/kernel/arm/gemv_t_vfp.S
+++ b/kernel/arm/gemv_t_vfp.S
@@ -38,25 +38,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
-#ifndef ARM_SOFTFP_ABI
-//hard abi
-#define OLD_LDA [fp, #0 ]
-#define X [fp, #4 ]
-#define OLD_INC_X [fp, #8 ]
-#define Y [fp, #12 ]
-#define OLD_INC_Y [fp, #16 ]
-#define OLD_A r3
-#else
-#define OLD_A_SOFTFP [fp, #0 ]
-#define OLD_LDA [fp, #4 ]
-#define X [fp, #8 ]
-#define OLD_INC_X [fp, #12 ]
-#define Y [fp, #16 ]
-#define OLD_INC_Y [fp, #20 ]
+#if !defined(__ARM_PCS_VFP)
+
+#if !defined(DOUBLE)
#define OLD_ALPHA r3
-#define OLD_A r3
+#define OLD_A_SOFTFP [fp, #0 ]
+#define OLD_LDA [fp, #4 ]
+#define X [fp, #8 ]
+#define OLD_INC_X [fp, #12 ]
+#define Y [fp, #16 ]
+#define OLD_INC_Y [fp, #20 ]
+#else
+#define OLD_ALPHA [fp, #0 ]
+#define OLD_A_SOFTFP [fp, #8 ]
+#define OLD_LDA [fp, #12]
+#define X [fp, #16]
+#define OLD_INC_X [fp, #20]
+#define Y [fp, #24]
+#define OLD_INC_Y [fp, #28]
#endif
+#else
+
+#define OLD_LDA [fp, #0 ]
+#define X [fp, #4 ]
+#define OLD_INC_X [fp, #8 ]
+#define Y [fp, #12 ]
+#define OLD_INC_Y [fp, #16 ]
+
+#endif
+
+#define OLD_A r3
#define OLD_N r1
#define M r0
@@ -518,11 +530,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp OLD_N, #0
ble gemvt_kernel_L999
-#ifndef DOUBLE
-#ifdef ARM_SOFTFP_ABI
- vmov s0, OLD_ALPHA
- ldr OLD_A, OLD_A_SOFTFP
+#if !defined(__ARM_PCS_VFP)
+#if !defined(DOUBLE)
+ vmov s0, OLD_ALPHA
+#else
+ vldr d0, OLD_ALPHA
#endif
+ ldr OLD_A, OLD_A_SOFTFP
#endif
str OLD_A, A
diff --git a/kernel/arm/gemv_t_vfpv3.S b/kernel/arm/gemv_t_vfpv3.S
index 7ae5799bc..b1d3dadf1 100644
--- a/kernel/arm/gemv_t_vfpv3.S
+++ b/kernel/arm/gemv_t_vfpv3.S
@@ -38,11 +38,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
-#define OLD_LDA [fp, #0 ]
-#define X [fp, #4 ]
-#define OLD_INC_X [fp, #8 ]
-#define Y [fp, #12 ]
-#define OLD_INC_Y [fp, #16 ]
+#if !defined(__ARM_PCS_VFP)
+
+#if !defined(DOUBLE)
+#define OLD_ALPHA r3
+#define OLD_A_SOFTFP [fp, #0 ]
+#define OLD_LDA [fp, #4 ]
+#define X [fp, #8 ]
+#define OLD_INC_X [fp, #12 ]
+#define Y [fp, #16 ]
+#define OLD_INC_Y [fp, #20 ]
+#else
+#define OLD_ALPHA [fp, #0 ]
+#define OLD_A_SOFTFP [fp, #8 ]
+#define OLD_LDA [fp, #12]
+#define X [fp, #16]
+#define OLD_INC_X [fp, #20]
+#define Y [fp, #24]
+#define OLD_INC_Y [fp, #28]
+#endif
+
+#else
+
+#define OLD_LDA [fp, #0 ]
+#define X [fp, #4 ]
+#define OLD_INC_X [fp, #8 ]
+#define Y [fp, #12 ]
+#define OLD_INC_Y [fp, #16 ]
+
+#endif
+
#define OLD_A r3
#define OLD_N r1
@@ -476,6 +501,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp OLD_N, #0
ble gemvt_kernel_L999
+#if !defined(__ARM_PCS_VFP)
+#if !defined(DOUBLE)
+ vmov s0, OLD_ALPHA
+#else
+ vldr d0, OLD_ALPHA
+#endif
+ ldr OLD_A, OLD_A_SOFTFP
+#endif
+
str OLD_A, A
str OLD_N, N
diff --git a/kernel/arm/nrm2_vfp.S b/kernel/arm/nrm2_vfp.S
index b3bd28152..16ac5a632 100644
--- a/kernel/arm/nrm2_vfp.S
+++ b/kernel/arm/nrm2_vfp.S
@@ -574,6 +574,13 @@ nrm2_kernel_L999:
vsqrt.f32 s1, s1
vmul.f32 s0, s0, s1
#endif
+#if !defined(__ARM_PCS_VFP)
+#if !defined(DOUBLE)
+ vmov r0, s0
+#else
+ vmov r0, r1, d0
+#endif
+#endif
bx lr
diff --git a/kernel/arm/nrm2_vfpv3.S b/kernel/arm/nrm2_vfpv3.S
index 7af966895..84977901d 100644
--- a/kernel/arm/nrm2_vfpv3.S
+++ b/kernel/arm/nrm2_vfpv3.S
@@ -503,8 +503,13 @@ nrm2_kernel_L999:
#else
vsqrt.f32 s1, s1
vmul.f32 s0, s0, s1
-#ifdef ARM_SOFTFP_ABI
- vmov r0, s0
+#endif
+
+#if !defined(__ARM_PCS_VFP)
+#if defined(DOUBLE)
+ vmov r0, r1, d0
+#else
+ vmov r0, s0
#endif
#endif
diff --git a/kernel/arm/rot_vfp.S b/kernel/arm/rot_vfp.S
index d053423b6..25f563690 100644
--- a/kernel/arm/rot_vfp.S
+++ b/kernel/arm/rot_vfp.S
@@ -40,6 +40,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define OLD_INC_Y [fp, #0 ]
+#if !defined(__ARM_PCS_VFP)
+#if !defined(DOUBLE)
+#define OLD_C [fp, #4]
+#define OLD_S [fp, #8]
+#else
+#define OLD_C [fp, #8]
+#define OLD_S [fp, #16]
+#endif
+#endif
#define N r0
#define X r1
@@ -73,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5
- fnmacd d3 , d1, d4
+ vmls.f64 d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
@@ -82,7 +91,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5
- fnmacd d3 , d1, d4
+ vmls.f64 d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
@@ -91,7 +100,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5
- fnmacd d3 , d1, d4
+ vmls.f64 d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
@@ -100,7 +109,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5
- fnmacd d3 , d1, d4
+ vmls.f64 d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
@@ -114,7 +123,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5
- fnmacd d3 , d1, d4
+ vmls.f64 d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
@@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5
- fnmacd d3 , d1, d4
+ vmls.f64 d3 , d1, d4
fstmiad X, { d2 }
fstmiad Y, { d3 }
@@ -145,7 +154,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s5
vmul.f32 s3 , s0, s5
- fnmacs s3 , s1, s4
+ vmls.f32 s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
@@ -154,7 +163,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s5
vmul.f32 s3 , s0, s5
- fnmacs s3 , s1, s4
+ vmls.f32 s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
@@ -163,7 +172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s5
vmul.f32 s3 , s0, s5
- fnmacs s3 , s1, s4
+ vmls.f32 s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
@@ -172,7 +181,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s5
vmul.f32 s3 , s0, s5
- fnmacs s3 , s1, s4
+ vmls.f32 s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
@@ -186,7 +195,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s5
vmul.f32 s3 , s0, s5
- fnmacs s3 , s1, s4
+ vmls.f32 s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
@@ -199,7 +208,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s5
vmul.f32 s3 , s0, s5
- fnmacs s3 , s1, s4
+ vmls.f32 s3 , s1, s4
fstmias X, { s2 }
fstmias Y, { s3 }
@@ -226,13 +235,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d6
vmul.f64 d3 , d0, d6
- fnmacd d3 , d1, d4
+ vmls.f64 d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
vmul.f64 d2 , d0, d5
fmacd d2 , d1, d7
vmul.f64 d3 , d0, d7
- fnmacd d3 , d1, d5
+ vmls.f64 d3 , d1, d5
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
@@ -241,13 +250,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d6
vmul.f64 d3 , d0, d6
- fnmacd d3 , d1, d4
+ vmls.f64 d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
vmul.f64 d2 , d0, d5
fmacd d2 , d1, d7
vmul.f64 d3 , d0, d7
- fnmacd d3 , d1, d5
+ vmls.f64 d3 , d1, d5
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
@@ -259,13 +268,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d6
vmul.f64 d3 , d0, d6
- fnmacd d3 , d1, d4
+ vmls.f64 d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
vmul.f64 d2 , d0, d5
fmacd d2 , d1, d7
vmul.f64 d3 , d0, d7
- fnmacd d3 , d1, d5
+ vmls.f64 d3 , d1, d5
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
@@ -274,13 +283,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d6
vmul.f64 d3 , d0, d6
- fnmacd d3 , d1, d4
+ vmls.f64 d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
vmul.f64 d2 , d0, d5
fmacd d2 , d1, d7
vmul.f64 d3 , d0, d7
- fnmacd d3 , d1, d5
+ vmls.f64 d3 , d1, d5
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
@@ -294,13 +303,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d6
vmul.f64 d3 , d0, d6
- fnmacd d3 , d1, d4
+ vmls.f64 d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
vmul.f64 d2 , d0, d5
fmacd d2 , d1, d7
vmul.f64 d3 , d0, d7
- fnmacd d3 , d1, d5
+ vmls.f64 d3 , d1, d5
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
@@ -314,13 +323,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d6
vmul.f64 d3 , d0, d6
- fnmacd d3 , d1, d4
+ vmls.f64 d3 , d1, d4
vstr d2 , [ X, #0 ]
vstr d3 , [ Y, #0 ]
vmul.f64 d2 , d0, d5
fmacd d2 , d1, d7
vmul.f64 d3 , d0, d7
- fnmacd d3 , d1, d5
+ vmls.f64 d3 , d1, d5
vstr d2 , [ X, #8 ]
vstr d3 , [ Y, #8 ]
@@ -343,13 +352,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s6
vmul.f32 s3 , s0, s6
- fnmacs s3 , s1, s4
+ vmls.f32 s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
vmul.f32 s2 , s0, s5
fmacs s2 , s1, s7
vmul.f32 s3 , s0, s7
- fnmacs s3 , s1, s5
+ vmls.f32 s3 , s1, s5
fstmias X!, { s2 }
fstmias Y!, { s3 }
@@ -358,13 +367,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s6
vmul.f32 s3 , s0, s6
- fnmacs s3 , s1, s4
+ vmls.f32 s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
vmul.f32 s2 , s0, s5
fmacs s2 , s1, s7
vmul.f32 s3 , s0, s7
- fnmacs s3 , s1, s5
+ vmls.f32 s3 , s1, s5
fstmias X!, { s2 }
fstmias Y!, { s3 }
@@ -376,13 +385,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s6
vmul.f32 s3 , s0, s6
- fnmacs s3 , s1, s4
+ vmls.f32 s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
vmul.f32 s2 , s0, s5
fmacs s2 , s1, s7
vmul.f32 s3 , s0, s7
- fnmacs s3 , s1, s5
+ vmls.f32 s3 , s1, s5
fstmias X!, { s2 }
fstmias Y!, { s3 }
@@ -391,13 +400,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s6
vmul.f32 s3 , s0, s6
- fnmacs s3 , s1, s4
+ vmls.f32 s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
vmul.f32 s2 , s0, s5
fmacs s2 , s1, s7
vmul.f32 s3 , s0, s7
- fnmacs s3 , s1, s5
+ vmls.f32 s3 , s1, s5
fstmias X!, { s2 }
fstmias Y!, { s3 }
@@ -411,13 +420,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s6
vmul.f32 s3 , s0, s6
- fnmacs s3 , s1, s4
+ vmls.f32 s3 , s1, s4
fstmias X!, { s2 }
fstmias Y!, { s3 }
vmul.f32 s2 , s0, s5
fmacs s2 , s1, s7
vmul.f32 s3 , s0, s7
- fnmacs s3 , s1, s5
+ vmls.f32 s3 , s1, s5
fstmias X!, { s2 }
fstmias Y!, { s3 }
@@ -431,13 +440,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vmul.f32 s2 , s0, s4
fmacs s2 , s1, s6
vmul.f32 s3 , s0, s6
- fnmacs s3 , s1, s4
+ vmls.f32 s3 , s1, s4
vstr s2 , [ X, #0 ]
vstr s3 , [ Y, #0 ]
vmul.f32 s2 , s0, s5
fmacs s2 , s1, s7
vmul.f32 s3 , s0, s7
- fnmacs s3 , s1, s5
+ vmls.f32 s3 , s1, s5
vstr s2 , [ X, #4 ]
vstr s3 , [ Y, #4 ]
@@ -462,7 +471,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #8
ldr INC_Y , OLD_INC_Y
-
+#if !defined(__ARM_PCS_VFP)
+#if !defined(DOUBLE)
+ vldr s0, OLD_C
+ vldr s1, OLD_S
+#else
+ vldr d0, OLD_C
+ vldr d1, OLD_S
+#endif
+#endif
cmp N, #0
ble rot_kernel_L999
diff --git a/kernel/arm/scal_vfp.S b/kernel/arm/scal_vfp.S
index a8939c3a2..cc3e3b98d 100644
--- a/kernel/arm/scal_vfp.S
+++ b/kernel/arm/scal_vfp.S
@@ -138,14 +138,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmiad X, { d4 - d5 }
vmul.f64 d2, d0, d4
- fnmacd d2, d1, d5
+ vmls.f64 d2, d1, d5
vmul.f64 d3, d0, d5
fmacd d3, d1, d4
fstmiad X!, { d2 - d3 }
fldmiad X, { d4 - d5 }
vmul.f64 d2, d0, d4
- fnmacd d2, d1, d5
+ vmls.f64 d2, d1, d5
vmul.f64 d3, d0, d5
fmacd d3, d1, d4
fstmiad X!, { d2 - d3 }
@@ -154,14 +154,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmiad X, { d4 - d5 }
vmul.f64 d2, d0, d4
- fnmacd d2, d1, d5
+ vmls.f64 d2, d1, d5
vmul.f64 d3, d0, d5
fmacd d3, d1, d4
fstmiad X!, { d2 - d3 }
fldmiad X, { d4 - d5 }
vmul.f64 d2, d0, d4
- fnmacd d2, d1, d5
+ vmls.f64 d2, d1, d5
vmul.f64 d3, d0, d5
fmacd d3, d1, d4
fstmiad X!, { d2 - d3 }
@@ -173,7 +173,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmiad X, { d4 - d5 }
vmul.f64 d2, d0, d4
- fnmacd d2, d1, d5
+ vmls.f64 d2, d1, d5
vmul.f64 d3, d0, d5
fmacd d3, d1, d4
fstmiad X!, { d2 - d3 }
@@ -184,7 +184,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmiad X, { d4 - d5 }
vmul.f64 d2, d0, d4
- fnmacd d2, d1, d5
+ vmls.f64 d2, d1, d5
vmul.f64 d3, d0, d5
fmacd d3, d1, d4
fstmiad X, { d2 - d3 }
@@ -201,28 +201,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmias X, { s4 - s5 }
vmul.f32 s2, s0, s4
- fnmacs s2, s1, s5
+ vmls.f32 s2, s1, s5
vmul.f32 s3, s0, s5
fmacs s3, s1, s4
fstmias X!, { s2 - s3 }
fldmias X, { s4 - s5 }
vmul.f32 s2, s0, s4
- fnmacs s2, s1, s5
+ vmls.f32 s2, s1, s5
vmul.f32 s3, s0, s5
fmacs s3, s1, s4
fstmias X!, { s2 - s3 }
fldmias X, { s4 - s5 }
vmul.f32 s2, s0, s4
- fnmacs s2, s1, s5
+ vmls.f32 s2, s1, s5
vmul.f32 s3, s0, s5
fmacs s3, s1, s4
fstmias X!, { s2 - s3 }
fldmias X, { s4 - s5 }
vmul.f32 s2, s0, s4
- fnmacs s2, s1, s5
+ vmls.f32 s2, s1, s5
vmul.f32 s3, s0, s5
fmacs s3, s1, s4
fstmias X!, { s2 - s3 }
@@ -234,7 +234,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmias X, { s4 - s5 }
vmul.f32 s2, s0, s4
- fnmacs s2, s1, s5
+ vmls.f32 s2, s1, s5
vmul.f32 s3, s0, s5
fmacs s3, s1, s4
fstmias X!, { s2 - s3 }
@@ -245,7 +245,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fldmias X, { s4 - s5 }
vmul.f32 s2, s0, s4
- fnmacs s2, s1, s5
+ vmls.f32 s2, s1, s5
vmul.f32 s3, s0, s5
fmacs s3, s1, s4
fstmias X, { s2 - s3 }
diff --git a/kernel/arm/sdot_vfp.S b/kernel/arm/sdot_vfp.S
index f3abdc197..5f4f424bf 100644
--- a/kernel/arm/sdot_vfp.S
+++ b/kernel/arm/sdot_vfp.S
@@ -329,20 +329,19 @@ sdot_kernel_L999:
vldm r3, { s8 - s15} // restore floating point registers
#if defined(DSDOT)
-
vadd.f64 d0 , d0, d1 // set return value
-
-#ifdef ARM_SOFTFP_ABI
- vmov r0, r1, d0
+#else
+ vadd.f32 s0 , s0, s1 // set return value
#endif
+#if !defined(__ARM_PCS_VFP)
+#if defined(DSDOT)
+ vmov r0, r1, d0
#else
-
- vadd.f32 s0 , s0, s1 // set return value
-#ifdef ARM_SOFTFP_ABI
vmov r0, s0
#endif
#endif
+
sub sp, fp, #24
pop {r4 - r9, fp}
bx lr
diff --git a/kernel/arm/sgemm_kernel_4x2_vfp.S b/kernel/arm/sgemm_kernel_4x2_vfp.S
index e8b44b742..1f21e5a1f 100644
--- a/kernel/arm/sgemm_kernel_4x2_vfp.S
+++ b/kernel/arm/sgemm_kernel_4x2_vfp.S
@@ -62,9 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-280]
+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHA_SOFTFP r3
+#define OLD_A_SOFTFP [fp, #4 ]
+#define B [fp, #8 ]
+#define C [fp, #12 ]
+#define OLD_LDC [fp, #16 ]
+#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
+#endif
#define I r0
#define J r1
@@ -416,6 +424,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
+#if !defined(__ARM_PCS_VFP)
+ vmov OLD_ALPHA, OLD_ALPHA_SOFTFP
+ ldr OLD_A, OLD_A_SOFTFP
+#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K
diff --git a/kernel/arm/sgemm_kernel_4x4_vfpv3.S b/kernel/arm/sgemm_kernel_4x4_vfpv3.S
index 86198ac90..6491d3571 100644
--- a/kernel/arm/sgemm_kernel_4x4_vfpv3.S
+++ b/kernel/arm/sgemm_kernel_4x4_vfpv3.S
@@ -58,14 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define OLD_M r0
#define OLD_N r1
#define OLD_K r2
-
-#ifdef ARM_SOFTFP_ABI
-#define OLD_ALPHA r3
-//#define OLD_A
-#else //hard
#define OLD_A r3
#define OLD_ALPHA s0
-#endif
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
@@ -77,10 +71,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define M [fp, #-256 ]
#define N [fp, #-260 ]
#define K [fp, #-264 ]
-
-#ifndef ARM_SOFTFP_ABI
#define A [fp, #-268 ]
-#endif
#define FP_ZERO [fp, #-240]
#define FP_ZERO_0 [fp, #-240]
@@ -88,17 +79,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-280]
-#ifdef ARM_SOFTFP_ABI
-#define A [fp, #4 ]
+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHA_SOFTFP r3
+#define OLD_A_SOFTFP [fp, #4 ]
#define B [fp, #8 ]
#define C [fp, #12 ]
#define OLD_LDC [fp, #16 ]
-#else //hard
+#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#endif
-
+
#define I r0
#define J r1
#define L r2
@@ -867,16 +859,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
+#if !defined(__ARM_PCS_VFP)
+ vmov OLD_ALPHA, OLD_ALPHA_SOFTFP
+ ldr OLD_A, OLD_A_SOFTFP
+#endif
+
str OLD_M, M
str OLD_N, N
str OLD_K, K
-
-#ifdef ARM_SOFTFP_ABI
- str OLD_ALPHA, ALPHA
-#else //hard
str OLD_A, A
vstr OLD_ALPHA, ALPHA
-#endif
+
sub r3, fp, #128
vstm r3, { s8 - s31} // store floating point registers
diff --git a/kernel/arm/strmm_kernel_4x2_vfp.S b/kernel/arm/strmm_kernel_4x2_vfp.S
index 8f97644ec..635b1dd13 100644
--- a/kernel/arm/strmm_kernel_4x2_vfp.S
+++ b/kernel/arm/strmm_kernel_4x2_vfp.S
@@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-276 ]
+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHA_SOFTFP r3
+#define OLD_A_SOFTFP [fp, #4 ]
+#define B [fp, #8 ]
+#define OLD_C [fp, #12 ]
+#define OLD_LDC [fp, #16 ]
+#define OFFSET [fp, #20 ]
+#else
#define B [fp, #4 ]
#define OLD_C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ]
+#endif
#define I r0
#define J r1
@@ -395,6 +404,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
+#if !defined(__ARM_PCS_VFP)
+ vmov OLD_ALPHA, OLD_ALPHA_SOFTFP
+ ldr OLD_A, OLD_A_SOFTFP
+#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K
diff --git a/kernel/arm/strmm_kernel_4x4_vfpv3.S b/kernel/arm/strmm_kernel_4x4_vfpv3.S
index 0dd03ac85..e24d24eba 100644
--- a/kernel/arm/strmm_kernel_4x4_vfpv3.S
+++ b/kernel/arm/strmm_kernel_4x4_vfpv3.S
@@ -64,10 +64,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA [fp, #-280]
+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHA_SOFTFP r3
+#define OLD_A_SOFTFP [fp, #4 ]
+#define B [fp, #8 ]
+#define C [fp, #12 ]
+#define OLD_LDC [fp, #16 ]
+#define OFFSET [fp, #20 ]
+#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ]
+#endif
#define I r0
#define J r1
@@ -782,6 +791,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
+#if !defined(__ARM_PCS_VFP)
+ vmov OLD_ALPHA, OLD_ALPHA_SOFTFP
+ ldr OLD_A, OLD_A_SOFTFP
+#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K
diff --git a/kernel/arm/swap_vfp.S b/kernel/arm/swap_vfp.S
index 352875188..76661da79 100644
--- a/kernel/arm/swap_vfp.S
+++ b/kernel/arm/swap_vfp.S
@@ -38,9 +38,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
+#if !defined(__ARM_PCS_VFP)
+
+#if !defined(COMPLEX)
+
+#if !defined(DOUBLE)
+#define OLD_X [fp, #0 ]
+#define OLD_INC_X [fp, #4 ]
+#define OLD_Y [fp, #8 ]
+#define OLD_INC_Y [fp, #12 ]
+#else
+#define OLD_X [fp, #8 ]
+#define OLD_INC_X [fp, #12]
+#define OLD_Y [fp, #16]
+#define OLD_INC_Y [fp, #20]
+#endif
+
+#else //COMPLEX
+
+#if !defined(DOUBLE)
+#define OLD_X [fp, #4 ]
+#define OLD_INC_X [fp, #8 ]
+#define OLD_Y [fp, #12 ]
+#define OLD_INC_Y [fp, #16 ]
+#else
+#define OLD_X [fp, #16]
+#define OLD_INC_X [fp, #20]
+#define OLD_Y [fp, #24]
+#define OLD_INC_Y [fp, #28]
+#endif
+
+#endif // !defined(__ARM_PCS_VFP)
+
+#else
#define OLD_INC_X [fp, #0 ]
#define OLD_Y [fp, #4 ]
#define OLD_INC_Y [fp, #8 ]
+#endif
#define N r0
@@ -229,6 +263,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
push {r4 , fp}
add fp, sp, #8
+#if !defined(__ARM_PCS_VFP)
+ ldr X, OLD_X
+#endif
ldr INC_X , OLD_INC_X
ldr Y, OLD_Y
ldr INC_Y , OLD_INC_Y
diff --git a/kernel/arm/zdot_vfp.S b/kernel/arm/zdot_vfp.S
index 936ce9f60..43f2c0c0b 100644
--- a/kernel/arm/zdot_vfp.S
+++ b/kernel/arm/zdot_vfp.S
@@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define N r0
#define X r1
#define INC_X r2
-#define OLD_Y r3
-
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
@@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* registers
*******************************************************/
-#define OLD_INC_Y [fp, #4 ]
+#if !defined(__ARM_PCS_VFP)
+#define OLD_RETURN_ADDR r0
+#define OLD_N r1
+#define OLD_X r2
+#define OLD_INC_X r3
+#define OLD_Y [fp, #0 ]
+#define OLD_INC_Y [fp, #4 ]
+#define RETURN_ADDR r8
+#else
+#define OLD_Y r3
+#define OLD_INC_Y [fp, #0 ]
+#endif
#define I r5
#define Y r6
@@ -181,7 +190,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.align 5
push {r4 - r9, fp}
- add fp, sp, #24
+ add fp, sp, #28
sub sp, sp, #STACKSIZE // reserve stack
sub r4, fp, #128
@@ -194,9 +203,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vcvt.f64.f32 d2, s0
vcvt.f64.f32 d3, s0
+#if !defined(__ARM_PCS_VFP)
+ mov RETURN_ADDR, OLD_RETURN_ADDR
+ mov N, OLD_N
+ mov X, OLD_X
+ mov INC_X, OLD_INC_X
+ ldr Y, OLD_Y
+ ldr INC_Y, OLD_INC_Y
+#else
mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y
-
+#endif
cmp N, #0
ble zdot_kernel_L999
@@ -280,8 +297,11 @@ zdot_kernel_L999:
vadd.f64 d0 , d0, d2
vsub.f64 d1 , d1, d3
#endif
+#if !defined(__ARM_PCS_VFP)
+ vstm RETURN_ADDR, {d0 - d1}
+#endif
- sub sp, fp, #24
+ sub sp, fp, #28
pop {r4 - r9, fp}
bx lr
diff --git a/kernel/arm/zgemm_kernel_2x2_vfp.S b/kernel/arm/zgemm_kernel_2x2_vfp.S
index 46507c4d2..53d18b07b 100644
--- a/kernel/arm/zgemm_kernel_2x2_vfp.S
+++ b/kernel/arm/zgemm_kernel_2x2_vfp.S
@@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHAR_SOFTFP [fp, #4]
+#define OLD_ALPHAI_SOFTFP [fp, #12]
+#define OLD_A_SOFTFP [fp, #20 ]
+#define B [fp, #24 ]
+#define C [fp, #28 ]
+#define OLD_LDC [fp, #32 ]
+#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
+#endif
#define I r0
#define J r1
@@ -87,42 +96,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
- #define KMAC_R fnmacd
+ #define KMAC_R vmls.f64
#define KMAC_I fmacd
#define FMAC_R1 fmacd
- #define FMAC_R2 fnmacd
+ #define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#elif defined(CN) || defined(CT)
#define KMAC_R fmacd
- #define KMAC_I fnmacd
+ #define KMAC_I vmls.f64
#define FMAC_R1 fmacd
- #define FMAC_R2 fnmacd
+ #define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#elif defined(NC) || defined(TC)
#define KMAC_R fmacd
- #define KMAC_I fnmacd
+ #define KMAC_I vmls.f64
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
- #define FMAC_I1 fnmacd
+ #define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#else
- #define KMAC_R fnmacd
+ #define KMAC_R vmls.f64
#define KMAC_I fmacd
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
- #define FMAC_I1 fnmacd
+ #define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#endif
@@ -863,6 +872,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
+#if !defined(__ARM_PCS_VFP)
+ vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
+ vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
+ ldr OLD_A, OLD_A_SOFTFP
+#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K
diff --git a/kernel/arm/zgemm_kernel_2x2_vfpv3.S b/kernel/arm/zgemm_kernel_2x2_vfpv3.S
index 5a99f792f..a9d4eddeb 100644
--- a/kernel/arm/zgemm_kernel_2x2_vfpv3.S
+++ b/kernel/arm/zgemm_kernel_2x2_vfpv3.S
@@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHAR_SOFTFP [fp, #4]
+#define OLD_ALPHAI_SOFTFP [fp, #12]
+#define OLD_A_SOFTFP [fp, #20 ]
+#define B [fp, #24 ]
+#define C [fp, #28 ]
+#define OLD_LDC [fp, #32 ]
+#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
+#endif
#define I r0
#define J r1
@@ -106,10 +115,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubd
#define FADD_I faddd
- #define FMAC_R1 fnmacd
- #define FMAC_R2 fnmacd
+ #define FMAC_R1 vmls.f64
+ #define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
- #define FMAC_I2 fnmacd
+ #define FMAC_I2 vmls.f64
#elif defined(CN) || defined(CT)
@@ -118,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
- #define FMAC_I1 fnmacd
+ #define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#elif defined(NC) || defined(TC)
@@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_I fsubd
#define FMAC_R1 fmacd
- #define FMAC_R2 fnmacd
+ #define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
@@ -136,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubd
#define FADD_I faddd
- #define FMAC_R1 fnmacd
+ #define FMAC_R1 vmls.f64
#define FMAC_R2 fmacd
- #define FMAC_I1 fnmacd
- #define FMAC_I2 fnmacd
+ #define FMAC_I1 vmls.f64
+ #define FMAC_I2 vmls.f64
#endif
@@ -909,6 +918,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
+#if !defined(__ARM_PCS_VFP)
+ vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
+ vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
+ ldr OLD_A, OLD_A_SOFTFP
+#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K
diff --git a/kernel/arm/zgemv_n_vfp.S b/kernel/arm/zgemv_n_vfp.S
index da9a91043..3e3a1bc07 100644
--- a/kernel/arm/zgemv_n_vfp.S
+++ b/kernel/arm/zgemv_n_vfp.S
@@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
-#define OLD_LDA [fp, #0 ]
-#define X [fp, #4 ]
-#define OLD_INC_X [fp, #8 ]
-#define Y [fp, #12 ]
-#define OLD_INC_Y [fp, #16 ]
+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHAR [fp, #0 ]
+#define OLD_ALPHAI [fp, #8 ]
+#define OLD_A_SOFTFP [fp, #16]
+#define OLD_LDA [fp, #20]
+#define X [fp, #24]
+#define OLD_INC_X [fp, #28]
+#define Y [fp, #32]
+#define OLD_INC_Y [fp, #36]
+#else
+#define OLD_LDA [fp, #0 ]
+#define X [fp, #4 ]
+#define OLD_INC_X [fp, #8 ]
+#define Y [fp, #12 ]
+#define OLD_INC_Y [fp, #16 ]
+#endif
+
#define OLD_A r3
#define OLD_M r0
@@ -79,42 +91,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(CONJ) && !defined(XCONJ)
- #define KMAC_R fnmacd
+ #define KMAC_R vmls.f64
#define KMAC_I fmacd
#define FMAC_R1 fmacd
- #define FMAC_R2 fnmacd
+ #define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#elif defined(CONJ) && !defined(XCONJ)
#define KMAC_R fmacd
- #define KMAC_I fnmacd
+ #define KMAC_I vmls.f64
#define FMAC_R1 fmacd
- #define FMAC_R2 fnmacd
+ #define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#elif !defined(CONJ) && defined(XCONJ)
#define KMAC_R fmacd
- #define KMAC_I fnmacd
+ #define KMAC_I vmls.f64
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
- #define FMAC_I1 fnmacd
+ #define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#else
- #define KMAC_R fnmacd
+ #define KMAC_R vmls.f64
#define KMAC_I fmacd
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
- #define FMAC_I1 fnmacd
+ #define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#endif
@@ -465,6 +477,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp N, #0
ble zgemvn_kernel_L999
+#if !defined(__ARM_PCS_VFP)
+ vldr d0, OLD_ALPHAR
+ vldr d1, OLD_ALPHAI
+ ldr OLD_A, OLD_A_SOFTFP
+#endif
+
str OLD_A, A
str OLD_M, M
vstr d0 , ALPHA_R
diff --git a/kernel/arm/zgemv_t_vfp.S b/kernel/arm/zgemv_t_vfp.S
index 211fa0701..2193083af 100644
--- a/kernel/arm/zgemv_t_vfp.S
+++ b/kernel/arm/zgemv_t_vfp.S
@@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define STACKSIZE 256
-#define OLD_LDA [fp, #0 ]
-#define X [fp, #4 ]
-#define OLD_INC_X [fp, #8 ]
-#define Y [fp, #12 ]
-#define OLD_INC_Y [fp, #16 ]
+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHAR [fp, #0 ]
+#define OLD_ALPHAI [fp, #8 ]
+#define OLD_A_SOFTFP [fp, #16]
+#define OLD_LDA [fp, #20]
+#define X [fp, #24]
+#define OLD_INC_X [fp, #28]
+#define Y [fp, #32]
+#define OLD_INC_Y [fp, #36]
+#else
+#define OLD_LDA [fp, #0 ]
+#define X [fp, #4 ]
+#define OLD_INC_X [fp, #8 ]
+#define Y [fp, #12 ]
+#define OLD_INC_Y [fp, #16 ]
+#endif
+
#define OLD_A r3
#define OLD_N r1
@@ -77,42 +89,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(CONJ) && !defined(XCONJ)
- #define KMAC_R fnmacd
+ #define KMAC_R vmls.f64
#define KMAC_I fmacd
#define FMAC_R1 fmacd
- #define FMAC_R2 fnmacd
+ #define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#elif defined(CONJ) && !defined(XCONJ)
#define KMAC_R fmacd
- #define KMAC_I fnmacd
+ #define KMAC_I vmls.f64
#define FMAC_R1 fmacd
- #define FMAC_R2 fnmacd
+ #define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#elif !defined(CONJ) && defined(XCONJ)
#define KMAC_R fmacd
- #define KMAC_I fnmacd
+ #define KMAC_I vmls.f64
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
- #define FMAC_I1 fnmacd
+ #define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#else
- #define KMAC_R fnmacd
+ #define KMAC_R vmls.f64
#define KMAC_I fmacd
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
- #define FMAC_I1 fnmacd
+ #define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#endif
@@ -360,6 +372,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp OLD_N, #0
ble zgemvt_kernel_L999
+#if !defined(__ARM_PCS_VFP)
+ vldr d0, OLD_ALPHAR
+ vldr d1, OLD_ALPHAI
+ ldr OLD_A, OLD_A_SOFTFP
+#endif
+
str OLD_A, A
str OLD_N, N
diff --git a/kernel/arm/ztrmm_kernel_2x2_vfp.S b/kernel/arm/ztrmm_kernel_2x2_vfp.S
index dc80b17b8..cb6bc050e 100644
--- a/kernel/arm/ztrmm_kernel_2x2_vfp.S
+++ b/kernel/arm/ztrmm_kernel_2x2_vfp.S
@@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHAR_SOFTFP [fp, #4]
+#define OLD_ALPHAI_SOFTFP [fp, #12]
+#define OLD_A_SOFTFP [fp, #20 ]
+#define B [fp, #24 ]
+#define C [fp, #28 ]
+#define OLD_LDC [fp, #32 ]
+#define OFFSET [fp, #36 ]
+#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ]
+#endif
#define I r0
#define J r1
@@ -96,42 +106,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
- #define KMAC_R fnmacd
+ #define KMAC_R vmls.f64
#define KMAC_I fmacd
#define FMAC_R1 fmacd
- #define FMAC_R2 fnmacd
+ #define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#elif defined(CN) || defined(CT)
#define KMAC_R fmacd
- #define KMAC_I fnmacd
+ #define KMAC_I vmls.f64
#define FMAC_R1 fmacd
- #define FMAC_R2 fnmacd
+ #define FMAC_R2 vmls.f64
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#elif defined(NC) || defined(TC)
#define KMAC_R fmacd
- #define KMAC_I fnmacd
+ #define KMAC_I vmls.f64
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
- #define FMAC_I1 fnmacd
+ #define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#else
- #define KMAC_R fnmacd
+ #define KMAC_R vmls.f64
#define KMAC_I fmacd
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
- #define FMAC_I1 fnmacd
+ #define FMAC_I1 vmls.f64
#define FMAC_I2 fmacd
#endif
@@ -882,6 +892,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
+#if !defined(__ARM_PCS_VFP)
+ vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
+ vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
+ ldr OLD_A, OLD_A_SOFTFP
+#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K
diff --git a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S
index 5a808ccbc..3e6962f06 100644
--- a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S
+++ b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S
@@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHA_I [fp, #-272]
#define ALPHA_R [fp, #-280]
+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHAR_SOFTFP [fp, #4]
+#define OLD_ALPHAI_SOFTFP [fp, #12]
+#define OLD_A_SOFTFP [fp, #20 ]
+#define B [fp, #24 ]
+#define C [fp, #28 ]
+#define OLD_LDC [fp, #32 ]
+#define OFFSET [fp, #36 ]
+#else
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
#define OFFSET [fp, #16 ]
+#endif
#define I r0
#define J r1
@@ -93,10 +103,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubd
#define FADD_I faddd
- #define FMAC_R1 fnmuld
- #define FMAC_R2 fnmacd
+ #define FMAC_R1 vnmul.f64
+ #define FMAC_R2 vmls.f64
#define FMAC_I1 fmuld
- #define FMAC_I2 fnmacd
+ #define FMAC_I2 vmls.f64
#elif defined(CN) || defined(CT)
@@ -105,7 +115,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FMAC_R1 fmuld
#define FMAC_R2 fmacd
- #define FMAC_I1 fnmuld
+ #define FMAC_I1 vnmul.f64
#define FMAC_I2 fmacd
#elif defined(NC) || defined(TC)
@@ -114,7 +124,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_I fsubd
#define FMAC_R1 fmuld
- #define FMAC_R2 fnmacd
+ #define FMAC_R2 vmls.f64
#define FMAC_I1 fmuld
#define FMAC_I2 fmacd
@@ -123,10 +133,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FADD_R fsubd
#define FADD_I faddd
- #define FMAC_R1 fnmuld
+ #define FMAC_R1 vnmul.f64
#define FMAC_R2 fmacd
- #define FMAC_I1 fnmuld
- #define FMAC_I2 fnmacd
+ #define FMAC_I1 vnmul.f64
+ #define FMAC_I2 vmls.f64
#endif
@@ -883,6 +893,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
+#if !defined(__ARM_PCS_VFP)
+ vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
+ vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
+ ldr OLD_A, OLD_A_SOFTFP
+#endif
str OLD_M, M
str OLD_N, N
str OLD_K, K
diff --git a/kernel/power/casum_microk_power8.c b/kernel/power/casum_microk_power8.c
index 93ba50660..7d12c9885 100644
--- a/kernel/power/casum_microk_power8.c
+++ b/kernel/power/casum_microk_power8.c
@@ -56,14 +56,14 @@ static float casum_kernel_16 (long n, float *x)
"xxlxor 38, 38, 38 \n\t"
"xxlxor 39, 39, 39 \n\t"
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %8, %2 \n\t"
- "lxvw4x 42, %9, %2 \n\t"
- "lxvw4x 43, %10, %2 \n\t"
- "lxvw4x 44, %11, %2 \n\t"
- "lxvw4x 45, %12, %2 \n\t"
- "lxvw4x 46, %13, %2 \n\t"
- "lxvw4x 47, %14, %2 \n\t"
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %8, %2 \n\t"
+ "lxvd2x 42, %9, %2 \n\t"
+ "lxvd2x 43, %10, %2 \n\t"
+ "lxvd2x 44, %11, %2 \n\t"
+ "lxvd2x 45, %12, %2 \n\t"
+ "lxvd2x 46, %13, %2 \n\t"
+ "lxvd2x 47, %14, %2 \n\t"
"addi %2, %2, 128 \n\t"
@@ -78,26 +78,26 @@ static float casum_kernel_16 (long n, float *x)
"xvabssp 50, 42 \n\t"
"xvabssp 51, 43 \n\t"
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %8, %2 \n\t"
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %8, %2 \n\t"
"xvabssp %x3, 44 \n\t"
"xvabssp %x4, 45 \n\t"
- "lxvw4x 42, %9, %2 \n\t"
- "lxvw4x 43, %10, %2 \n\t"
+ "lxvd2x 42, %9, %2 \n\t"
+ "lxvd2x 43, %10, %2 \n\t"
"xvabssp %x5, 46 \n\t"
"xvabssp %x6, 47 \n\t"
- "lxvw4x 44, %11, %2 \n\t"
- "lxvw4x 45, %12, %2 \n\t"
+ "lxvd2x 44, %11, %2 \n\t"
+ "lxvd2x 45, %12, %2 \n\t"
"xvaddsp 32, 32, 48 \n\t"
"xvaddsp 33, 33, 49 \n\t"
- "lxvw4x 46, %13, %2 \n\t"
- "lxvw4x 47, %14, %2 \n\t"
+ "lxvd2x 46, %13, %2 \n\t"
+ "lxvd2x 47, %14, %2 \n\t"
"xvaddsp 34, 34, 50 \n\t"
"xvaddsp 35, 35, 51 \n\t"
diff --git a/kernel/power/ccopy_microk_power8.c b/kernel/power/ccopy_microk_power8.c
index b2b1bead1..613c4d286 100644
--- a/kernel/power/ccopy_microk_power8.c
+++ b/kernel/power/ccopy_microk_power8.c
@@ -39,25 +39,25 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
{
__asm__
(
- "lxvw4x 32, 0, %2 \n\t"
- "lxvw4x 33, %5, %2 \n\t"
- "lxvw4x 34, %6, %2 \n\t"
- "lxvw4x 35, %7, %2 \n\t"
- "lxvw4x 36, %8, %2 \n\t"
- "lxvw4x 37, %9, %2 \n\t"
- "lxvw4x 38, %10, %2 \n\t"
- "lxvw4x 39, %11, %2 \n\t"
+ "lxvd2x 32, 0, %2 \n\t"
+ "lxvd2x 33, %5, %2 \n\t"
+ "lxvd2x 34, %6, %2 \n\t"
+ "lxvd2x 35, %7, %2 \n\t"
+ "lxvd2x 36, %8, %2 \n\t"
+ "lxvd2x 37, %9, %2 \n\t"
+ "lxvd2x 38, %10, %2 \n\t"
+ "lxvd2x 39, %11, %2 \n\t"
"addi %2, %2, 128 \n\t"
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
"addi %2, %2, 128 \n\t"
@@ -67,42 +67,42 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
".p2align 5 \n"
"1: \n\t"
- "stxvw4x 32, 0, %3 \n\t"
- "stxvw4x 33, %5, %3 \n\t"
- "lxvw4x 32, 0, %2 \n\t"
- "lxvw4x 33, %5, %2 \n\t"
- "stxvw4x 34, %6, %3 \n\t"
- "stxvw4x 35, %7, %3 \n\t"
- "lxvw4x 34, %6, %2 \n\t"
- "lxvw4x 35, %7, %2 \n\t"
- "stxvw4x 36, %8, %3 \n\t"
- "stxvw4x 37, %9, %3 \n\t"
- "lxvw4x 36, %8, %2 \n\t"
- "lxvw4x 37, %9, %2 \n\t"
- "stxvw4x 38, %10, %3 \n\t"
- "stxvw4x 39, %11, %3 \n\t"
- "lxvw4x 38, %10, %2 \n\t"
- "lxvw4x 39, %11, %2 \n\t"
+ "stxvd2x 32, 0, %3 \n\t"
+ "stxvd2x 33, %5, %3 \n\t"
+ "lxvd2x 32, 0, %2 \n\t"
+ "lxvd2x 33, %5, %2 \n\t"
+ "stxvd2x 34, %6, %3 \n\t"
+ "stxvd2x 35, %7, %3 \n\t"
+ "lxvd2x 34, %6, %2 \n\t"
+ "lxvd2x 35, %7, %2 \n\t"
+ "stxvd2x 36, %8, %3 \n\t"
+ "stxvd2x 37, %9, %3 \n\t"
+ "lxvd2x 36, %8, %2 \n\t"
+ "lxvd2x 37, %9, %2 \n\t"
+ "stxvd2x 38, %10, %3 \n\t"
+ "stxvd2x 39, %11, %3 \n\t"
+ "lxvd2x 38, %10, %2 \n\t"
+ "lxvd2x 39, %11, %2 \n\t"
"addi %3, %3, 128 \n\t"
"addi %2, %2, 128 \n\t"
- "stxvw4x 40, 0, %3 \n\t"
- "stxvw4x 41, %5, %3 \n\t"
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
- "stxvw4x 42, %6, %3 \n\t"
- "stxvw4x 43, %7, %3 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
- "stxvw4x 44, %8, %3 \n\t"
- "stxvw4x 45, %9, %3 \n\t"
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
- "stxvw4x 46, %10, %3 \n\t"
- "stxvw4x 47, %11, %3 \n\t"
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
+ "stxvd2x 40, 0, %3 \n\t"
+ "stxvd2x 41, %5, %3 \n\t"
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+ "stxvd2x 42, %6, %3 \n\t"
+ "stxvd2x 43, %7, %3 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+ "stxvd2x 44, %8, %3 \n\t"
+ "stxvd2x 45, %9, %3 \n\t"
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+ "stxvd2x 46, %10, %3 \n\t"
+ "stxvd2x 47, %11, %3 \n\t"
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
"addi %3, %3, 128 \n\t"
"addi %2, %2, 128 \n\t"
@@ -112,25 +112,25 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
"2: \n\t"
- "stxvw4x 32, 0, %3 \n\t"
- "stxvw4x 33, %5, %3 \n\t"
- "stxvw4x 34, %6, %3 \n\t"
- "stxvw4x 35, %7, %3 \n\t"
- "stxvw4x 36, %8, %3 \n\t"
- "stxvw4x 37, %9, %3 \n\t"
- "stxvw4x 38, %10, %3 \n\t"
- "stxvw4x 39, %11, %3 \n\t"
+ "stxvd2x 32, 0, %3 \n\t"
+ "stxvd2x 33, %5, %3 \n\t"
+ "stxvd2x 34, %6, %3 \n\t"
+ "stxvd2x 35, %7, %3 \n\t"
+ "stxvd2x 36, %8, %3 \n\t"
+ "stxvd2x 37, %9, %3 \n\t"
+ "stxvd2x 38, %10, %3 \n\t"
+ "stxvd2x 39, %11, %3 \n\t"
"addi %3, %3, 128 \n\t"
- "stxvw4x 40, 0, %3 \n\t"
- "stxvw4x 41, %5, %3 \n\t"
- "stxvw4x 42, %6, %3 \n\t"
- "stxvw4x 43, %7, %3 \n\t"
- "stxvw4x 44, %8, %3 \n\t"
- "stxvw4x 45, %9, %3 \n\t"
- "stxvw4x 46, %10, %3 \n\t"
- "stxvw4x 47, %11, %3 \n"
+ "stxvd2x 40, 0, %3 \n\t"
+ "stxvd2x 41, %5, %3 \n\t"
+ "stxvd2x 42, %6, %3 \n\t"
+ "stxvd2x 43, %7, %3 \n\t"
+ "stxvd2x 44, %8, %3 \n\t"
+ "stxvd2x 45, %9, %3 \n\t"
+ "stxvd2x 46, %10, %3 \n\t"
+ "stxvd2x 47, %11, %3 \n"
"#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
:
diff --git a/kernel/power/cswap_microk_power8.c b/kernel/power/cswap_microk_power8.c
index 1dd03dc88..8d7d0c0b9 100644
--- a/kernel/power/cswap_microk_power8.c
+++ b/kernel/power/cswap_microk_power8.c
@@ -42,91 +42,91 @@ static void cswap_kernel_32 (long n, float *x, float *y)
".p2align 5 \n"
"1: \n\t"
- "lxvw4x 32, 0, %4 \n\t"
- "lxvw4x 33, %5, %4 \n\t"
- "lxvw4x 34, %6, %4 \n\t"
- "lxvw4x 35, %7, %4 \n\t"
- "lxvw4x 36, %8, %4 \n\t"
- "lxvw4x 37, %9, %4 \n\t"
- "lxvw4x 38, %10, %4 \n\t"
- "lxvw4x 39, %11, %4 \n\t"
+ "lxvd2x 32, 0, %4 \n\t"
+ "lxvd2x 33, %5, %4 \n\t"
+ "lxvd2x 34, %6, %4 \n\t"
+ "lxvd2x 35, %7, %4 \n\t"
+ "lxvd2x 36, %8, %4 \n\t"
+ "lxvd2x 37, %9, %4 \n\t"
+ "lxvd2x 38, %10, %4 \n\t"
+ "lxvd2x 39, %11, %4 \n\t"
"addi %4, %4, 128 \n\t"
- "lxvw4x 40, 0, %4 \n\t"
- "lxvw4x 41, %5, %4 \n\t"
- "lxvw4x 42, %6, %4 \n\t"
- "lxvw4x 43, %7, %4 \n\t"
- "lxvw4x 44, %8, %4 \n\t"
- "lxvw4x 45, %9, %4 \n\t"
- "lxvw4x 46, %10, %4 \n\t"
- "lxvw4x 47, %11, %4 \n\t"
+ "lxvd2x 40, 0, %4 \n\t"
+ "lxvd2x 41, %5, %4 \n\t"
+ "lxvd2x 42, %6, %4 \n\t"
+ "lxvd2x 43, %7, %4 \n\t"
+ "lxvd2x 44, %8, %4 \n\t"
+ "lxvd2x 45, %9, %4 \n\t"
+ "lxvd2x 46, %10, %4 \n\t"
+ "lxvd2x 47, %11, %4 \n\t"
"addi %4, %4, -128 \n\t"
- "lxvw4x 48, 0, %3 \n\t"
- "lxvw4x 49, %5, %3 \n\t"
- "lxvw4x 50, %6, %3 \n\t"
- "lxvw4x 51, %7, %3 \n\t"
- "lxvw4x 0, %8, %3 \n\t"
- "lxvw4x 1, %9, %3 \n\t"
- "lxvw4x 2, %10, %3 \n\t"
- "lxvw4x 3, %11, %3 \n\t"
+ "lxvd2x 48, 0, %3 \n\t"
+ "lxvd2x 49, %5, %3 \n\t"
+ "lxvd2x 50, %6, %3 \n\t"
+ "lxvd2x 51, %7, %3 \n\t"
+ "lxvd2x 0, %8, %3 \n\t"
+ "lxvd2x 1, %9, %3 \n\t"
+ "lxvd2x 2, %10, %3 \n\t"
+ "lxvd2x 3, %11, %3 \n\t"
"addi %3, %3, 128 \n\t"
- "lxvw4x 4, 0, %3 \n\t"
- "lxvw4x 5, %5, %3 \n\t"
- "lxvw4x 6, %6, %3 \n\t"
- "lxvw4x 7, %7, %3 \n\t"
- "lxvw4x 8, %8, %3 \n\t"
- "lxvw4x 9, %9, %3 \n\t"
- "lxvw4x 10, %10, %3 \n\t"
- "lxvw4x 11, %11, %3 \n\t"
+ "lxvd2x 4, 0, %3 \n\t"
+ "lxvd2x 5, %5, %3 \n\t"
+ "lxvd2x 6, %6, %3 \n\t"
+ "lxvd2x 7, %7, %3 \n\t"
+ "lxvd2x 8, %8, %3 \n\t"
+ "lxvd2x 9, %9, %3 \n\t"
+ "lxvd2x 10, %10, %3 \n\t"
+ "lxvd2x 11, %11, %3 \n\t"
"addi %3, %3, -128 \n\t"
- "stxvw4x 32, 0, %3 \n\t"
- "stxvw4x 33, %5, %3 \n\t"
- "stxvw4x 34, %6, %3 \n\t"
- "stxvw4x 35, %7, %3 \n\t"
- "stxvw4x 36, %8, %3 \n\t"
- "stxvw4x 37, %9, %3 \n\t"
- "stxvw4x 38, %10, %3 \n\t"
- "stxvw4x 39, %11, %3 \n\t"
+ "stxvd2x 32, 0, %3 \n\t"
+ "stxvd2x 33, %5, %3 \n\t"
+ "stxvd2x 34, %6, %3 \n\t"
+ "stxvd2x 35, %7, %3 \n\t"
+ "stxvd2x 36, %8, %3 \n\t"
+ "stxvd2x 37, %9, %3 \n\t"
+ "stxvd2x 38, %10, %3 \n\t"
+ "stxvd2x 39, %11, %3 \n\t"
"addi %3, %3, 128 \n\t"
- "stxvw4x 40, 0, %3 \n\t"
- "stxvw4x 41, %5, %3 \n\t"
- "stxvw4x 42, %6, %3 \n\t"
- "stxvw4x 43, %7, %3 \n\t"
- "stxvw4x 44, %8, %3 \n\t"
- "stxvw4x 45, %9, %3 \n\t"
- "stxvw4x 46, %10, %3 \n\t"
- "stxvw4x 47, %11, %3 \n\t"
+ "stxvd2x 40, 0, %3 \n\t"
+ "stxvd2x 41, %5, %3 \n\t"
+ "stxvd2x 42, %6, %3 \n\t"
+ "stxvd2x 43, %7, %3 \n\t"
+ "stxvd2x 44, %8, %3 \n\t"
+ "stxvd2x 45, %9, %3 \n\t"
+ "stxvd2x 46, %10, %3 \n\t"
+ "stxvd2x 47, %11, %3 \n\t"
"addi %3, %3, 128 \n\t"
- "stxvw4x 48, 0, %4 \n\t"
- "stxvw4x 49, %5, %4 \n\t"
- "stxvw4x 50, %6, %4 \n\t"
- "stxvw4x 51, %7, %4 \n\t"
- "stxvw4x 0, %8, %4 \n\t"
- "stxvw4x 1, %9, %4 \n\t"
- "stxvw4x 2, %10, %4 \n\t"
- "stxvw4x 3, %11, %4 \n\t"
+ "stxvd2x 48, 0, %4 \n\t"
+ "stxvd2x 49, %5, %4 \n\t"
+ "stxvd2x 50, %6, %4 \n\t"
+ "stxvd2x 51, %7, %4 \n\t"
+ "stxvd2x 0, %8, %4 \n\t"
+ "stxvd2x 1, %9, %4 \n\t"
+ "stxvd2x 2, %10, %4 \n\t"
+ "stxvd2x 3, %11, %4 \n\t"
"addi %4, %4, 128 \n\t"
- "stxvw4x 4, 0, %4 \n\t"
- "stxvw4x 5, %5, %4 \n\t"
- "stxvw4x 6, %6, %4 \n\t"
- "stxvw4x 7, %7, %4 \n\t"
- "stxvw4x 8, %8, %4 \n\t"
- "stxvw4x 9, %9, %4 \n\t"
- "stxvw4x 10, %10, %4 \n\t"
- "stxvw4x 11, %11, %4 \n\t"
+ "stxvd2x 4, 0, %4 \n\t"
+ "stxvd2x 5, %5, %4 \n\t"
+ "stxvd2x 6, %6, %4 \n\t"
+ "stxvd2x 7, %7, %4 \n\t"
+ "stxvd2x 8, %8, %4 \n\t"
+ "stxvd2x 9, %9, %4 \n\t"
+ "stxvd2x 10, %10, %4 \n\t"
+ "stxvd2x 11, %11, %4 \n\t"
"addi %4, %4, 128 \n\t"
diff --git a/kernel/power/sasum_microk_power8.c b/kernel/power/sasum_microk_power8.c
index 08a766f80..4bb515de8 100644
--- a/kernel/power/sasum_microk_power8.c
+++ b/kernel/power/sasum_microk_power8.c
@@ -56,14 +56,14 @@ static float sasum_kernel_32 (long n, float *x)
"xxlxor 38, 38, 38 \n\t"
"xxlxor 39, 39, 39 \n\t"
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %8, %2 \n\t"
- "lxvw4x 42, %9, %2 \n\t"
- "lxvw4x 43, %10, %2 \n\t"
- "lxvw4x 44, %11, %2 \n\t"
- "lxvw4x 45, %12, %2 \n\t"
- "lxvw4x 46, %13, %2 \n\t"
- "lxvw4x 47, %14, %2 \n\t"
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %8, %2 \n\t"
+ "lxvd2x 42, %9, %2 \n\t"
+ "lxvd2x 43, %10, %2 \n\t"
+ "lxvd2x 44, %11, %2 \n\t"
+ "lxvd2x 45, %12, %2 \n\t"
+ "lxvd2x 46, %13, %2 \n\t"
+ "lxvd2x 47, %14, %2 \n\t"
"addi %2, %2, 128 \n\t"
@@ -78,26 +78,26 @@ static float sasum_kernel_32 (long n, float *x)
"xvabssp 50, 42 \n\t"
"xvabssp 51, 43 \n\t"
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %8, %2 \n\t"
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %8, %2 \n\t"
"xvabssp %x3, 44 \n\t"
"xvabssp %x4, 45 \n\t"
- "lxvw4x 42, %9, %2 \n\t"
- "lxvw4x 43, %10, %2 \n\t"
+ "lxvd2x 42, %9, %2 \n\t"
+ "lxvd2x 43, %10, %2 \n\t"
"xvabssp %x5, 46 \n\t"
"xvabssp %x6, 47 \n\t"
- "lxvw4x 44, %11, %2 \n\t"
- "lxvw4x 45, %12, %2 \n\t"
+ "lxvd2x 44, %11, %2 \n\t"
+ "lxvd2x 45, %12, %2 \n\t"
"xvaddsp 32, 32, 48 \n\t"
"xvaddsp 33, 33, 49 \n\t"
- "lxvw4x 46, %13, %2 \n\t"
- "lxvw4x 47, %14, %2 \n\t"
+ "lxvd2x 46, %13, %2 \n\t"
+ "lxvd2x 47, %14, %2 \n\t"
"xvaddsp 34, 34, 50 \n\t"
"xvaddsp 35, 35, 51 \n\t"
diff --git a/kernel/power/scopy_microk_power8.c b/kernel/power/scopy_microk_power8.c
index 444a6d4d5..7a54d5e1e 100644
--- a/kernel/power/scopy_microk_power8.c
+++ b/kernel/power/scopy_microk_power8.c
@@ -39,14 +39,14 @@ static void scopy_kernel_32 (long n, float *x, float *y)
{
__asm__
(
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
"addi %2, %2, 128 \n\t"
@@ -56,22 +56,22 @@ static void scopy_kernel_32 (long n, float *x, float *y)
".p2align 5 \n"
"1: \n\t"
- "stxvw4x 40, 0, %3 \n\t"
- "stxvw4x 41, %5, %3 \n\t"
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
- "stxvw4x 42, %6, %3 \n\t"
- "stxvw4x 43, %7, %3 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
- "stxvw4x 44, %8, %3 \n\t"
- "stxvw4x 45, %9, %3 \n\t"
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
- "stxvw4x 46, %10, %3 \n\t"
- "stxvw4x 47, %11, %3 \n\t"
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
+ "stxvd2x 40, 0, %3 \n\t"
+ "stxvd2x 41, %5, %3 \n\t"
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+ "stxvd2x 42, %6, %3 \n\t"
+ "stxvd2x 43, %7, %3 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+ "stxvd2x 44, %8, %3 \n\t"
+ "stxvd2x 45, %9, %3 \n\t"
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+ "stxvd2x 46, %10, %3 \n\t"
+ "stxvd2x 47, %11, %3 \n\t"
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
"addi %3, %3, 128 \n\t"
"addi %2, %2, 128 \n\t"
@@ -81,14 +81,14 @@ static void scopy_kernel_32 (long n, float *x, float *y)
"2: \n\t"
- "stxvw4x 40, 0, %3 \n\t"
- "stxvw4x 41, %5, %3 \n\t"
- "stxvw4x 42, %6, %3 \n\t"
- "stxvw4x 43, %7, %3 \n\t"
- "stxvw4x 44, %8, %3 \n\t"
- "stxvw4x 45, %9, %3 \n\t"
- "stxvw4x 46, %10, %3 \n\t"
- "stxvw4x 47, %11, %3 \n"
+ "stxvd2x 40, 0, %3 \n\t"
+ "stxvd2x 41, %5, %3 \n\t"
+ "stxvd2x 42, %6, %3 \n\t"
+ "stxvd2x 43, %7, %3 \n\t"
+ "stxvd2x 44, %8, %3 \n\t"
+ "stxvd2x 45, %9, %3 \n\t"
+ "stxvd2x 46, %10, %3 \n\t"
+ "stxvd2x 47, %11, %3 \n"
"#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
:
diff --git a/kernel/power/sdot_microk_power8.c b/kernel/power/sdot_microk_power8.c
index 7f7ccfac3..bfe100c8b 100644
--- a/kernel/power/sdot_microk_power8.c
+++ b/kernel/power/sdot_microk_power8.c
@@ -57,22 +57,22 @@ static float sdot_kernel_16 (long n, float *x, float *y)
"xxlxor 38, 38, 38 \n\t"
"xxlxor 39, 39, 39 \n\t"
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 48, 0, %3 \n\t"
- "lxvw4x 41, %10, %2 \n\t"
- "lxvw4x 49, %10, %3 \n\t"
- "lxvw4x 42, %11, %2 \n\t"
- "lxvw4x 50, %11, %3 \n\t"
- "lxvw4x 43, %12, %2 \n\t"
- "lxvw4x 51, %12, %3 \n\t"
- "lxvw4x 44, %13, %2 \n\t"
- "lxvw4x %x4, %13, %3 \n\t"
- "lxvw4x 45, %14, %2 \n\t"
- "lxvw4x %x5, %14, %3 \n\t"
- "lxvw4x 46, %15, %2 \n\t"
- "lxvw4x %x6, %15, %3 \n\t"
- "lxvw4x 47, %16, %2 \n\t"
- "lxvw4x %x7, %16, %3 \n\t"
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 48, 0, %3 \n\t"
+ "lxvd2x 41, %10, %2 \n\t"
+ "lxvd2x 49, %10, %3 \n\t"
+ "lxvd2x 42, %11, %2 \n\t"
+ "lxvd2x 50, %11, %3 \n\t"
+ "lxvd2x 43, %12, %2 \n\t"
+ "lxvd2x 51, %12, %3 \n\t"
+ "lxvd2x 44, %13, %2 \n\t"
+ "lxvd2x %x4, %13, %3 \n\t"
+ "lxvd2x 45, %14, %2 \n\t"
+ "lxvd2x %x5, %14, %3 \n\t"
+ "lxvd2x 46, %15, %2 \n\t"
+ "lxvd2x %x6, %15, %3 \n\t"
+ "lxvd2x 47, %16, %2 \n\t"
+ "lxvd2x %x7, %16, %3 \n\t"
"addi %2, %2, 128 \n\t"
"addi %3, %3, 128 \n\t"
@@ -84,29 +84,29 @@ static float sdot_kernel_16 (long n, float *x, float *y)
"1: \n\t"
"xvmaddasp 32, 40, 48 \n\t"
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 48, 0, %3 \n\t"
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 48, 0, %3 \n\t"
"xvmaddasp 33, 41, 49 \n\t"
- "lxvw4x 41, %10, %2 \n\t"
- "lxvw4x 49, %10, %3 \n\t"
+ "lxvd2x 41, %10, %2 \n\t"
+ "lxvd2x 49, %10, %3 \n\t"
"xvmaddasp 34, 42, 50 \n\t"
- "lxvw4x 42, %11, %2 \n\t"
- "lxvw4x 50, %11, %3 \n\t"
+ "lxvd2x 42, %11, %2 \n\t"
+ "lxvd2x 50, %11, %3 \n\t"
"xvmaddasp 35, 43, 51 \n\t"
- "lxvw4x 43, %12, %2 \n\t"
- "lxvw4x 51, %12, %3 \n\t"
+ "lxvd2x 43, %12, %2 \n\t"
+ "lxvd2x 51, %12, %3 \n\t"
"xvmaddasp 36, 44, %x4 \n\t"
- "lxvw4x 44, %13, %2 \n\t"
- "lxvw4x %x4, %13, %3 \n\t"
+ "lxvd2x 44, %13, %2 \n\t"
+ "lxvd2x %x4, %13, %3 \n\t"
"xvmaddasp 37, 45, %x5 \n\t"
- "lxvw4x 45, %14, %2 \n\t"
- "lxvw4x %x5, %14, %3 \n\t"
+ "lxvd2x 45, %14, %2 \n\t"
+ "lxvd2x %x5, %14, %3 \n\t"
"xvmaddasp 38, 46, %x6 \n\t"
- "lxvw4x 46, %15, %2 \n\t"
- "lxvw4x %x6, %15, %3 \n\t"
+ "lxvd2x 46, %15, %2 \n\t"
+ "lxvd2x %x6, %15, %3 \n\t"
"xvmaddasp 39, 47, %x7 \n\t"
- "lxvw4x 47, %16, %2 \n\t"
- "lxvw4x %x7, %16, %3 \n\t"
+ "lxvd2x 47, %16, %2 \n\t"
+ "lxvd2x %x7, %16, %3 \n\t"
"addi %2, %2, 128 \n\t"
"addi %3, %3, 128 \n\t"
diff --git a/kernel/power/srot_microk_power8.c b/kernel/power/srot_microk_power8.c
index 0a18c16e0..6eecb60a1 100644
--- a/kernel/power/srot_microk_power8.c
+++ b/kernel/power/srot_microk_power8.c
@@ -57,15 +57,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
"xscvdpspn 37, %x14 \n\t" // load s to all words
"xxspltw 37, 37, 0 \n\t"
- "lxvw4x 32, 0, %3 \n\t" // load x
- "lxvw4x 33, %15, %3 \n\t"
- "lxvw4x 34, %16, %3 \n\t"
- "lxvw4x 35, %17, %3 \n\t"
+ "lxvd2x 32, 0, %3 \n\t" // load x
+ "lxvd2x 33, %15, %3 \n\t"
+ "lxvd2x 34, %16, %3 \n\t"
+ "lxvd2x 35, %17, %3 \n\t"
- "lxvw4x 48, 0, %4 \n\t" // load y
- "lxvw4x 49, %15, %4 \n\t"
- "lxvw4x 50, %16, %4 \n\t"
- "lxvw4x 51, %17, %4 \n\t"
+ "lxvd2x 48, 0, %4 \n\t" // load y
+ "lxvd2x 49, %15, %4 \n\t"
+ "lxvd2x 50, %16, %4 \n\t"
+ "lxvd2x 51, %17, %4 \n\t"
"addi %3, %3, 64 \n\t"
"addi %4, %4, 64 \n\t"
@@ -89,26 +89,26 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
"xvmulsp 44, 32, 37 \n\t" // s * x
"xvmulsp 45, 33, 37 \n\t"
- "lxvw4x 32, 0, %3 \n\t" // load x
- "lxvw4x 33, %15, %3 \n\t"
+ "lxvd2x 32, 0, %3 \n\t" // load x
+ "lxvd2x 33, %15, %3 \n\t"
"xvmulsp 46, 34, 37 \n\t"
"xvmulsp 47, 35, 37 \n\t"
- "lxvw4x 34, %16, %3 \n\t"
- "lxvw4x 35, %17, %3 \n\t"
+ "lxvd2x 34, %16, %3 \n\t"
+ "lxvd2x 35, %17, %3 \n\t"
"xvmulsp %x9, 48, 37 \n\t" // s * y
"xvmulsp %x10, 49, 37 \n\t"
- "lxvw4x 48, 0, %4 \n\t" // load y
- "lxvw4x 49, %15, %4 \n\t"
+ "lxvd2x 48, 0, %4 \n\t" // load y
+ "lxvd2x 49, %15, %4 \n\t"
"xvmulsp %x11, 50, 37 \n\t"
"xvmulsp %x12, 51, 37 \n\t"
- "lxvw4x 50, %16, %4 \n\t"
- "lxvw4x 51, %17, %4 \n\t"
+ "lxvd2x 50, %16, %4 \n\t"
+ "lxvd2x 51, %17, %4 \n\t"
"xvaddsp 40, 40, %x9 \n\t" // c * x + s * y
"xvaddsp 41, 41, %x10 \n\t" // c * x + s * y
@@ -124,15 +124,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
"xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x
"xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x
- "stxvw4x 40, 0, %3 \n\t" // store x
- "stxvw4x 41, %15, %3 \n\t"
- "stxvw4x 42, %16, %3 \n\t"
- "stxvw4x 43, %17, %3 \n\t"
+ "stxvd2x 40, 0, %3 \n\t" // store x
+ "stxvd2x 41, %15, %3 \n\t"
+ "stxvd2x 42, %16, %3 \n\t"
+ "stxvd2x 43, %17, %3 \n\t"
- "stxvw4x %x5, 0, %4 \n\t" // store y
- "stxvw4x %x6, %15, %4 \n\t"
- "stxvw4x %x7, %16, %4 \n\t"
- "stxvw4x %x8, %17, %4 \n\t"
+ "stxvd2x %x5, 0, %4 \n\t" // store y
+ "stxvd2x %x6, %15, %4 \n\t"
+ "stxvd2x %x7, %16, %4 \n\t"
+ "stxvd2x %x8, %17, %4 \n\t"
"addi %3, %3, 128 \n\t"
"addi %4, %4, 128 \n\t"
@@ -175,15 +175,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
"xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x
"xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x
- "stxvw4x 40, 0, %3 \n\t" // store x
- "stxvw4x 41, %15, %3 \n\t"
- "stxvw4x 42, %16, %3 \n\t"
- "stxvw4x 43, %17, %3 \n\t"
+ "stxvd2x 40, 0, %3 \n\t" // store x
+ "stxvd2x 41, %15, %3 \n\t"
+ "stxvd2x 42, %16, %3 \n\t"
+ "stxvd2x 43, %17, %3 \n\t"
- "stxvw4x %x5, 0, %4 \n\t" // store y
- "stxvw4x %x6, %15, %4 \n\t"
- "stxvw4x %x7, %16, %4 \n\t"
- "stxvw4x %x8, %17, %4 \n"
+ "stxvd2x %x5, 0, %4 \n\t" // store y
+ "stxvd2x %x6, %15, %4 \n\t"
+ "stxvd2x %x7, %16, %4 \n\t"
+ "stxvd2x %x8, %17, %4 \n"
"#n=%2 x=%0=%3 y=%1=%4 c=%13 s=%14 o16=%15 o32=%16 o48=%17\n"
"#t0=%x5 t1=%x6 t2=%x7 t3=%x8 t4=%x9 t5=%x10 t6=%x11 t7=%x12"
diff --git a/kernel/power/sscal_microk_power8.c b/kernel/power/sscal_microk_power8.c
index 49862a329..058ff3399 100644
--- a/kernel/power/sscal_microk_power8.c
+++ b/kernel/power/sscal_microk_power8.c
@@ -44,14 +44,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
"xscvdpspn %x3, %x3 \n\t"
"xxspltw %x3, %x3, 0 \n\t"
- "lxvw4x 32, 0, %2 \n\t"
- "lxvw4x 33, %4, %2 \n\t"
- "lxvw4x 34, %5, %2 \n\t"
- "lxvw4x 35, %6, %2 \n\t"
- "lxvw4x 36, %7, %2 \n\t"
- "lxvw4x 37, %8, %2 \n\t"
- "lxvw4x 38, %9, %2 \n\t"
- "lxvw4x 39, %10, %2 \n\t"
+ "lxvd2x 32, 0, %2 \n\t"
+ "lxvd2x 33, %4, %2 \n\t"
+ "lxvd2x 34, %5, %2 \n\t"
+ "lxvd2x 35, %6, %2 \n\t"
+ "lxvd2x 36, %7, %2 \n\t"
+ "lxvd2x 37, %8, %2 \n\t"
+ "lxvd2x 38, %9, %2 \n\t"
+ "lxvd2x 39, %10, %2 \n\t"
"addi %2, %2, 128 \n\t"
@@ -63,31 +63,31 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
"xvmulsp 40, 32, %x3 \n\t"
"xvmulsp 41, 33, %x3 \n\t"
- "lxvw4x 32, 0, %2 \n\t"
- "lxvw4x 33, %4, %2 \n\t"
+ "lxvd2x 32, 0, %2 \n\t"
+ "lxvd2x 33, %4, %2 \n\t"
"xvmulsp 42, 34, %x3 \n\t"
"xvmulsp 43, 35, %x3 \n\t"
- "lxvw4x 34, %5, %2 \n\t"
- "lxvw4x 35, %6, %2 \n\t"
+ "lxvd2x 34, %5, %2 \n\t"
+ "lxvd2x 35, %6, %2 \n\t"
"xvmulsp 44, 36, %x3 \n\t"
"xvmulsp 45, 37, %x3 \n\t"
- "lxvw4x 36, %7, %2 \n\t"
- "lxvw4x 37, %8, %2 \n\t"
+ "lxvd2x 36, %7, %2 \n\t"
+ "lxvd2x 37, %8, %2 \n\t"
"xvmulsp 46, 38, %x3 \n\t"
"xvmulsp 47, 39, %x3 \n\t"
- "lxvw4x 38, %9, %2 \n\t"
- "lxvw4x 39, %10, %2 \n\t"
+ "lxvd2x 38, %9, %2 \n\t"
+ "lxvd2x 39, %10, %2 \n\t"
"addi %2, %2, -128 \n\t"
- "stxvw4x 40, 0, %2 \n\t"
- "stxvw4x 41, %4, %2 \n\t"
- "stxvw4x 42, %5, %2 \n\t"
- "stxvw4x 43, %6, %2 \n\t"
- "stxvw4x 44, %7, %2 \n\t"
- "stxvw4x 45, %8, %2 \n\t"
- "stxvw4x 46, %9, %2 \n\t"
- "stxvw4x 47, %10, %2 \n\t"
+ "stxvd2x 40, 0, %2 \n\t"
+ "stxvd2x 41, %4, %2 \n\t"
+ "stxvd2x 42, %5, %2 \n\t"
+ "stxvd2x 43, %6, %2 \n\t"
+ "stxvd2x 44, %7, %2 \n\t"
+ "stxvd2x 45, %8, %2 \n\t"
+ "stxvd2x 46, %9, %2 \n\t"
+ "stxvd2x 47, %10, %2 \n\t"
"addi %2, %2, 256 \n\t"
@@ -108,14 +108,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
"xvmulsp 46, 38, %x3 \n\t"
"xvmulsp 47, 39, %x3 \n\t"
- "stxvw4x 40, 0, %2 \n\t"
- "stxvw4x 41, %4, %2 \n\t"
- "stxvw4x 42, %5, %2 \n\t"
- "stxvw4x 43, %6, %2 \n\t"
- "stxvw4x 44, %7, %2 \n\t"
- "stxvw4x 45, %8, %2 \n\t"
- "stxvw4x 46, %9, %2 \n\t"
- "stxvw4x 47, %10, %2 \n"
+ "stxvd2x 40, 0, %2 \n\t"
+ "stxvd2x 41, %4, %2 \n\t"
+ "stxvd2x 42, %5, %2 \n\t"
+ "stxvd2x 43, %6, %2 \n\t"
+ "stxvd2x 44, %7, %2 \n\t"
+ "stxvd2x 45, %8, %2 \n\t"
+ "stxvd2x 46, %9, %2 \n\t"
+ "stxvd2x 47, %10, %2 \n"
"#n=%1 alpha=%3 x=%0=%2 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
:
@@ -150,14 +150,14 @@ static void sscal_kernel_16_zero (long n, float *x)
".p2align 5 \n"
"1: \n\t"
- "stxvw4x %x3, 0, %2 \n\t"
- "stxvw4x %x3, %4, %2 \n\t"
- "stxvw4x %x3, %5, %2 \n\t"
- "stxvw4x %x3, %6, %2 \n\t"
- "stxvw4x %x3, %7, %2 \n\t"
- "stxvw4x %x3, %8, %2 \n\t"
- "stxvw4x %x3, %9, %2 \n\t"
- "stxvw4x %x3, %10, %2 \n\t"
+ "stxvd2x %x3, 0, %2 \n\t"
+ "stxvd2x %x3, %4, %2 \n\t"
+ "stxvd2x %x3, %5, %2 \n\t"
+ "stxvd2x %x3, %6, %2 \n\t"
+ "stxvd2x %x3, %7, %2 \n\t"
+ "stxvd2x %x3, %8, %2 \n\t"
+ "stxvd2x %x3, %9, %2 \n\t"
+ "stxvd2x %x3, %10, %2 \n\t"
"addi %2, %2, 128 \n\t"
diff --git a/kernel/power/sswap_microk_power8.c b/kernel/power/sswap_microk_power8.c
index d44f16765..cfefdd6ef 100644
--- a/kernel/power/sswap_microk_power8.c
+++ b/kernel/power/sswap_microk_power8.c
@@ -42,43 +42,43 @@ static void sswap_kernel_32 (long n, float *x, float *y)
".p2align 5 \n"
"1: \n\t"
- "lxvw4x 32, 0, %4 \n\t"
- "lxvw4x 33, %5, %4 \n\t"
- "lxvw4x 34, %6, %4 \n\t"
- "lxvw4x 35, %7, %4 \n\t"
- "lxvw4x 36, %8, %4 \n\t"
- "lxvw4x 37, %9, %4 \n\t"
- "lxvw4x 38, %10, %4 \n\t"
- "lxvw4x 39, %11, %4 \n\t"
+ "lxvd2x 32, 0, %4 \n\t"
+ "lxvd2x 33, %5, %4 \n\t"
+ "lxvd2x 34, %6, %4 \n\t"
+ "lxvd2x 35, %7, %4 \n\t"
+ "lxvd2x 36, %8, %4 \n\t"
+ "lxvd2x 37, %9, %4 \n\t"
+ "lxvd2x 38, %10, %4 \n\t"
+ "lxvd2x 39, %11, %4 \n\t"
- "lxvw4x 40, 0, %3 \n\t"
- "lxvw4x 41, %5, %3 \n\t"
- "lxvw4x 42, %6, %3 \n\t"
- "lxvw4x 43, %7, %3 \n\t"
- "lxvw4x 44, %8, %3 \n\t"
- "lxvw4x 45, %9, %3 \n\t"
- "lxvw4x 46, %10, %3 \n\t"
- "lxvw4x 47, %11, %3 \n\t"
+ "lxvd2x 40, 0, %3 \n\t"
+ "lxvd2x 41, %5, %3 \n\t"
+ "lxvd2x 42, %6, %3 \n\t"
+ "lxvd2x 43, %7, %3 \n\t"
+ "lxvd2x 44, %8, %3 \n\t"
+ "lxvd2x 45, %9, %3 \n\t"
+ "lxvd2x 46, %10, %3 \n\t"
+ "lxvd2x 47, %11, %3 \n\t"
- "stxvw4x 32, 0, %3 \n\t"
- "stxvw4x 33, %5, %3 \n\t"
- "stxvw4x 34, %6, %3 \n\t"
- "stxvw4x 35, %7, %3 \n\t"
- "stxvw4x 36, %8, %3 \n\t"
- "stxvw4x 37, %9, %3 \n\t"
- "stxvw4x 38, %10, %3 \n\t"
- "stxvw4x 39, %11, %3 \n\t"
+ "stxvd2x 32, 0, %3 \n\t"
+ "stxvd2x 33, %5, %3 \n\t"
+ "stxvd2x 34, %6, %3 \n\t"
+ "stxvd2x 35, %7, %3 \n\t"
+ "stxvd2x 36, %8, %3 \n\t"
+ "stxvd2x 37, %9, %3 \n\t"
+ "stxvd2x 38, %10, %3 \n\t"
+ "stxvd2x 39, %11, %3 \n\t"
"addi %3, %3, 128 \n\t"
- "stxvw4x 40, 0, %4 \n\t"
- "stxvw4x 41, %5, %4 \n\t"
- "stxvw4x 42, %6, %4 \n\t"
- "stxvw4x 43, %7, %4 \n\t"
- "stxvw4x 44, %8, %4 \n\t"
- "stxvw4x 45, %9, %4 \n\t"
- "stxvw4x 46, %10, %4 \n\t"
- "stxvw4x 47, %11, %4 \n\t"
+ "stxvd2x 40, 0, %4 \n\t"
+ "stxvd2x 41, %5, %4 \n\t"
+ "stxvd2x 42, %6, %4 \n\t"
+ "stxvd2x 43, %7, %4 \n\t"
+ "stxvd2x 44, %8, %4 \n\t"
+ "stxvd2x 45, %9, %4 \n\t"
+ "stxvd2x 46, %10, %4 \n\t"
+ "stxvd2x 47, %11, %4 \n\t"
"addi %4, %4, 128 \n\t"
diff --git a/relapack/LICENSE b/relapack/LICENSE
new file mode 100644
index 000000000..edeb4046e
--- /dev/null
+++ b/relapack/LICENSE
@@ -0,0 +1,22 @@
+The MIT License (MIT)
+
+Copyright (c) 2016 Elmar Peise
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
diff --git a/relapack/Makefile b/relapack/Makefile
new file mode 100644
index 000000000..ddf101bd1
--- /dev/null
+++ b/relapack/Makefile
@@ -0,0 +1,98 @@
+TOPDIR = ..
+include $(TOPDIR)/Makefile.system
+
+
+
+SRC = $(wildcard src/*.c)
+
+SRC1 = \
+ src/slauum.c src/clauum.c src/dlauum.c src/zlauum.c \
+ src/strtri.c src/dtrtri.c src/ctrtri.c src/ztrtri.c \
+ src/spotrf.c src/dpotrf.c src/cpotrf.c src/zpotrf.c \
+ src/sgetrf.c src/dgetrf.c src/cgetrf.c src/zgetrf.c
+
+SRC2 = \
+ src/cgbtrf.c src/cpbtrf.c src/dsytrf_rec2.c src/sgbtrf.c src/ssytrf_rook.c src/zhegst.c src/zsytrf_rec2.c \
+ src/cgemmt.c src/dgbtrf.c src/dsytrf_rook.c src/sgemmt.c src/ssytrf_rook_rec2.c src/zhetrf.c src/zsytrf_rook.c \
+ src/csytrf.c src/dgemmt.c src/dsytrf_rook_rec2.c src/stgsyl.c src/zhetrf_rec2.c src/zsytrf_rook_rec2.c \
+ src/chegst.c src/csytrf_rec2.c src/dtgsyl.c src/strsyl.c src/zhetrf_rook.c src/ztgsyl.c \
+ src/chetrf.c src/csytrf_rook.c src/dtrsyl.c src/spbtrf.c src/strsyl_rec2.c src/zhetrf_rook_rec2.c src/ztrsyl.c \
+ src/chetrf_rec2.c src/csytrf_rook_rec2.c src/dpbtrf.c src/dtrsyl_rec2.c src/ztrsyl_rec2.c \
+ src/chetrf_rook.c src/ctgsyl.c src/ssygst.c src/zgbtrf.c src/zpbtrf.c \
+ src/chetrf_rook_rec2.c src/ctrsyl.c src/dsygst.c src/f2c.c src/ssytrf.c src/zgemmt.c \
+ src/ctrsyl_rec2.c src/dsytrf.c src/lapack_wrappers.c src/ssytrf_rec2.c src/zsytrf.c
+
+SRCX = \
+ src/cgbtrf.c src/cpbtrf.c src/ctrtri.c src/dsytrf_rec2.c src/sgbtrf.c src/ssytrf_rook.c src/zhegst.c src/zsytrf_rec2.c \
+ src/cgemmt.c src/cpotrf.c src/dgbtrf.c src/dsytrf_rook.c src/sgemmt.c src/ssytrf_rook_rec2.c src/zhetrf.c src/zsytrf_rook.c \
+ src/cgetrf.c src/csytrf.c src/dgemmt.c src/dsytrf_rook_rec2.c src/sgetrf.c src/stgsyl.c src/zhetrf_rec2.c src/zsytrf_rook_rec2.c \
+ src/chegst.c src/csytrf_rec2.c src/dgetrf.c src/dtgsyl.c src/slauum.c src/strsyl.c src/zhetrf_rook.c src/ztgsyl.c \
+ src/chetrf.c src/csytrf_rook.c src/dlauum.c src/dtrsyl.c src/spbtrf.c src/strsyl_rec2.c src/zhetrf_rook_rec2.c src/ztrsyl.c \
+ src/chetrf_rec2.c src/csytrf_rook_rec2.c src/dpbtrf.c src/dtrsyl_rec2.c src/spotrf.c src/strtri.c src/zlauum.c src/ztrsyl_rec2.c \
+ src/chetrf_rook.c src/ctgsyl.c src/dpotrf.c src/dtrtri.c src/ssygst.c src/zgbtrf.c src/zpbtrf.c src/ztrtri.c \
+ src/chetrf_rook_rec2.c src/ctrsyl.c src/dsygst.c src/f2c.c src/ssytrf.c src/zgemmt.c src/zpotrf.c \
+ src/clauum.c src/ctrsyl_rec2.c src/dsytrf.c src/lapack_wrappers.c src/ssytrf_rec2.c src/zgetrf.c src/zsytrf.c
+
+OBJS1 = $(SRC1:%.c=%.$(SUFFIX))
+OBJS2 = $(SRC2:%.c=%.o)
+OBJS = $(OBJS1) $(OBJS2)
+
+TEST_SUITS = \
+ slauum dlauum clauum zlauum \
+ spotrf dpotrf cpotrf zpotrf \
+ spbtrf dpbtrf cpbtrf zpbtrf \
+ ssygst dsygst chegst zhegst \
+ ssytrf dsytrf csytrf chetrf zsytrf zhetrf \
+ sgetrf dgetrf cgetrf zgetrf \
+ sgbtrf dgbtrf cgbtrf zgbtrf \
+ strsyl dtrsyl ctrsyl ztrsyl \
+ stgsyl dtgsyl ctgsyl ztgsyl \
+ sgemmt dgemmt cgemmt zgemmt
+TESTS = $(TEST_SUITS:%=test/%.pass) # dummies
+TEST_EXES = $(TEST_SUITS:%=test/%.x)
+
+LINK_TEST = -L$(TOPDIR) -lopenblas -lgfortran -lm
+
+.SECONDARY: $(TEST_EXES)
+.PHONY: test
+
+# ReLAPACK compilation
+
+libs: $(OBJS)
+ @echo "Building ReLAPACK library $(LIBNAME)"
+ $(AR) -r $(TOPDIR)/$(LIBNAME) $(OBJS)
+ $(RANLIB) $(TOPDIR)/$(LIBNAME)
+
+%.$(SUFFIX): %.c config.h
+ $(CC) $(CFLAGS) -c $< -o $@
+
+%.o: %.c config.h
+ $(CC) $(CFLAGS) -c $< -o $@
+
+
+# ReLAPACK testing
+
+test: $(TEST_EXES) $(TESTS)
+ @echo "passed all tests"
+
+test/%.pass: test/%.x
+ @echo -n $*:
+ @./$< > /dev/null && echo " pass" || (echo " FAIL" && ./$<)
+
+test/s%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
+ $(CC) $(CFLAGS) -DDT_PREFIX=s $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)
+
+test/d%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
+ $(CC) $(CFLAGS) -DDT_PREFIX=d $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)
+
+test/c%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
+ $(CC) $(CFLAGS) -DDT_PREFIX=c $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)
+
+test/z%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h
+ $(CC) $(CFLAGS) -DDT_PREFIX=z $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST)
+
+
+# cleaning up
+
+clean:
+ rm -f $(OBJS) test/util.$(SUFFIX) test/*.x
diff --git a/relapack/README.md b/relapack/README.md
new file mode 100644
index 000000000..1947c1748
--- /dev/null
+++ b/relapack/README.md
@@ -0,0 +1,68 @@
+ReLAPACK
+========
+
+[![Build Status](https://travis-ci.org/HPAC/ReLAPACK.svg?branch=master)](https://travis-ci.org/HPAC/ReLAPACK)
+
+[Recursive LAPACK Collection](https://github.com/HPAC/ReLAPACK)
+
+ReLAPACK offers a collection of recursive algorithms for many of LAPACK's
+compute kernels. Since it preserves LAPACK's established interfaces, ReLAPACK
+integrates effortlessly into existing application codes. ReLAPACK's routines
+not only outperform the reference LAPACK but also improve upon the performance
+of tuned implementations, such as OpenBLAS and MKL.
+
+
+Coverage
+--------
+For a detailed list of covered operations and an overview of operations to which
+recursion is not efficiently applicable, see [coverage.md](coverage.md).
+
+
+Installation
+------------
+To compile with the default configuration, simply run `make` to create the
+library `librelapack.a`.
+
+### Linking with MKL
+Note that to link with MKL, you currently need to set the flag
+`COMPLEX_FUNCTIONS_AS_ROUTINES` to `1` to avoid problems in `ctrsyl` and
+`ztrsyl`. For further configuration options see [config.md](config.md).
+
+
+### Dependencies
+ReLAPACK builds on top of [BLAS](http://www.netlib.org/blas/) and unblocked
+kernels from [LAPACK](http://www.netlib.org/lapack/). There are many optimized
+and machine specific implementations of these libraries, which are commonly
+provided by hardware vendors or available as open source (e.g.,
+[OpenBLAS](http://www.openblas.net/)).
+
+
+Testing
+-------
+ReLAPACK's test suite compares its routines numerically with LAPACK's
+counterparts. To set up the tests (located in `test/`) you need to specify
+link flags for BLAS and LAPACK (version 3.5.0 or newer) in `make.inc`; then
+`make test` runs the tests. For details on the performed tests, see
+[test/README.md](test/README.md).
+
+
+Examples
+--------
+Since ReLAPACK replaces parts of LAPACK, any LAPACK example involving the
+covered routines applies directly to ReLAPACK. A few separate examples are
+given in `examples/`. For details, see [examples/README.md](examples/README.md).
+
+
+Citing
+------
+When referencing ReLAPACK, please cite the preprint of the paper
+[Recursive Algorithms for Dense Linear Algebra: The ReLAPACK Collection](http://arxiv.org/abs/1602.06763):
+
+ @article{relapack,
+ author = {Elmar Peise and Paolo Bientinesi},
+ title = {Recursive Algorithms for Dense Linear Algebra: The ReLAPACK Collection},
+ journal = {CoRR},
+ volume = {abs/1602.06763},
+ year = {2016},
+ url = {http://arxiv.org/abs/1602.06763},
+ }
diff --git a/relapack/config.h b/relapack/config.h
new file mode 100644
index 000000000..9113a712d
--- /dev/null
+++ b/relapack/config.h
@@ -0,0 +1,208 @@
+#ifndef RELAPACK_CONFIG_H
+#define RELAPACK_CONFIG_H
+
+// ReLAPACK configuration file.
+// See also config.md
+
+
+////////////////////////////////
+// BLAS/LAPACK object symbols //
+////////////////////////////////
+
+// BLAS routines linked against have a trailing underscore
+#define BLAS_UNDERSCORE 1
+// LAPACK routines linked against have a trailing underscore
+#define LAPACK_UNDERSCORE BLAS_UNDERSCORE
+
+// Complex BLAS/LAPACK routines return their result in the first argument
+// This option must be enabled when linking to MKL for ctrsyl and ztrsyl to
+// work.
+#define COMPLEX_FUNCTIONS_AS_ROUTINES 0
+#ifdef F_INTERFACE_INTEL
+#define COMPLEX_FUNCTIONS_AS_ROUTINES 1
+#endif
+#define BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES COMPLEX_FUNCTIONS_AS_ROUTINES
+#define LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES COMPLEX_FUNCTIONS_AS_ROUTINES
+
+// The BLAS-like extension xgemmt is provided by an external library.
+#define HAVE_XGEMMT 0
+
+
+////////////////////////////
+// Use malloc in ReLAPACK //
+////////////////////////////
+
+#define ALLOW_MALLOC 1
+// allow malloc in xsygst for improved performance
+#define XSYGST_ALLOW_MALLOC ALLOW_MALLOC
+// allow malloc in xsytrf if the passed work buffer is too small
+#define XSYTRF_ALLOW_MALLOC ALLOW_MALLOC
+
+
+////////////////////////////////
+// LAPACK routine replacement //
+////////////////////////////////
+// The following macros specify which routines are included in the library under
+// LAPACK's symbol names: 1 included, 0 not included
+
+#define INCLUDE_ALL 1
+
+#define INCLUDE_XLAUUM INCLUDE_ALL
+#define INCLUDE_SLAUUM INCLUDE_XLAUUM
+#define INCLUDE_DLAUUM INCLUDE_XLAUUM
+#define INCLUDE_CLAUUM INCLUDE_XLAUUM
+#define INCLUDE_ZLAUUM INCLUDE_XLAUUM
+
+#define INCLUDE_XSYGST INCLUDE_ALL
+#define INCLUDE_SSYGST INCLUDE_XSYGST
+#define INCLUDE_DSYGST INCLUDE_XSYGST
+#define INCLUDE_CHEGST INCLUDE_XSYGST
+#define INCLUDE_ZHEGST INCLUDE_XSYGST
+
+#define INCLUDE_XTRTRI INCLUDE_ALL
+#define INCLUDE_STRTRI INCLUDE_XTRTRI
+#define INCLUDE_DTRTRI INCLUDE_XTRTRI
+#define INCLUDE_CTRTRI INCLUDE_XTRTRI
+#define INCLUDE_ZTRTRI INCLUDE_XTRTRI
+
+#define INCLUDE_XPOTRF INCLUDE_ALL
+#define INCLUDE_SPOTRF INCLUDE_XPOTRF
+#define INCLUDE_DPOTRF INCLUDE_XPOTRF
+#define INCLUDE_CPOTRF INCLUDE_XPOTRF
+#define INCLUDE_ZPOTRF INCLUDE_XPOTRF
+
+#define INCLUDE_XPBTRF INCLUDE_ALL
+#define INCLUDE_SPBTRF INCLUDE_XPBTRF
+#define INCLUDE_DPBTRF INCLUDE_XPBTRF
+#define INCLUDE_CPBTRF INCLUDE_XPBTRF
+#define INCLUDE_ZPBTRF INCLUDE_XPBTRF
+
+#define INCLUDE_XSYTRF INCLUDE_ALL
+#define INCLUDE_SSYTRF INCLUDE_XSYTRF
+#define INCLUDE_DSYTRF INCLUDE_XSYTRF
+#define INCLUDE_CSYTRF INCLUDE_XSYTRF
+#define INCLUDE_CHETRF INCLUDE_XSYTRF
+#define INCLUDE_ZSYTRF INCLUDE_XSYTRF
+#define INCLUDE_ZHETRF INCLUDE_XSYTRF
+#define INCLUDE_SSYTRF_ROOK INCLUDE_SSYTRF
+#define INCLUDE_DSYTRF_ROOK INCLUDE_DSYTRF
+#define INCLUDE_CSYTRF_ROOK INCLUDE_CSYTRF
+#define INCLUDE_CHETRF_ROOK INCLUDE_CHETRF
+#define INCLUDE_ZSYTRF_ROOK INCLUDE_ZSYTRF
+#define INCLUDE_ZHETRF_ROOK INCLUDE_ZHETRF
+
+#define INCLUDE_XGETRF INCLUDE_ALL
+#define INCLUDE_SGETRF INCLUDE_XGETRF
+#define INCLUDE_DGETRF INCLUDE_XGETRF
+#define INCLUDE_CGETRF INCLUDE_XGETRF
+#define INCLUDE_ZGETRF INCLUDE_XGETRF
+
+#define INCLUDE_XGBTRF INCLUDE_ALL
+#define INCLUDE_SGBTRF INCLUDE_XGBTRF
+#define INCLUDE_DGBTRF INCLUDE_XGBTRF
+#define INCLUDE_CGBTRF INCLUDE_XGBTRF
+#define INCLUDE_ZGBTRF INCLUDE_XGBTRF
+
+#define INCLUDE_XTRSYL INCLUDE_ALL
+#define INCLUDE_STRSYL INCLUDE_XTRSYL
+#define INCLUDE_DTRSYL INCLUDE_XTRSYL
+#define INCLUDE_CTRSYL INCLUDE_XTRSYL
+#define INCLUDE_ZTRSYL INCLUDE_XTRSYL
+
+#define INCLUDE_XTGSYL INCLUDE_ALL
+#define INCLUDE_STGSYL INCLUDE_XTGSYL
+#define INCLUDE_DTGSYL INCLUDE_XTGSYL
+#define INCLUDE_CTGSYL INCLUDE_XTGSYL
+#define INCLUDE_ZTGSYL INCLUDE_XTGSYL
+
+#define INCLUDE_XGEMMT 0
+#define INCLUDE_SGEMMT INCLUDE_XGEMMT
+#define INCLUDE_DGEMMT INCLUDE_XGEMMT
+#define INCLUDE_CGEMMT INCLUDE_XGEMMT
+#define INCLUDE_ZGEMMT INCLUDE_XGEMMT
+
+
+/////////////////////
+// crossover sizes //
+/////////////////////
+
+// default crossover size
+#define CROSSOVER 24
+
+// individual crossover sizes
+#define CROSSOVER_XLAUUM CROSSOVER
+#define CROSSOVER_SLAUUM CROSSOVER_XLAUUM
+#define CROSSOVER_DLAUUM CROSSOVER_XLAUUM
+#define CROSSOVER_CLAUUM CROSSOVER_XLAUUM
+#define CROSSOVER_ZLAUUM CROSSOVER_XLAUUM
+
+#define CROSSOVER_XSYGST CROSSOVER
+#define CROSSOVER_SSYGST CROSSOVER_XSYGST
+#define CROSSOVER_DSYGST CROSSOVER_XSYGST
+#define CROSSOVER_CHEGST CROSSOVER_XSYGST
+#define CROSSOVER_ZHEGST CROSSOVER_XSYGST
+
+#define CROSSOVER_XTRTRI CROSSOVER
+#define CROSSOVER_STRTRI CROSSOVER_XTRTRI
+#define CROSSOVER_DTRTRI CROSSOVER_XTRTRI
+#define CROSSOVER_CTRTRI CROSSOVER_XTRTRI
+#define CROSSOVER_ZTRTRI CROSSOVER_XTRTRI
+
+#define CROSSOVER_XPOTRF CROSSOVER
+#define CROSSOVER_SPOTRF CROSSOVER_XPOTRF
+#define CROSSOVER_DPOTRF CROSSOVER_XPOTRF
+#define CROSSOVER_CPOTRF CROSSOVER_XPOTRF
+#define CROSSOVER_ZPOTRF CROSSOVER_XPOTRF
+
+#define CROSSOVER_XPBTRF CROSSOVER
+#define CROSSOVER_SPBTRF CROSSOVER_XPBTRF
+#define CROSSOVER_DPBTRF CROSSOVER_XPBTRF
+#define CROSSOVER_CPBTRF CROSSOVER_XPBTRF
+#define CROSSOVER_ZPBTRF CROSSOVER_XPBTRF
+
+#define CROSSOVER_XSYTRF CROSSOVER
+#define CROSSOVER_SSYTRF CROSSOVER_XSYTRF
+#define CROSSOVER_DSYTRF CROSSOVER_XSYTRF
+#define CROSSOVER_CSYTRF CROSSOVER_XSYTRF
+#define CROSSOVER_CHETRF CROSSOVER_XSYTRF
+#define CROSSOVER_ZSYTRF CROSSOVER_XSYTRF
+#define CROSSOVER_ZHETRF CROSSOVER_XSYTRF
+#define CROSSOVER_SSYTRF_ROOK CROSSOVER_SSYTRF
+#define CROSSOVER_DSYTRF_ROOK CROSSOVER_DSYTRF
+#define CROSSOVER_CSYTRF_ROOK CROSSOVER_CSYTRF
+#define CROSSOVER_CHETRF_ROOK CROSSOVER_CHETRF
+#define CROSSOVER_ZSYTRF_ROOK CROSSOVER_ZSYTRF
+#define CROSSOVER_ZHETRF_ROOK CROSSOVER_ZHETRF
+
+#define CROSSOVER_XGETRF CROSSOVER
+#define CROSSOVER_SGETRF CROSSOVER_XGETRF
+#define CROSSOVER_DGETRF CROSSOVER_XGETRF
+#define CROSSOVER_CGETRF CROSSOVER_XGETRF
+#define CROSSOVER_ZGETRF CROSSOVER_XGETRF
+
+#define CROSSOVER_XGBTRF CROSSOVER
+#define CROSSOVER_SGBTRF CROSSOVER_XGBTRF
+#define CROSSOVER_DGBTRF CROSSOVER_XGBTRF
+#define CROSSOVER_CGBTRF CROSSOVER_XGBTRF
+#define CROSSOVER_ZGBTRF CROSSOVER_XGBTRF
+
+#define CROSSOVER_XTRSYL CROSSOVER
+#define CROSSOVER_STRSYL CROSSOVER_XTRSYL
+#define CROSSOVER_DTRSYL CROSSOVER_XTRSYL
+#define CROSSOVER_CTRSYL CROSSOVER_XTRSYL
+#define CROSSOVER_ZTRSYL CROSSOVER_XTRSYL
+
+#define CROSSOVER_XTGSYL CROSSOVER
+#define CROSSOVER_STGSYL CROSSOVER_XTGSYL
+#define CROSSOVER_DTGSYL CROSSOVER_XTGSYL
+#define CROSSOVER_CTGSYL CROSSOVER_XTGSYL
+#define CROSSOVER_ZTGSYL CROSSOVER_XTGSYL
+
+// sytrf helper routine
+#define CROSSOVER_XGEMMT CROSSOVER_XSYTRF
+#define CROSSOVER_SGEMMT CROSSOVER_XGEMMT
+#define CROSSOVER_DGEMMT CROSSOVER_XGEMMT
+#define CROSSOVER_CGEMMT CROSSOVER_XGEMMT
+#define CROSSOVER_ZGEMMT CROSSOVER_XGEMMT
+
+#endif /* RELAPACK_CONFIG_H */
diff --git a/relapack/config.md b/relapack/config.md
new file mode 100644
index 000000000..ea14be16a
--- /dev/null
+++ b/relapack/config.md
@@ -0,0 +1,87 @@
+RELAPACK Configuration
+======================
+
+ReLAPACK has two configuration files: `make.inc`, which is included by the
+Makefile, and `config.h` which is included in the source files.
+
+
+Build and Testing Environment
+-----------------------------
+The build environment (compiler and flags) and the test configuration (linker
+flags for BLAS and LAPACK) are specified in `make.inc`. The test matrix size
+and error bounds are defined in `test/config.h`.
+
+The library `librelapack.a` is compiled by invoking `make`. The tests are
+performed by either `make test` or calling `make` in the test folder.
+
+
+BLAS/LAPACK complex function interfaces
+---------------------------------------
+For BLAS and LAPACK functions that return a complex number, there exist two
+conflicting (FORTRAN compiler dependent) calling conventions: either the result
+is returned as a `struct` of two floating point numbers or an additional first
+argument with a pointer to such a `struct` is used. By default ReLAPACK uses
+the former (which is what gfortran uses), but it can switch to the latter by
+setting `COMPLEX_FUNCTIONS_AS_ROUTINES` (or explicitly the BLAS and LAPACK
+specific counterparts) to `1` in `config.h`.
+
+**For MKL, `COMPLEX_FUNCTIONS_AS_ROUTINES` must be set to `1`.**
+
+(Using the wrong convention will break `ctrsyl` and `ztrsyl` and the test cases
+will segfault or return errors on the order of 1 or larger.)
+
+
+BLAS extension `xgemmt`
+-----------------------
+The LDL decompositions require a general matrix-matrix product that updates only
+a triangular matrix called `xgemmt`. If the BLAS implementation linked against
+provides such a routine, set the flag `HAVE_XGEMMT` to `1` in `config.h`;
+otherwise, ReLAPACK uses its own recursive implementation of these kernels.
+
+`xgemmt` is provided by MKL.
+
+
+Routine Selection
+-----------------
+ReLAPACK's routines are named `RELAPACK_X` (e.g., `RELAPACK_dgetrf`). If the
+corresponding `INCLUDE_X` flag in `config.h` (e.g., `INCLUDE_DGETRF`) is set to
+`1`, ReLAPACK additionally provides a wrapper under the LAPACK name (e.g.,
+`dgetrf_`). By default, wrappers for all routines are enabled.
+
+
+Crossover Size
+--------------
+The crossover size determines below which matrix sizes ReLAPACK's recursive
+algorithms switch to LAPACK's unblocked routines to avoid tiny BLAS Level 3
+routines. The crossover size is set in `config.h` and can be chosen either
+globally for the entire library, by operation, or individually by routine.
+
+
+Allowing Temporary Buffers
+--------------------------
+Two of ReLAPACK's routines make use of temporary buffers, which are allocated
+and freed within ReLAPACK. Setting `ALLOW_MALLOC` (or one of the routine
+specific counterparts) to 0 in `config.h` will disable these buffers. The
+affected routines are:
+
+ * `xsytrf`: The LDL decomposition requires a buffer of size n^2 / 2. As in
+ LAPACK, this size can be queried by setting `lWork = -1` and the passed
+ buffer will be used if it is large enough; only if it is not, a local buffer
+ will be allocated.
+
+ The advantage of this mechanism is that ReLAPACK will seamlessly work even
+ with codes that statically provide too little memory instead of breaking
+ them.
+
+ * `xsygst`: The reduction of a real symmetric-definite generalized eigenproblem
+ to standard form can use an auxiliary buffer of size n^2 / 2 to avoid
+ redundant computations. It thereby performs about 30% less FLOPs than
+ LAPACK.
+
+
+FORTRAN symbol names
+--------------------
+ReLAPACK is commonly linked to BLAS and LAPACK with standard FORTRAN interfaces.
+Since these libraries usually append an underscore to their symbol names, ReLAPACK
+has configuration switches in `config.h` to adjust the corresponding routine
+names.
diff --git a/relapack/coverage.md b/relapack/coverage.md
new file mode 100644
index 000000000..8406b2078
--- /dev/null
+++ b/relapack/coverage.md
@@ -0,0 +1,212 @@
+Coverage of ReLAPACK
+====================
+
+This file lists all LAPACK compute routines that are covered by recursive
+algorithms in ReLAPACK; it also lists all of LAPACK's blocked algorithms which
+are not (yet) part of ReLAPACK.
+
+<!-- START doctoc generated TOC please keep comment here to allow auto update -->
+<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
+**Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)*
+
+- [List of covered LAPACK routines](#list-of-covered-lapack-routines)
+ - [`xlauum`](#xlauum)
+ - [`xsygst`](#xsygst)
+ - [`xtrtri`](#xtrtri)
+ - [`xpotrf`](#xpotrf)
+ - [`xpbtrf`](#xpbtrf)
+ - [`xsytrf`](#xsytrf)
+ - [`xgetrf`](#xgetrf)
+ - [`xgbtrf`](#xgbtrf)
+ - [`xtrsyl`](#xtrsyl)
+ - [`xtgsyl`](#xtgsyl)
+- [Covered BLAS extension](#covered-blas-extension)
+ - [`xgemmt`](#xgemmt)
+- [Not covered yet](#not-covered-yet)
+ - [`xpstrf`](#xpstrf)
+- [Not covered: extra FLOPs](#not-covered-extra-flops)
+ - [QR decomposition (and related)](#qr-decomposition-and-related)
+ - [Symmetric reduction to tridiagonal](#symmetric-reduction-to-tridiagonal)
+ - [Symmetric reduction to bidiagonal](#symmetric-reduction-to-bidiagonal)
+ - [Reduction to upper Hessenberg](#reduction-to-upper-hessenberg)
+
+<!-- END doctoc generated TOC please keep comment here to allow auto update -->
+
+
+List of covered LAPACK routines
+-------------------------------
+
+### `xlauum`
+Multiplication of a triangular matrix with its (complex conjugate) transpose,
+resulting in a symmetric (Hermitian) matrix.
+
+Routines: `slauum`, `dlauum`, `clauum`, `zlauum`
+
+Operations:
+* A = L^T L
+* A = U U^T
+
+### `xsygst`
+Simultaneous two-sided multiplication of a symmetric matrix with a triangular
+matrix and its transpose
+
+Routines: `ssygst`, `dsygst`, `chegst`, `zhegst`
+
+Operations:
+* A = inv(L) A inv(L^T)
+* A = inv(U^T) A inv(U)
+* A = L^T A L
+* A = U A U^T
+
+### `xtrtri`
+Inversion of a triangular matrix
+
+Routines: `strtri`, `dtrtri`, `ctrtri`, `ztrtri`
+
+Operations:
+* L = inv(L)
+* U = inv(U)
+
+### `xpotrf`
+Cholesky decomposition of a symmetric (Hermitian) positive definite matrix
+
+Routines: `spotrf`, `dpotrf`, `cpotrf`, `zpotrf`
+
+Operations:
+* L L^T = A
+* U^T U = A
+
+### `xpbtrf`
+Cholesky decomposition of a banded symmetric (Hermitian) positive definite matrix
+
+Routines: `spbtrf`, `dpbtrf`, `cpbtrf`, `zpbtrf`
+
+Operations:
+* L L^T = A
+* U^T U = A
+
+### `xsytrf`
+LDL decomposition of a symmetric (or Hermitian) matrix
+
+Routines:
+* `ssytrf`, `dsytrf`, `csytrf`, `chetrf`, `zsytrf`, `zhetrf`,
+* `ssytrf_rook`, `dsytrf_rook`, `csytrf_rook`, `chetrf_rook`, `zsytrf_rook`,
+ `zhetrf_rook`
+
+Operations:
+* L D L^T = A
+* U^T D U = A
+
+### `xgetrf`
+LU decomposition of a general matrix with pivoting
+
+Routines: `sgetrf`, `dgetrf`, `cgetrf`, `zgetrf`
+
+Operation: P L U = A
+
+### `xgbtrf`
+LU decomposition of a general banded matrix with pivoting
+
+Routines: `sgbtrf`, `dgbtrf`, `cgbtrf`, `zgbtrf`
+
+Operation: L U = A
+
+### `xtrsyl`
+Solution of the quasi-triangular Sylvester equation
+
+Routines: `strsyl`, `dtrsyl`, `ctrsyl`, `ztrsyl`
+
+Operations:
+* A X + X B = C -> X
+* A^T X + X B = C -> X
+* A X + X B^T = C -> X
+* A^T X + X B^T = C -> X
+* A X - X B = C -> X
+* A^T X - X B = C -> X
+* A X - X B^T = C -> X
+* A^T X - X B^T = C -> X
+
+### `xtgsyl`
+Solution of the generalized Sylvester equations
+
+Routines: `stgsyl`, `dtgsyl`, `ctgsyl`, `ztgsyl`
+
+Operations:
+* A R - L B = C, D R - L E = F -> L, R
+* A^T R + D^T L = C, R B^T - L E^T = -F -> L, R
+
+
+Covered BLAS extension
+----------------------
+
+### `xgemmt`
+Matrix-matrix product updating only a triangular part of the result
+
+Routines: `sgemmt`, `dgemmt`, `cgemmt`, `zgemmt`
+
+Operations:
+* C = alpha A B + beta C
+* C = alpha A B^T + beta C
+* C = alpha A^T B + beta C
+* C = alpha A^T B^T + beta C
+
+
+Not covered yet
+---------------
+The following operation is implemented as a blocked algorithm in LAPACK but
+currently not yet covered in ReLAPACK as a recursive algorithm
+
+### `xpstrf`
+Cholesky decomposition of a positive semi-definite matrix with complete pivoting.
+
+Routines: `spstrf`, `dpstrf`, `cpstrf`, `zpstrf`
+
+Operations:
+* P L L^T P^T = A
+* P U^T U P^T = A
+
+
+Not covered: extra FLOPs
+------------------------
+The following routines are not covered because recursive variants would require
+considerably more FLOPs or operate on banded matrices.
+
+### QR decomposition (and related)
+Routines:
+* `sgeqrf`, `dgeqrf`, `cgeqrf`, `zgeqrf`
+* `sgerqf`, `dgerqf`, `cgerqf`, `zgerqf`
+* `sgeqlf`, `dgeqlf`, `cgeqlf`, `zgeqlf`
+* `sgelqf`, `dgelqf`, `cgelqf`, `zgelqf`
+* `stzrzf`, `dtzrzf`, `ctzrzf`, `ztzrzf`
+
+Operations: Q R = A, R Q = A, Q L = A, L Q = A, R Z = A
+
+Routines for multiplication with Q:
+* `sormqr`, `dormqr`, `cunmqr`, `zunmqr`
+* `sormrq`, `dormrq`, `cunmrq`, `zunmrq`
+* `sormql`, `dormql`, `cunmql`, `zunmql`
+* `sormlq`, `dormlq`, `cunmlq`, `zunmlq`
+* `sormrz`, `dormrz`, `cunmrz`, `zunmrz`
+
+Operations: C = Q C, C = C Q, C = Q^T C, C = C Q^T
+
+Routines for construction of Q:
+* `sorgqr`, `dorgqr`, `cungqr`, `zungqr`
+* `sorgrq`, `dorgrq`, `cungrq`, `zungrq`
+* `sorgql`, `dorgql`, `cungql`, `zungql`
+* `sorglq`, `dorglq`, `cunglq`, `zunglq`
+
+### Symmetric reduction to tridiagonal
+Routines: `ssytrd`, `dsytrd`, `chetrd`, `zhetrd`
+
+Operation: Q T Q^T = A
+
+### Symmetric reduction to bidiagonal
+Routines: `sgebrd`, `dgebrd`, `cgebrd`, `zgebrd`
+
+Operation: Q B P^T = A
+
+### Reduction to upper Hessenberg
+Routines: `sgehrd`, `dgehrd`, `cgehrd`, `zgehrd`
+
+Operation: Q H Q^T = A
diff --git a/relapack/inc/relapack.h b/relapack/inc/relapack.h
new file mode 100644
index 000000000..e421f352b
--- /dev/null
+++ b/relapack/inc/relapack.h
@@ -0,0 +1,67 @@
+#ifndef RELAPACK_H
+#define RELAPACK_H
+
+void RELAPACK_slauum(const char *, const int *, float *, const int *, int *);
+void RELAPACK_dlauum(const char *, const int *, double *, const int *, int *);
+void RELAPACK_clauum(const char *, const int *, float *, const int *, int *);
+void RELAPACK_zlauum(const char *, const int *, double *, const int *, int *);
+
+void RELAPACK_strtri(const char *, const char *, const int *, float *, const int *, int *);
+void RELAPACK_dtrtri(const char *, const char *, const int *, double *, const int *, int *);
+void RELAPACK_ctrtri(const char *, const char *, const int *, float *, const int *, int *);
+void RELAPACK_ztrtri(const char *, const char *, const int *, double *, const int *, int *);
+
+void RELAPACK_spotrf(const char *, const int *, float *, const int *, int *);
+void RELAPACK_dpotrf(const char *, const int *, double *, const int *, int *);
+void RELAPACK_cpotrf(const char *, const int *, float *, const int *, int *);
+void RELAPACK_zpotrf(const char *, const int *, double *, const int *, int *);
+
+void RELAPACK_spbtrf(const char *, const int *, const int *, float *, const int *, int *);
+void RELAPACK_dpbtrf(const char *, const int *, const int *, double *, const int *, int *);
+void RELAPACK_cpbtrf(const char *, const int *, const int *, float *, const int *, int *);
+void RELAPACK_zpbtrf(const char *, const int *, const int *, double *, const int *, int *);
+
+void RELAPACK_ssytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
+void RELAPACK_dsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
+void RELAPACK_csytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
+void RELAPACK_chetrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
+void RELAPACK_zsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
+void RELAPACK_zhetrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
+void RELAPACK_ssytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
+void RELAPACK_dsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
+void RELAPACK_csytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
+void RELAPACK_chetrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
+void RELAPACK_zsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
+void RELAPACK_zhetrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
+
+void RELAPACK_sgetrf(const int *, const int *, float *, const int *, int *, int *);
+void RELAPACK_dgetrf(const int *, const int *, double *, const int *, int *, int *);
+void RELAPACK_cgetrf(const int *, const int *, float *, const int *, int *, int *);
+void RELAPACK_zgetrf(const int *, const int *, double *, const int *, int *, int *);
+
+void RELAPACK_sgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *);
+void RELAPACK_dgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *);
+void RELAPACK_cgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *);
+void RELAPACK_zgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *);
+
+void RELAPACK_ssygst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *);
+void RELAPACK_dsygst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *);
+void RELAPACK_chegst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *);
+void RELAPACK_zhegst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *);
+
+void RELAPACK_strsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *);
+void RELAPACK_dtrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *);
+void RELAPACK_ctrsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *);
+void RELAPACK_ztrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *);
+
+void RELAPACK_stgsyl(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *);
+void RELAPACK_dtgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *);
+void RELAPACK_ctgsyl(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *);
+void RELAPACK_ztgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *);
+
+void RELAPACK_sgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
+void RELAPACK_dgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+void RELAPACK_cgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
+void RELAPACK_zgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+
+#endif /* RELAPACK_H */
diff --git a/relapack/src/blas.h b/relapack/src/blas.h
new file mode 100644
index 000000000..7441c1033
--- /dev/null
+++ b/relapack/src/blas.h
@@ -0,0 +1,61 @@
+#ifndef BLAS_H
+#define BLAS_H
+
+extern void BLAS(sswap)(const int *, float *, const int *, float *, const int *);
+extern void BLAS(dswap)(const int *, double *, const int *, double *, const int *);
+extern void BLAS(cswap)(const int *, float *, const int *, float *, const int *);
+extern void BLAS(zswap)(const int *, double *, const int *, double *, const int *);
+
+extern void BLAS(sscal)(const int *, const float *, float *, const int *);
+extern void BLAS(dscal)(const int *, const double *, double *, const int *);
+extern void BLAS(cscal)(const int *, const float *, float *, const int *);
+extern void BLAS(zscal)(const int *, const double *, double *, const int *);
+
+extern void BLAS(saxpy)(const int *, const float *, const float *, const int *, float *, const int *);
+extern void BLAS(daxpy)(const int *, const double *, const double *, const int *, double *, const int *);
+extern void BLAS(caxpy)(const int *, const float *, const float *, const int *, float *, const int *);
+extern void BLAS(zaxpy)(const int *, const double *, const double *, const int *, double *, const int *);
+
+extern void BLAS(sgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
+extern void BLAS(dgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
+extern void BLAS(cgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
+extern void BLAS(zgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
+
+extern void BLAS(sgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
+extern void BLAS(dgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
+extern void BLAS(cgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
+extern void BLAS(zgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
+
+extern void BLAS(strsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *);
+extern void BLAS(dtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
+extern void BLAS(ctrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *);
+extern void BLAS(ztrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
+
+extern void BLAS(strmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *);
+extern void BLAS(dtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
+extern void BLAS(ctrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *);
+extern void BLAS(ztrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *);
+
+extern void BLAS(ssyrk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *);
+extern void BLAS(dsyrk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *);
+extern void BLAS(cherk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *);
+extern void BLAS(zherk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *);
+
+extern void BLAS(ssymm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
+extern void BLAS(dsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+extern void BLAS(chemm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
+extern void BLAS(zhemm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+
+extern void BLAS(ssyr2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
+extern void BLAS(dsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+extern void BLAS(cher2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
+extern void BLAS(zher2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
+
+#if HAVE_XGEMMT
+extern void BLAS(sgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
+extern void BLAS(dgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
+extern void BLAS(cgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*);
+extern void BLAS(zgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*);
+#endif
+
+#endif /* BLAS_H */
diff --git a/relapack/src/cgbtrf.c b/relapack/src/cgbtrf.c
new file mode 100644
index 000000000..90b2c8789
--- /dev/null
+++ b/relapack/src/cgbtrf.c
@@ -0,0 +1,230 @@
+#include "relapack.h"
+#include "stdlib.h"
+
+static void RELAPACK_cgbtrf_rec(const int *, const int *, const int *,
+ const int *, float *, const int *, int *, float *, const int *, float *,
+ const int *, int *);
+
+
+/** CGBTRF computes an LU factorization of a complex m-by-n band matrix A using partial pivoting with row interchanges.
+ *
+ * This routine is functionally equivalent to LAPACK's cgbtrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d0/d3a/cgbtrf_8f.html
+ * */
+void RELAPACK_cgbtrf(
+ const int *m, const int *n, const int *kl, const int *ku,
+ float *Ab, const int *ldAb, int *ipiv,
+ int *info
+) {
+
+ // Check arguments (mirrors LAPACK cgbtrf: negative dims/bandwidths and
+ // a leading dimension smaller than 2*kl+ku+1 are rejected)
+ *info = 0;
+ if (*m < 0)
+ *info = -1;
+ else if (*n < 0)
+ *info = -2;
+ else if (*kl < 0)
+ *info = -3;
+ else if (*ku < 0)
+ *info = -4;
+ else if (*ldAb < 2 * *kl + *ku + 1)
+ *info = -6;
+ if (*info) {
+ const int minfo = -*info;
+ LAPACK(xerbla)("CGBTRF", &minfo);
+ return;
+ }
+
+ // Constant
+ const float ZERO[] = { 0., 0. };
+
+ // Result upper band width (ku fill-in grows by kl during pivoting)
+ const int kv = *ku + *kl;
+
+ // Unskew A: with leading dimension ldAb-1, each column of the banded
+ // storage "slides down" one row, giving a dense-looking view whose
+ // (i,j) entry is A(i,j). The factor 2 throughout is the interleaved
+ // (re,im) complex storage.
+ const int ldA[] = { *ldAb - 1 };
+ float *const A = Ab + 2 * kv;
+
+ // Zero upper diagonal fill-in elements
+ int i, j;
+ for (j = 0; j < *n; j++) {
+ float *const A_j = A + 2 * *ldA * j;
+ for (i = MAX(0, j - kv); i < j - *ku; i++)
+ A_j[2 * i] = A_j[2 * i + 1] = 0.;
+ }
+
+ // Allocate work space: Workl buffers rows swapped out below the band,
+ // Worku buffers the triangular part of the right-hand side panel
+ const int n1 = CREC_SPLIT(*n);
+ const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
+ const int nWorkl = (kv > n1) ? n1 : kv;
+ const int mWorku = (*kl > n1) ? n1 : *kl;
+ const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
+ // NOTE(review): malloc results are not checked; on allocation failure
+ // claset below dereferences NULL — consider setting *info and bailing out
+ float *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(float));
+ float *Worku = malloc(mWorku * nWorku * 2 * sizeof(float));
+ LAPACK(claset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
+ LAPACK(claset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku);
+
+ // Recursive kernel
+ RELAPACK_cgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info);
+
+ // Free work space
+ free(Workl);
+ free(Worku);
+}
+
+
+/** cgbtrf's recursive compute kernel */
+/** cgbtrf's recursive compute kernel.
+ *
+ * Factors the band matrix by splitting the columns into [left | right],
+ * recursing on the left panel, applying its pivots and triangular solve to
+ * the right panel (partly via the compact Workl/Worku buffers so updates
+ * stay inside the band), performing the trailing Schur update, and recursing
+ * on the bottom-right block. All "2 *" factors are interleaved (re,im)
+ * complex storage. */
+static void RELAPACK_cgbtrf_rec(
+ const int *m, const int *n, const int *kl, const int *ku,
+ float *Ab, const int *ldAb, int *ipiv,
+ float *Workl, const int *ldWorkl, float *Worku, const int *ldWorku,
+ int *info
+) {
+
+ if (*n <= MAX(CROSSOVER_CGBTRF, 1)) {
+ // Unblocked
+ LAPACK(cgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info);
+ return;
+ }
+
+ // Constants
+ const float ONE[] = { 1., 0. };
+ const float MONE[] = { -1., 0. };
+ const int iONE[] = { 1 };
+
+ // Loop iterators
+ int i, j;
+
+ // Output upper band width
+ const int kv = *ku + *kl;
+
+ // Unskew A (dense view of the band, see RELAPACK_cgbtrf)
+ const int ldA[] = { *ldAb - 1 };
+ float *const A = Ab + 2 * kv;
+
+ // Splitting
+ const int n1 = MIN(CREC_SPLIT(*n), *kl);
+ const int n2 = *n - n1;
+ const int m1 = MIN(n1, *m);
+ const int m2 = *m - m1;
+ const int mn1 = MIN(m1, n1);
+ const int mn2 = MIN(m2, n2);
+
+ // Ab_L *
+ // Ab_BR
+ float *const Ab_L = Ab;
+ float *const Ab_BR = Ab + 2 * *ldAb * n1;
+
+ // A_L A_R
+ float *const A_L = A;
+ float *const A_R = A + 2 * *ldA * n1;
+
+ // A_TL A_TR
+ // A_BL A_BR
+ float *const A_TL = A;
+ float *const A_TR = A + 2 * *ldA * n1;
+ float *const A_BL = A + 2 * m1;
+ float *const A_BR = A + 2 * *ldA * n1 + 2 * m1;
+
+ // ipiv_T
+ // ipiv_B
+ int *const ipiv_T = ipiv;
+ int *const ipiv_B = ipiv + n1;
+
+ // Banded splitting: only the first kv columns of the right panel and the
+ // first kl rows of the bottom panel intersect the band
+ const int n21 = MIN(n2, kv - n1);
+ const int n22 = MIN(n2 - n21, n1);
+ const int m21 = MIN(m2, *kl - m1);
+ const int m22 = MIN(m2 - m21, m1);
+
+ // n1 n21 n22
+ // m * A_Rl ARr
+ float *const A_Rl = A_R;
+ float *const A_Rr = A_R + 2 * *ldA * n21;
+
+ // n1 n21 n22
+ // m1 * A_TRl A_TRr
+ // m21 A_BLt A_BRtl A_BRtr
+ // m22 A_BLb A_BRbl A_BRbr
+ float *const A_TRl = A_TR;
+ float *const A_TRr = A_TR + 2 * *ldA * n21;
+ float *const A_BLt = A_BL;
+ float *const A_BLb = A_BL + 2 * m21;
+ float *const A_BRtl = A_BR;
+ float *const A_BRtr = A_BR + 2 * *ldA * n21;
+ float *const A_BRbl = A_BR + 2 * m21;
+ float *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * m21;
+
+ // recursion(Ab_L, ipiv_T)
+ RELAPACK_cgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info);
+
+ // Workl = A_BLb (rows that would fall outside the band are parked here)
+ LAPACK(clacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl);
+
+ // partially redo swaps in A_L
+ for (i = 0; i < mn1; i++) {
+ const int ip = ipiv_T[i] - 1;
+ if (ip != i) {
+ if (ip < *kl)
+ BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA);
+ else
+ BLAS(cswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl);
+ }
+ }
+
+ // apply pivots to A_Rl
+ LAPACK(claswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE);
+
+ // apply pivots to A_Rr columnwise (each column starts at a different row,
+ // so claswp cannot be used)
+ for (j = 0; j < n22; j++) {
+ float *const A_Rrj = A_Rr + 2 * *ldA * j;
+ for (i = j; i < mn1; i++) {
+ const int ip = ipiv_T[i] - 1;
+ if (ip != i) {
+ const float tmpr = A_Rrj[2 * i];
+ const float tmpc = A_Rrj[2 * i + 1];
+ A_Rrj[2 * i] = A_Rrj[2 * ip];
+ // FIX: read the imaginary part from column j (A_Rrj), not from
+ // column 0 (A_Rr) — the old code corrupted the swapped value
+ A_Rrj[2 * i + 1] = A_Rrj[2 * ip + 1];
+ A_Rrj[2 * ip] = tmpr;
+ A_Rrj[2 * ip + 1] = tmpc;
+ }
+ }
+ }
+
+ // A_TRl = A_TL \ A_TRl
+ BLAS(ctrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
+ // Worku = A_TRr
+ LAPACK(clacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku);
+ // Worku = A_TL \ Worku
+ BLAS(ctrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku);
+ // A_TRr = Worku
+ LAPACK(clacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA);
+ // A_BRtl = A_BRtl - A_BLt * A_TRl
+ BLAS(cgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA);
+ // A_BRbl = A_BRbl - Workl * A_TRl
+ BLAS(cgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA);
+ // A_BRtr = A_BRtr - A_BLt * Worku
+ BLAS(cgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA);
+ // A_BRbr = A_BRbr - Workl * Worku
+ BLAS(cgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA);
+
+ // partially undo swaps in A_L
+ for (i = mn1 - 1; i >= 0; i--) {
+ const int ip = ipiv_T[i] - 1;
+ if (ip != i) {
+ if (ip < *kl)
+ BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA);
+ else
+ BLAS(cswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl);
+ }
+ }
+
+ // recursion(Ab_BR, ipiv_B)
+ RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
+ if (*info)
+ *info += n1;
+ // shift pivots (they were local to the bottom-right block)
+ for (i = 0; i < mn2; i++)
+ ipiv_B[i] += n1;
+}
diff --git a/relapack/src/cgemmt.c b/relapack/src/cgemmt.c
new file mode 100644
index 000000000..28e2b00b0
--- /dev/null
+++ b/relapack/src/cgemmt.c
@@ -0,0 +1,167 @@
+#include "relapack.h"
+
+static void RELAPACK_cgemmt_rec(const char *, const char *, const char *,
+ const int *, const int *, const float *, const float *, const int *,
+ const float *, const int *, const float *, float *, const int *);
+
+static void RELAPACK_cgemmt_rec2(const char *, const char *, const char *,
+ const int *, const int *, const float *, const float *, const int *,
+ const float *, const int *, const float *, float *, const int *);
+
+
+/** CGEMMT computes a matrix-matrix product with general matrices but updates
+ * only the upper or lower triangular part of the result matrix.
+ *
+ * This routine performs the same operation as the BLAS routine
+ * cgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC)
+ * but only updates the triangular part of C specified by uplo:
+ * If (*uplo == 'L'), only the lower triangular part of C is updated,
+ * otherwise the upper triangular part is updated.
+ * */
+void RELAPACK_cgemmt(
+ const char *uplo, const char *transA, const char *transB,
+ const int *n, const int *k,
+ const float *alpha, const float *A, const int *ldA,
+ const float *B, const int *ldB,
+ const float *beta, float *C, const int *ldC
+) {
+
+#if HAVE_XGEMMT
+ // Defer to the BLAS library's native gemmt if it provides one
+ BLAS(cgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
+ return;
+#else
+
+ // Check arguments
+ const int lower = LAPACK(lsame)(uplo, "L");
+ const int upper = LAPACK(lsame)(uplo, "U");
+ const int notransA = LAPACK(lsame)(transA, "N");
+ const int tranA = LAPACK(lsame)(transA, "T");
+ const int ctransA = LAPACK(lsame)(transA, "C");
+ const int notransB = LAPACK(lsame)(transB, "N");
+ const int tranB = LAPACK(lsame)(transB, "T");
+ const int ctransB = LAPACK(lsame)(transB, "C");
+ int info = 0;
+ if (!lower && !upper)
+ info = 1;
+ else if (!tranA && !ctransA && !notransA)
+ info = 2;
+ else if (!tranB && !ctransB && !notransB)
+ info = 3;
+ else if (*n < 0)
+ info = 4;
+ else if (*k < 0)
+ info = 5;
+ else if (*ldA < MAX(1, notransA ? *n : *k))
+ info = 8;
+ else if (*ldB < MAX(1, notransB ? *k : *n))
+ info = 10;
+ else if (*ldC < MAX(1, *n))
+ info = 13;
+ if (info) {
+ LAPACK(xerbla)("CGEMMT", &info);
+ return;
+ }
+
+ // Clean char * arguments: normalize to single upper-case characters so
+ // the recursive kernel can compare with == instead of lsame
+ const char cleanuplo = lower ? 'L' : 'U';
+ const char cleantransA = notransA ? 'N' : (tranA ? 'T' : 'C');
+ const char cleantransB = notransB ? 'N' : (tranB ? 'T' : 'C');
+
+ // Recursive kernel
+ RELAPACK_cgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
+#endif
+}
+
+
+/** cgemmt's recursive compute kernel */
+/** cgemmt's recursive compute kernel: split C into quadrants, recurse on the
+ * two diagonal blocks, and compute the single off-diagonal block that lies in
+ * the requested triangle with one full cgemm. */
+static void RELAPACK_cgemmt_rec(
+ const char *uplo, const char *transA, const char *transB,
+ const int *n, const int *k,
+ const float *alpha, const float *A, const int *ldA,
+ const float *B, const int *ldB,
+ const float *beta, float *C, const int *ldC
+) {
+
+ if (*n <= MAX(CROSSOVER_CGEMMT, 1)) {
+ // Unblocked
+ RELAPACK_cgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
+ return;
+ }
+
+ // Splitting
+ const int n1 = CREC_SPLIT(*n);
+ const int n2 = *n - n1;
+
+ // A_T
+ // A_B
+ // (factor 2 = interleaved re/im complex storage)
+ const float *const A_T = A;
+ const float *const A_B = A + 2 * ((*transA == 'N') ? n1 : *ldA * n1);
+
+ // B_L B_R
+ const float *const B_L = B;
+ const float *const B_R = B + 2 * ((*transB == 'N') ? *ldB * n1 : n1);
+
+ // C_TL C_TR
+ // C_BL C_BR
+ float *const C_TL = C;
+ float *const C_TR = C + 2 * *ldC * n1;
+ float *const C_BL = C + 2 * n1;
+ float *const C_BR = C + 2 * *ldC * n1 + 2 * n1;
+
+ // recursion(C_TL)
+ RELAPACK_cgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC);
+
+ if (*uplo == 'L')
+ // C_BL = alpha A_B B_L + beta C_BL
+ BLAS(cgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC);
+ else
+ // C_TR = alpha A_T B_R + beta C_TR
+ BLAS(cgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC);
+
+ // recursion(C_BR)
+ RELAPACK_cgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC);
+}
+
+
+/** cgemmt's unblocked compute kernel */
+/** cgemmt's unblocked compute kernel: compute C one column at a time, using a
+ * cgemv per column restricted to the part of the column inside the requested
+ * triangle. */
+static void RELAPACK_cgemmt_rec2(
+ const char *uplo, const char *transA, const char *transB,
+ const int *n, const int *k,
+ const float *alpha, const float *A, const int *ldA,
+ const float *B, const int *ldB,
+ const float *beta, float *C, const int *ldC
+) {
+
+ // stride of the vector B_i within B (row of B if transB, else column)
+ const int incB = (*transB == 'N') ? 1 : *ldB;
+ const int incC = 1;
+
+ int i;
+ for (i = 0; i < *n; i++) {
+ // A_0
+ // A_i
+ const float *const A_0 = A;
+ const float *const A_i = A + 2 * ((*transA == 'N') ? i : *ldA * i);
+
+ // * B_i *
+ const float *const B_i = B + 2 * ((*transB == 'N') ? *ldB * i : i);
+
+ // * C_0i *
+ // * C_ii *
+ float *const C_0i = C + 2 * *ldC * i;
+ float *const C_ii = C + 2 * *ldC * i + 2 * i;
+
+ if (*uplo == 'L') {
+ // lower: update rows i..n-1 of column i
+ const int nmi = *n - i;
+ if (*transA == 'N')
+ BLAS(cgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
+ else
+ BLAS(cgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
+ } else {
+ // upper: update rows 0..i of column i
+ const int ip1 = i + 1;
+ if (*transA == 'N')
+ BLAS(cgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
+ else
+ BLAS(cgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
+ }
+ }
+}
diff --git a/relapack/src/cgetrf.c b/relapack/src/cgetrf.c
new file mode 100644
index 000000000..b31a711d0
--- /dev/null
+++ b/relapack/src/cgetrf.c
@@ -0,0 +1,117 @@
+#include "relapack.h"
+
+static void RELAPACK_cgetrf_rec(const int *, const int *, float *,
+ const int *, int *, int *);
+
+
+/** CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
+ *
+ * This routine is functionally equivalent to LAPACK's cgetrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d9/dfb/cgetrf_8f.html
+ */
+/** CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
+ *
+ * This routine is functionally equivalent to LAPACK's cgetrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d9/dfb/cgetrf_8f.html
+ */
+void RELAPACK_cgetrf(
+ const int *m, const int *n,
+ float *A, const int *ldA, int *ipiv,
+ int *info
+) {
+
+ // Check arguments
+ *info = 0;
+ if (*m < 0)
+ *info = -1;
+ else if (*n < 0)
+ *info = -2;
+ // FIX: LAPACK requires LDA >= max(1,M) (rows, not columns); the old
+ // check against *n mis-handled non-square inputs
+ else if (*ldA < MAX(1, *m))
+ *info = -4;
+ if (*info) {
+ const int minfo = -*info;
+ LAPACK(xerbla)("CGETRF", &minfo);
+ return;
+ }
+
+ // Factor the leading square part; any extra columns are handled below
+ const int sn = MIN(*m, *n);
+
+ RELAPACK_cgetrf_rec(m, &sn, A, ldA, ipiv, info);
+
+ // Right remainder (n > m): pivot and triangular-solve the extra columns
+ if (*m < *n) {
+ // Constants
+ const float ONE[] = { 1., 0. };
+ const int iONE[] = { 1 };
+
+ // Splitting
+ const int rn = *n - *m;
+
+ // A_L A_R
+ const float *const A_L = A;
+ float *const A_R = A + 2 * *ldA * *m;
+
+ // A_R = apply(ipiv, A_R)
+ LAPACK(claswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE);
+ // A_R = A_L \ A_R
+ BLAS(ctrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA);
+ }
+}
+
+
+/** cgetrf's recursive compute kernel */
+/** cgetrf's recursive compute kernel: factor the left panel, apply its pivots
+ * and triangular solve to the right panel, update the trailing block, recurse,
+ * then back-apply the second set of pivots to the left panel. */
+static void RELAPACK_cgetrf_rec(
+ const int *m, const int *n,
+ float *A, const int *ldA, int *ipiv,
+ int *info
+) {
+
+ if (*n <= MAX(CROSSOVER_CGETRF, 1)) {
+ // Unblocked
+ LAPACK(cgetf2)(m, n, A, ldA, ipiv, info);
+ return;
+ }
+
+ // Constants
+ const float ONE[] = { 1., 0. };
+ const float MONE[] = { -1., 0. };
+ const int iONE[] = { 1 };
+
+ // Splitting
+ const int n1 = CREC_SPLIT(*n);
+ const int n2 = *n - n1;
+ const int m2 = *m - n1;
+
+ // A_L A_R
+ // (factor 2 = interleaved re/im complex storage)
+ float *const A_L = A;
+ float *const A_R = A + 2 * *ldA * n1;
+
+ // A_TL A_TR
+ // A_BL A_BR
+ float *const A_TL = A;
+ float *const A_TR = A + 2 * *ldA * n1;
+ float *const A_BL = A + 2 * n1;
+ float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
+
+ // ipiv_T
+ // ipiv_B
+ int *const ipiv_T = ipiv;
+ int *const ipiv_B = ipiv + n1;
+
+ // recursion(A_L, ipiv_T)
+ RELAPACK_cgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info);
+ // apply pivots to A_R
+ LAPACK(claswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE);
+
+ // A_TR = A_TL \ A_TR
+ BLAS(ctrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
+ // A_BR = A_BR - A_BL * A_TR
+ BLAS(cgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA);
+
+ // recursion(A_BR, ipiv_B)
+ RELAPACK_cgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info);
+ if (*info)
+ *info += n1;
+ // apply pivots to A_BL
+ LAPACK(claswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE);
+ // shift pivots (they were local to the bottom-right block)
+ int i;
+ for (i = 0; i < n2; i++)
+ ipiv_B[i] += n1;
+}
diff --git a/relapack/src/chegst.c b/relapack/src/chegst.c
new file mode 100644
index 000000000..dff875017
--- /dev/null
+++ b/relapack/src/chegst.c
@@ -0,0 +1,212 @@
+#include "relapack.h"
+#if XSYGST_ALLOW_MALLOC
+#include "stdlib.h"
+#endif
+
+static void RELAPACK_chegst_rec(const int *, const char *, const int *,
+ float *, const int *, const float *, const int *,
+ float *, const int *, int *);
+
+
+/** CHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form.
+ *
+ * This routine is functionally equivalent to LAPACK's chegst.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d7/d2a/chegst_8f.html
+ * */
+void RELAPACK_chegst(
+ const int *itype, const char *uplo, const int *n,
+ float *A, const int *ldA, const float *B, const int *ldB,
+ int *info
+) {
+
+ // Check arguments
+ const int lower = LAPACK(lsame)(uplo, "L");
+ const int upper = LAPACK(lsame)(uplo, "U");
+ *info = 0;
+ if (*itype < 1 || *itype > 3)
+ *info = -1;
+ else if (!lower && !upper)
+ *info = -2;
+ else if (*n < 0)
+ *info = -3;
+ else if (*ldA < MAX(1, *n))
+ *info = -5;
+ else if (*ldB < MAX(1, *n))
+ *info = -7;
+ if (*info) {
+ const int minfo = -*info;
+ LAPACK(xerbla)("CHEGST", &minfo);
+ return;
+ }
+
+ // Clean char * arguments (single upper-case char, compared with ==)
+ const char cleanuplo = lower ? 'L' : 'U';
+
+ // Allocate work space: optional n1 x (n-n1) buffer that lets the kernel
+ // fuse two hemm updates; the kernel falls back to two hemms if lWork == 0
+ float *Work = NULL;
+ int lWork = 0;
+#if XSYGST_ALLOW_MALLOC
+ const int n1 = CREC_SPLIT(*n);
+ lWork = n1 * (*n - n1);
+ Work = malloc(lWork * 2 * sizeof(float));
+ if (!Work)
+ lWork = 0;
+#endif
+
+ // recursive kernel
+ RELAPACK_chegst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info);
+
+ // Free work space
+#if XSYGST_ALLOW_MALLOC
+ if (Work)
+ free(Work);
+#endif
+}
+
+
+/** chegst's recursive compute kernel */
+/** chegst's recursive compute kernel.
+ *
+ * Splits A and B into 2x2 quadrants, recurses on the top-left, transforms the
+ * off-diagonal block and the bottom-right quadrant, then recurses on the
+ * bottom-right. When Work is large enough, the -1/2 (or +1/2) hemm update is
+ * computed once into Work and added twice (before and after the her2k);
+ * otherwise the hemm is simply performed twice. */
+static void RELAPACK_chegst_rec(
+ const int *itype, const char *uplo, const int *n,
+ float *A, const int *ldA, const float *B, const int *ldB,
+ float *Work, const int *lWork, int *info
+) {
+
+ if (*n <= MAX(CROSSOVER_CHEGST, 1)) {
+ // Unblocked
+ LAPACK(chegs2)(itype, uplo, n, A, ldA, B, ldB, info);
+ return;
+ }
+
+ // Constants
+ const float ZERO[] = { 0., 0. };
+ const float ONE[] = { 1., 0. };
+ const float MONE[] = { -1., 0. };
+ const float HALF[] = { .5, 0. };
+ const float MHALF[] = { -.5, 0. };
+ const int iONE[] = { 1 };
+
+ // Loop iterator
+ int i;
+
+ // Splitting
+ const int n1 = CREC_SPLIT(*n);
+ const int n2 = *n - n1;
+
+ // A_TL A_TR
+ // A_BL A_BR
+ // (factor 2 = interleaved re/im complex storage)
+ float *const A_TL = A;
+ float *const A_TR = A + 2 * *ldA * n1;
+ float *const A_BL = A + 2 * n1;
+ float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
+
+ // B_TL B_TR
+ // B_BL B_BR
+ const float *const B_TL = B;
+ const float *const B_TR = B + 2 * *ldB * n1;
+ const float *const B_BL = B + 2 * n1;
+ const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
+
+ // recursion(A_TL, B_TL)
+ RELAPACK_chegst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info);
+
+ if (*itype == 1)
+ if (*uplo == 'L') {
+ // A_BL = A_BL / B_TL'
+ BLAS(ctrsm)("R", "L", "C", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
+ if (*lWork > n2 * n1) {
+ // T = -1/2 * B_BL * A_TL
+ BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2);
+ // A_BL = A_BL + T
+ for (i = 0; i < n1; i++)
+ BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
+ } else
+ // A_BL = A_BL - 1/2 B_BL * A_TL
+ BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
+ // A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL'
+ BLAS(cher2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA);
+ if (*lWork > n2 * n1)
+ // A_BL = A_BL + T
+ for (i = 0; i < n1; i++)
+ BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
+ else
+ // A_BL = A_BL - 1/2 B_BL * A_TL
+ BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
+ // A_BL = B_BR \ A_BL
+ BLAS(ctrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
+ } else {
+ // A_TR = B_TL' \ A_TR
+ BLAS(ctrsm)("L", "U", "C", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
+ if (*lWork > n2 * n1) {
+ // T = -1/2 * A_TL * B_TR
+ BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1);
+ // A_TR = A_TR + T
+ for (i = 0; i < n2; i++)
+ BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
+ } else
+ // A_TR = A_TR - 1/2 A_TL * B_TR
+ BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
+ // A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR
+ BLAS(cher2k)("U", "C", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA);
+ if (*lWork > n2 * n1)
+ // A_TR = A_TR + T
+ for (i = 0; i < n2; i++)
+ BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
+ else
+ // A_TR = A_TR - 1/2 A_TL * B_TR
+ BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
+ // A_TR = A_TR / B_BR
+ BLAS(ctrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
+ }
+ else
+ if (*uplo == 'L') {
+ // A_BL = A_BL * B_TL
+ BLAS(ctrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
+ if (*lWork > n2 * n1) {
+ // T = 1/2 * A_BR * B_BL
+ BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2);
+ // A_BL = A_BL + T
+ for (i = 0; i < n1; i++)
+ BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
+ } else
+ // A_BL = A_BL + 1/2 A_BR * B_BL
+ BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
+ // A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL
+ BLAS(cher2k)("L", "C", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA);
+ if (*lWork > n2 * n1)
+ // A_BL = A_BL + T
+ for (i = 0; i < n1; i++)
+ BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
+ else
+ // A_BL = A_BL + 1/2 A_BR * B_BL
+ BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
+ // A_BL = B_BR' * A_BL
+ BLAS(ctrmm)("L", "L", "C", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
+ } else {
+ // A_TR = B_TL * A_TR
+ BLAS(ctrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
+ if (*lWork > n2 * n1) {
+ // T = 1/2 * B_TR * A_BR
+ BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1);
+ // A_TR = A_TR + T
+ for (i = 0; i < n2; i++)
+ BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
+ } else
+ // A_TR = A_TR + 1/2 B_TR A_BR
+ BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
+ // A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR'
+ BLAS(cher2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA);
+ if (*lWork > n2 * n1)
+ // A_TR = A_TR + T
+ for (i = 0; i < n2; i++)
+ BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
+ else
+ // A_TR = A_TR + 1/2 B_TR * A_BR
+ BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
+ // A_TR = A_TR * B_BR'
+ BLAS(ctrmm)("R", "U", "C", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
+ }
+
+ // recursion(A_BR, B_BR)
+ RELAPACK_chegst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info);
+}
diff --git a/relapack/src/chetrf.c b/relapack/src/chetrf.c
new file mode 100644
index 000000000..2928235e4
--- /dev/null
+++ b/relapack/src/chetrf.c
@@ -0,0 +1,236 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include <stdlib.h>
+#endif
+
+static void RELAPACK_chetrf_rec(const char *, const int *, const int *, int *,
+ float *, const int *, int *, float *, const int *, int *);
+
+
+/** CHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's chetrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/da/dc1/chetrf_8f.html
+ * */
+void RELAPACK_chetrf(
+ const char *uplo, const int *n,
+ float *A, const int *ldA, int *ipiv,
+ float *Work, const int *lWork, int *info
+) {
+
+ // Required work size (n * n/2 complex entries)
+ const int cleanlWork = *n * (*n / 2);
+ int minlWork = cleanlWork;
+#if XSYTRF_ALLOW_MALLOC
+ // with malloc allowed, any caller-provided size is acceptable —
+ // the shortfall is made up by an internal allocation below
+ minlWork = 1;
+#endif
+
+ // Check arguments
+ const int lower = LAPACK(lsame)(uplo, "L");
+ const int upper = LAPACK(lsame)(uplo, "U");
+ *info = 0;
+ if (!lower && !upper)
+ *info = -1;
+ else if (*n < 0)
+ *info = -2;
+ else if (*ldA < MAX(1, *n))
+ *info = -4;
+ else if (*lWork < minlWork && *lWork != -1)
+ *info = -7;
+ else if (*lWork == -1) {
+ // Work size query: report required size in Work[0] (real part)
+ *Work = cleanlWork;
+ return;
+ }
+
+ // Ensure Work size
+ float *cleanWork = Work;
+#if XSYTRF_ALLOW_MALLOC
+ if (!*info && *lWork < cleanlWork) {
+ cleanWork = malloc(cleanlWork * 2 * sizeof(float));
+ if (!cleanWork)
+ *info = -7;
+ }
+#endif
+
+ if (*info) {
+ const int minfo = -*info;
+ LAPACK(xerbla)("CHETRF", &minfo);
+ return;
+ }
+
+ // Clean char * arguments (single upper-case char, compared with ==)
+ const char cleanuplo = lower ? 'L' : 'U';
+
+ // Dummy argument (number of factored columns; unused at top level)
+ int nout;
+
+ // Recursive kernel
+ RELAPACK_chetrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
+
+#if XSYTRF_ALLOW_MALLOC
+ if (cleanWork != Work)
+ free(cleanWork);
+#endif
+}
+
+
+/** chetrf's recursive compute kernel */
+/** chetrf's recursive compute kernel.
+ *
+ * Bunch-Kaufman pivoting may factor one column fewer than requested (when a
+ * 2x2 pivot straddles the split), so each recursion reports the number of
+ * columns actually factored via n_out, and the caller re-derives its split
+ * and, if needed, undoes one column of the trailing update. n_full tracks
+ * the height of the panel including rows outside the current diagonal block. */
+static void RELAPACK_chetrf_rec(
+ const char *uplo, const int *n_full, const int *n, int *n_out,
+ float *A, const int *ldA, int *ipiv,
+ float *Work, const int *ldWork, int *info
+) {
+
+ // top recursion level?
+ const int top = *n_full == *n;
+
+ if (*n <= MAX(CROSSOVER_CHETRF, 3)) {
+ // Unblocked
+ if (top) {
+ LAPACK(chetf2)(uplo, n, A, ldA, ipiv, info);
+ *n_out = *n;
+ } else
+ RELAPACK_chetrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
+ return;
+ }
+
+ int info1, info2;
+
+ // Constants
+ const float ONE[] = { 1., 0. };
+ const float MONE[] = { -1., 0. };
+ const int iONE[] = { 1 };
+
+ // rows of the panel that lie outside the current diagonal block
+ const int n_rest = *n_full - *n;
+
+ if (*uplo == 'L') {
+ // Splitting (setup)
+ int n1 = CREC_SPLIT(*n);
+ int n2 = *n - n1;
+
+ // Work_L *
+ float *const Work_L = Work;
+
+ // recursion(A_L)
+ int n1_out;
+ RELAPACK_chetrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
+ n1 = n1_out;
+
+ // Splitting (continued): n1 may have shrunk by one column
+ n2 = *n - n1;
+ const int n_full2 = *n_full - n1;
+
+ // * *
+ // A_BL A_BR
+ // A_BL_B A_BR_B
+ // (factor 2 = interleaved re/im complex storage)
+ float *const A_BL = A + 2 * n1;
+ float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
+ float *const A_BL_B = A + 2 * *n;
+ float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
+
+ // * *
+ // Work_BL Work_BR
+ // * *
+ // (top recursion level: use Work as Work_BR)
+ float *const Work_BL = Work + 2 * n1;
+ float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
+ const int ldWork_BR = top ? n2 : *ldWork;
+
+ // ipiv_T
+ // ipiv_B
+ int *const ipiv_B = ipiv + n1;
+
+ // A_BR = A_BR - A_BL Work_BL'
+ RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
+ BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
+
+ // recursion(A_BR)
+ int n2_out;
+ RELAPACK_chetrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
+
+ if (n2_out != n2) {
+ // undo 1 column of updates (the unfactored column will be
+ // re-updated in the parent's trailing update)
+ const int n_restp1 = n_rest + 1;
+
+ // last column of A_BR
+ float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
+
+ // last row of A_BL
+ float *const A_BL_b = A_BL + 2 * n2_out;
+
+ // last row of Work_BL
+ float *const Work_BL_b = Work_BL + 2 * n2_out;
+
+ // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
+ BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
+ }
+ n2 = n2_out;
+
+ // shift pivots (negative entries encode 2x2 pivots and keep their sign)
+ int i;
+ for (i = 0; i < n2; i++)
+ if (ipiv_B[i] > 0)
+ ipiv_B[i] += n1;
+ else
+ ipiv_B[i] -= n1;
+
+ *info = info1 || info2;
+ *n_out = n1 + n2;
+ } else {
+ // Splitting (setup)
+ int n2 = CREC_SPLIT(*n);
+ int n1 = *n - n2;
+
+ // * Work_R
+ // (top recursion level: use Work as Work_R)
+ float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
+
+ // recursion(A_R)
+ int n2_out;
+ RELAPACK_chetrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
+ const int n2_diff = n2 - n2_out;
+ n2 = n2_out;
+
+ // Splitting (continued): n2 may have shrunk by one column
+ n1 = *n - n2;
+ const int n_full1 = *n_full - n2;
+
+ // * A_TL_T A_TR_T
+ // * A_TL A_TR
+ // * * *
+ float *const A_TL_T = A + 2 * *ldA * n_rest;
+ float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
+ float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
+ float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
+
+ // Work_L *
+ // * Work_TR
+ // * *
+ // (top recursion level: Work_R was Work)
+ float *const Work_L = Work;
+ float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
+ const int ldWork_L = top ? n1 : *ldWork;
+
+ // A_TL = A_TL - A_TR Work_TR'
+ RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
+ BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
+
+ // recursion(A_TL)
+ int n1_out;
+ RELAPACK_chetrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
+
+ if (n1_out != n1) {
+ // undo 1 column of updates
+ const int n_restp1 = n_rest + 1;
+
+ // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
+ BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
+ }
+ n1 = n1_out;
+
+ *info = info2 || info1;
+ *n_out = n1 + n2;
+ }
+}
diff --git a/relapack/src/chetrf_rec2.c b/relapack/src/chetrf_rec2.c
new file mode 100644
index 000000000..b5c8341b6
--- /dev/null
+++ b/relapack/src/chetrf_rec2.c
@@ -0,0 +1,520 @@
+/* -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static complex c_b1 = {1.f,0.f};
+static int c__1 = 1;
+
+/** CHETRF_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the Bunch-Kaufman diagonal pivoting method
+ *
+ * This routine is a minor modification of LAPACK's clahef.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
+ * */
+/* Subroutine */ void RELAPACK_chetrf_rec2(char *uplo, int *n, int *
+	nb, int *kb, complex *a, int *lda, int *ipiv, complex *w,
+	int *ldw, int *info, ftnlen uplo_len)
+{
+    /* System generated locals */
+    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
+    float r__1, r__2, r__3, r__4;
+    complex q__1, q__2, q__3, q__4;
+
+    /* Builtin functions */
+    double sqrt(double), r_imag(complex *);
+    void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *);
+
+    /* Local variables */
+    static int j, k;
+    static float t, r1;
+    static complex d11, d21, d22;
+    static int jj, kk, jp, kp, kw, kkw, imax, jmax;
+    static float alpha;
+    extern logical lsame_(char *, char *, ftnlen, ftnlen);
+    extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
+	    , complex *, int *, complex *, int *, complex *, complex *
+	    , int *, ftnlen), ccopy_(int *, complex *, int *,
+	    complex *, int *), cswap_(int *, complex *, int *,
+	    complex *, int *);
+    static int kstep;
+    static float absakk;
+    extern /* Subroutine */ int clacgv_(int *, complex *, int *);
+    extern int icamax_(int *, complex *, int *);
+    extern /* Subroutine */ int csscal_(int *, float *, complex *, int
+	    *);
+    static float colmax, rowmax;
+
+    /* Parameter adjustments: shift pointers for 1-based Fortran indexing */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --ipiv;
+    w_dim1 = *ldw;
+    w_offset = 1 + w_dim1;
+    w -= w_offset;
+
+    /* Function Body: upper case factors the LAST *nb columns, lower case the FIRST *nb; alpha is the Bunch-Kaufman pivot threshold */
+    *info = 0;
+    alpha = (sqrt(17.f) + 1.f) / 8.f;
+    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	k = *n;
+L10:
+	kw = *nb + k - *n;
+	if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
+	    goto L30;
+	}
+	kstep = 1;
+	i__1 = k - 1;
+	ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
+	i__1 = k + kw * w_dim1;
+	i__2 = k + k * a_dim1;
+	r__1 = a[i__2].r;
+	w[i__1].r = r__1, w[i__1].i = 0.f;
+	if (k < *n) {
+	    i__1 = *n - k;
+	    q__1.r = -1.f, q__1.i = -0.f;
+	    cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
+		    lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
+		    w_dim1 + 1], &c__1, (ftnlen)12);
+	    i__1 = k + kw * w_dim1;
+	    i__2 = k + kw * w_dim1;
+	    r__1 = w[i__2].r;
+	    w[i__1].r = r__1, w[i__1].i = 0.f;
+	}
+	i__1 = k + kw * w_dim1;
+	absakk = (r__1 = w[i__1].r, dabs(r__1));
+	if (k > 1) {
+	    i__1 = k - 1;
+	    imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+	    i__1 = imax + kw * w_dim1;
+	    colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+		    + kw * w_dim1]), dabs(r__2));
+	} else {
+	    colmax = 0.f;
+	}
+	if (dmax(absakk,colmax) == 0.f) {
+	    if (*info == 0) {
+		*info = k;
+	    }
+	    kp = k;
+	    i__1 = k + k * a_dim1;
+	    i__2 = k + k * a_dim1;
+	    r__1 = a[i__2].r;
+	    a[i__1].r = r__1, a[i__1].i = 0.f;
+	} else {
+	    if (absakk >= alpha * colmax) {
+		kp = k;
+	    } else {
+		i__1 = imax - 1;
+		ccopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
+			w_dim1 + 1], &c__1);
+		i__1 = imax + (kw - 1) * w_dim1;
+		i__2 = imax + imax * a_dim1;
+		r__1 = a[i__2].r;
+		w[i__1].r = r__1, w[i__1].i = 0.f;
+		i__1 = k - imax;
+		ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
+			1 + (kw - 1) * w_dim1], &c__1);
+		i__1 = k - imax;
+		clacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1);
+		if (k < *n) {
+		    i__1 = *n - k;
+		    q__1.r = -1.f, q__1.i = -0.f;
+		    cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
+			    a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
+			    ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
+			    ftnlen)12);
+		    i__1 = imax + (kw - 1) * w_dim1;
+		    i__2 = imax + (kw - 1) * w_dim1;
+		    r__1 = w[i__2].r;
+		    w[i__1].r = r__1, w[i__1].i = 0.f;
+		}
+		i__1 = k - imax;
+		jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
+			 &c__1);
+		i__1 = jmax + (kw - 1) * w_dim1;
+		rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
+			jmax + (kw - 1) * w_dim1]), dabs(r__2));
+		if (imax > 1) {
+		    i__1 = imax - 1;
+		    jmax = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
+/* Computing MAX */
+		    i__1 = jmax + (kw - 1) * w_dim1;
+		    r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
+			    r__2 = r_imag(&w[jmax + (kw - 1) * w_dim1]), dabs(
+			    r__2));
+		    rowmax = dmax(r__3,r__4);
+		}
+		if (absakk >= alpha * colmax * (colmax / rowmax)) {
+		    kp = k;
+		} else /* second pivot test: |W(imax,kw-1)| vs alpha*rowmax */ {
+		    i__1 = imax + (kw - 1) * w_dim1;
+		    if ((r__1 = w[i__1].r, dabs(r__1)) >= alpha * rowmax) {
+			kp = imax;
+			ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
+				w_dim1 + 1], &c__1);
+		    } else {
+			kp = imax;
+			kstep = 2;
+		    }
+		}
+	    }
+	    kk = k - kstep + 1;
+	    kkw = *nb + kk - *n;
+	    if (kp != kk) {
+		i__1 = kp + kp * a_dim1;
+		i__2 = kk + kk * a_dim1;
+		r__1 = a[i__2].r;
+		a[i__1].r = r__1, a[i__1].i = 0.f;
+		i__1 = kk - 1 - kp;
+		ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
+			1) * a_dim1], lda);
+		i__1 = kk - 1 - kp;
+		clacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda);
+		if (kp > 1) {
+		    i__1 = kp - 1;
+		    ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
+			    + 1], &c__1);
+		}
+		if (k < *n) {
+		    i__1 = *n - k;
+		    cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
+			    + 1) * a_dim1], lda);
+		}
+		i__1 = *n - kk + 1;
+		cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
+			w_dim1], ldw);
+	    }
+	    if (kstep == 1) {
+		ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
+			c__1);
+		if (k > 1) {
+		    i__1 = k + k * a_dim1;
+		    r1 = 1.f / a[i__1].r;
+		    i__1 = k - 1;
+		    csscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
+		    i__1 = k - 1;
+		    clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+		}
+	    } else {
+		if (k > 2) {
+		    i__1 = k - 1 + kw * w_dim1;
+		    d21.r = w[i__1].r, d21.i = w[i__1].i;
+		    r_cnjg(&q__2, &d21);
+		    c_div(&q__1, &w[k + kw * w_dim1], &q__2);
+		    d11.r = q__1.r, d11.i = q__1.i;
+		    c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
+		    d22.r = q__1.r, d22.i = q__1.i;
+		    q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r *
+			    d22.i + d11.i * d22.r;
+		    t = 1.f / (q__1.r - 1.f);
+		    q__2.r = t, q__2.i = 0.f;
+		    c_div(&q__1, &q__2, &d21);
+		    d21.r = q__1.r, d21.i = q__1.i;
+		    i__1 = k - 2;
+		    for (j = 1; j <= i__1; ++j) {
+			i__2 = j + (k - 1) * a_dim1;
+			i__3 = j + (kw - 1) * w_dim1;
+			q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
+				q__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
+				.r;
+			i__4 = j + kw * w_dim1;
+			q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
+				.i;
+			q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
+				d21.r * q__2.i + d21.i * q__2.r;
+			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+			i__2 = j + k * a_dim1;
+			r_cnjg(&q__2, &d21);
+			i__3 = j + kw * w_dim1;
+			q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
+				q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
+				.r;
+			i__4 = j + (kw - 1) * w_dim1;
+			q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
+				.i;
+			q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i =
+				q__2.r * q__3.i + q__2.i * q__3.r;
+			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+/* L20: */
+		    }
+		}
+		i__1 = k - 1 + (k - 1) * a_dim1;
+		i__2 = k - 1 + (kw - 1) * w_dim1;
+		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+		i__1 = k - 1 + k * a_dim1;
+		i__2 = k - 1 + kw * w_dim1;
+		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+		i__1 = k + k * a_dim1;
+		i__2 = k + kw * w_dim1;
+		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+		i__1 = k - 1;
+		clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+		i__1 = k - 2;
+		clacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
+	    }
+	}
+	if (kstep == 1) {
+	    ipiv[k] = kp;
+	} else {
+	    ipiv[k] = -kp;
+	    ipiv[k - 1] = -kp;
+	}
+	k -= kstep;
+	goto L10;
+L30:
+	j = k + 1;
+L60:
+	jj = j;
+	jp = ipiv[j];
+	if (jp < 0) {
+	    jp = -jp;
+	    ++j;
+	}
+	++j;
+	if (jp != jj && j <= *n) {
+	    i__1 = *n - j + 1;
+	    cswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
+	}
+	if (j <= *n) {
+	    goto L60;
+	}
+	*kb = *n - k;
+    } else {
+	k = 1;
+L70:
+	if ((k >= *nb && *nb < *n) || k > *n) {
+	    goto L90;
+	}
+	kstep = 1;
+	i__1 = k + k * w_dim1;
+	i__2 = k + k * a_dim1;
+	r__1 = a[i__2].r;
+	w[i__1].r = r__1, w[i__1].i = 0.f;
+	if (k < *n) {
+	    i__1 = *n - k;
+	    ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k *
+		    w_dim1], &c__1);
+	}
+	i__1 = *n - k + 1;
+	i__2 = k - 1;
+	q__1.r = -1.f, q__1.i = -0.f;
+	cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &w[k
+		+ w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12);
+	i__1 = k + k * w_dim1;
+	i__2 = k + k * w_dim1;
+	r__1 = w[i__2].r;
+	w[i__1].r = r__1, w[i__1].i = 0.f;
+	i__1 = k + k * w_dim1;
+	absakk = (r__1 = w[i__1].r, dabs(r__1));
+	if (k < *n) {
+	    i__1 = *n - k;
+	    imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
+	    i__1 = imax + k * w_dim1;
+	    colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+		    + k * w_dim1]), dabs(r__2));
+	} else {
+	    colmax = 0.f;
+	}
+	if (dmax(absakk,colmax) == 0.f) {
+	    if (*info == 0) {
+		*info = k;
+	    }
+	    kp = k;
+	    i__1 = k + k * a_dim1;
+	    i__2 = k + k * a_dim1;
+	    r__1 = a[i__2].r;
+	    a[i__1].r = r__1, a[i__1].i = 0.f;
+	} else {
+	    if (absakk >= alpha * colmax) {
+		kp = k;
+	    } else {
+		i__1 = imax - k;
+		ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
+			w_dim1], &c__1);
+		i__1 = imax - k;
+		clacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1);
+		i__1 = imax + (k + 1) * w_dim1;
+		i__2 = imax + imax * a_dim1;
+		r__1 = a[i__2].r;
+		w[i__1].r = r__1, w[i__1].i = 0.f;
+		if (imax < *n) {
+		    i__1 = *n - imax;
+		    ccopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[
+			    imax + 1 + (k + 1) * w_dim1], &c__1);
+		}
+		i__1 = *n - k + 1;
+		i__2 = k - 1;
+		q__1.r = -1.f, q__1.i = -0.f;
+		cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1],
+			lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) *
+			w_dim1], &c__1, (ftnlen)12);
+		i__1 = imax + (k + 1) * w_dim1;
+		i__2 = imax + (k + 1) * w_dim1;
+		r__1 = w[i__2].r;
+		w[i__1].r = r__1, w[i__1].i = 0.f;
+		i__1 = imax - k;
+		jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
+			;
+		i__1 = jmax + (k + 1) * w_dim1;
+		rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
+			jmax + (k + 1) * w_dim1]), dabs(r__2));
+		if (imax < *n) {
+		    i__1 = *n - imax;
+		    jmax = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) *
+			    w_dim1], &c__1);
+/* Computing MAX */
+		    i__1 = jmax + (k + 1) * w_dim1;
+		    r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
+			    r__2 = r_imag(&w[jmax + (k + 1) * w_dim1]), dabs(
+			    r__2));
+		    rowmax = dmax(r__3,r__4);
+		}
+		if (absakk >= alpha * colmax * (colmax / rowmax)) {
+		    kp = k;
+		} else /* second pivot test: |W(imax,k+1)| vs alpha*rowmax */ {
+		    i__1 = imax + (k + 1) * w_dim1;
+		    if ((r__1 = w[i__1].r, dabs(r__1)) >= alpha * rowmax) {
+			kp = imax;
+			i__1 = *n - k + 1;
+			ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k +
+				k * w_dim1], &c__1);
+		    } else {
+			kp = imax;
+			kstep = 2;
+		    }
+		}
+	    }
+	    kk = k + kstep - 1;
+	    if (kp != kk) {
+		i__1 = kp + kp * a_dim1;
+		i__2 = kk + kk * a_dim1;
+		r__1 = a[i__2].r;
+		a[i__1].r = r__1, a[i__1].i = 0.f;
+		i__1 = kp - kk - 1;
+		ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
+			1) * a_dim1], lda);
+		i__1 = kp - kk - 1;
+		clacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda);
+		if (kp < *n) {
+		    i__1 = *n - kp;
+		    ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
+			    + kp * a_dim1], &c__1);
+		}
+		if (k > 1) {
+		    i__1 = k - 1;
+		    cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
+		}
+		cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
+	    }
+	    if (kstep == 1) {
+		i__1 = *n - k + 1;
+		ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
+			c__1);
+		if (k < *n) {
+		    i__1 = k + k * a_dim1;
+		    r1 = 1.f / a[i__1].r;
+		    i__1 = *n - k;
+		    csscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
+		    i__1 = *n - k;
+		    clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
+		}
+	    } else {
+		if (k < *n - 1) {
+		    i__1 = k + 1 + k * w_dim1;
+		    d21.r = w[i__1].r, d21.i = w[i__1].i;
+		    c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
+		    d11.r = q__1.r, d11.i = q__1.i;
+		    r_cnjg(&q__2, &d21);
+		    c_div(&q__1, &w[k + k * w_dim1], &q__2);
+		    d22.r = q__1.r, d22.i = q__1.i;
+		    q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r *
+			    d22.i + d11.i * d22.r;
+		    t = 1.f / (q__1.r - 1.f);
+		    q__2.r = t, q__2.i = 0.f;
+		    c_div(&q__1, &q__2, &d21);
+		    d21.r = q__1.r, d21.i = q__1.i;
+		    i__1 = *n;
+		    for (j = k + 2; j <= i__1; ++j) {
+			i__2 = j + k * a_dim1;
+			r_cnjg(&q__2, &d21);
+			i__3 = j + k * w_dim1;
+			q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
+				q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
+				.r;
+			i__4 = j + (k + 1) * w_dim1;
+			q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
+				.i;
+			q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i =
+				q__2.r * q__3.i + q__2.i * q__3.r;
+			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+			i__2 = j + (k + 1) * a_dim1;
+			i__3 = j + (k + 1) * w_dim1;
+			q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
+				q__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
+				.r;
+			i__4 = j + k * w_dim1;
+			q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
+				.i;
+			q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
+				d21.r * q__2.i + d21.i * q__2.r;
+			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+/* L80: */
+		    }
+		}
+		i__1 = k + k * a_dim1;
+		i__2 = k + k * w_dim1;
+		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+		i__1 = k + 1 + k * a_dim1;
+		i__2 = k + 1 + k * w_dim1;
+		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+		i__1 = k + 1 + (k + 1) * a_dim1;
+		i__2 = k + 1 + (k + 1) * w_dim1;
+		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+		i__1 = *n - k;
+		clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
+		i__1 = *n - k - 1;
+		clacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1);
+	    }
+	}
+	if (kstep == 1) {
+	    ipiv[k] = kp;
+	} else {
+	    ipiv[k] = -kp;
+	    ipiv[k + 1] = -kp;
+	}
+	k += kstep;
+	goto L70;
+L90:
+	j = k - 1;
+L120:
+	jj = j;
+	jp = ipiv[j];
+	if (jp < 0) {
+	    jp = -jp;
+	    --j;
+	}
+	--j;
+	if (jp != jj && j >= 1) {
+	    cswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
+	}
+	if (j >= 1) {
+	    goto L120;
+	}
+	*kb = k - 1;
+    }
+    return;
+}
diff --git a/relapack/src/chetrf_rook.c b/relapack/src/chetrf_rook.c
new file mode 100644
index 000000000..086393d57
--- /dev/null
+++ b/relapack/src/chetrf_rook.c
@@ -0,0 +1,236 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include <stdlib.h>
+#endif
+
+static void RELAPACK_chetrf_rook_rec(const char *, const int *, const int *, int *,
+ float *, const int *, int *, float *, const int *, int *);
+
+
+/** CHETRF_ROOK computes the factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's chetrf_rook.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d0/d5e/chetrf__rook_8f.html
+ * */
+void RELAPACK_chetrf_rook(
+    const char *uplo, const int *n,
+    float *A, const int *ldA, int *ipiv,
+    float *Work, const int *lWork, int *info
+) {
+
+    // Required work size: *n * (*n / 2) complex elements (1 suffices if malloc is allowed)
+    const int cleanlWork = *n * (*n / 2);
+    int minlWork = cleanlWork;
+#if XSYTRF_ALLOW_MALLOC
+    minlWork = 1;
+#endif
+
+    // Check arguments (LAPACK convention: *info = -<position of the bad argument>)
+    const int lower = LAPACK(lsame)(uplo, "L");
+    const int upper = LAPACK(lsame)(uplo, "U");
+    *info = 0;
+    if (!lower && !upper)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*ldA < MAX(1, *n))
+        *info = -4;
+    else if (*lWork < minlWork && *lWork != -1)
+        *info = -7;
+    else if (*lWork == -1) {
+        // Work size query: report the clean workspace size and return
+        *Work = cleanlWork;
+        return;
+    }
+
+    // Ensure Work size (allocate a full-size buffer if the caller's is too small)
+    float *cleanWork = Work;
+#if XSYTRF_ALLOW_MALLOC
+    if (!*info && *lWork < cleanlWork) {
+        cleanWork = malloc(cleanlWork * 2 * sizeof(float));
+        if (!cleanWork)
+            *info = -7;
+    }
+#endif
+
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("CHETRF_ROOK", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleanuplo = lower ? 'L' : 'U';
+
+    // Dummy argument (receives the number of columns factored; unused here)
+    int nout;
+
+    // Recursive kernel
+    RELAPACK_chetrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
+
+#if XSYTRF_ALLOW_MALLOC
+    if (cleanWork != Work)
+        free(cleanWork);
+#endif
+}
+
+
+/** chetrf_rook's recursive compute kernel (n_out receives the number of columns actually factored). */
+static void RELAPACK_chetrf_rook_rec(
+    const char *uplo, const int *n_full, const int *n, int *n_out,
+    float *A, const int *ldA, int *ipiv,
+    float *Work, const int *ldWork, int *info
+) {
+
+    // top recursion level? (the factored panel spans the full unreduced matrix)
+    const int top = *n_full == *n;
+
+    if (*n <= MAX(CROSSOVER_CHETRF, 3)) {
+        // Unblocked: LAPACK's chetf2 at the top level, partial-panel kernel otherwise
+        if (top) {
+            LAPACK(chetf2)(uplo, n, A, ldA, ipiv, info);
+            *n_out = *n;
+        } else
+            RELAPACK_chetrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
+        return;
+    }
+
+    int info1, info2;
+
+    // Constants: complex one, complex minus one, integer one for BLAS calls
+    const float ONE[] = { 1., 0. };
+    const float MONE[] = { -1., 0. };
+    const int iONE[] = { 1 };
+
+    const int n_rest = *n_full - *n;
+
+    if (*uplo == 'L') {
+        // Splitting (setup)
+        int n1 = CREC_SPLIT(*n);
+        int n2 = *n - n1;
+
+        // Work_L *
+        float *const Work_L = Work;
+
+        // recursion(A_L)
+        int n1_out;
+        RELAPACK_chetrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
+        n1 = n1_out;
+
+        // Splitting (continued); n1 may have shrunk by one if a 2x2 pivot straddled the split
+        n2 = *n - n1;
+        const int n_full2 = *n_full - n1;
+
+        // * *
+        // A_BL A_BR
+        // A_BL_B A_BR_B
+        float *const A_BL = A + 2 * n1;
+        float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
+        float *const A_BL_B = A + 2 * *n;
+        float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
+
+        // * *
+        // Work_BL Work_BR
+        // * *
+        // (top recursion level: use Work as Work_BR)
+        float *const Work_BL = Work + 2 * n1;
+        float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
+        const int ldWork_BR = top ? n2 : *ldWork;
+
+        // ipiv_T
+        // ipiv_B
+        int *const ipiv_B = ipiv + n1;
+
+        // A_BR = A_BR - A_BL Work_BL'
+        RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
+        BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
+
+        // recursion(A_BR)
+        int n2_out;
+        RELAPACK_chetrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
+
+        if (n2_out != n2) {
+            // undo 1 column of updates (the kernel stopped one column early)
+            const int n_restp1 = n_rest + 1;
+
+            // last column of A_BR
+            float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
+
+            // last row of A_BL
+            float *const A_BL_b = A_BL + 2 * n2_out;
+
+            // last row of Work_BL
+            float *const Work_BL_b = Work_BL + 2 * n2_out;
+
+            // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
+            BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
+        }
+        n2 = n2_out;
+
+        // shift pivots of the bottom block to full-matrix indices (sign marks 2x2 pivots)
+        int i;
+        for (i = 0; i < n2; i++)
+            if (ipiv_B[i] > 0)
+                ipiv_B[i] += n1;
+            else
+                ipiv_B[i] -= n1;
+
+        *info = info1 || info2;
+        *n_out = n1 + n2;
+    } else {
+        // Splitting (setup)
+        int n2 = CREC_SPLIT(*n);
+        int n1 = *n - n2;
+
+        // * Work_R
+        // (top recursion level: use Work as Work_R)
+        float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
+
+        // recursion(A_R)
+        int n2_out;
+        RELAPACK_chetrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
+        const int n2_diff = n2 - n2_out;
+        n2 = n2_out;
+
+        // Splitting (continued); n2 may have shrunk by one if a 2x2 pivot straddled the split
+        n1 = *n - n2;
+        const int n_full1 = *n_full - n2;
+
+        // * A_TL_T A_TR_T
+        // * A_TL A_TR
+        // * * *
+        float *const A_TL_T = A + 2 * *ldA * n_rest;
+        float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
+        float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
+        float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
+
+        // Work_L *
+        // * Work_TR
+        // * *
+        // (top recursion level: Work_R was Work)
+        float *const Work_L = Work;
+        float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
+        const int ldWork_L = top ? n1 : *ldWork;
+
+        // A_TL = A_TL - A_TR Work_TR'
+        RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
+        BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
+
+        // recursion(A_TL)
+        int n1_out;
+        RELAPACK_chetrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
+
+        if (n1_out != n1) {
+            // undo 1 column of updates (the kernel stopped one column early)
+            const int n_restp1 = n_rest + 1;
+
+            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
+            BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
+        }
+        n1 = n1_out;
+
+        *info = info2 || info1;
+        *n_out = n1 + n2;
+    }
+}
diff --git a/relapack/src/chetrf_rook_rec2.c b/relapack/src/chetrf_rook_rec2.c
new file mode 100644
index 000000000..a42cbfd44
--- /dev/null
+++ b/relapack/src/chetrf_rook_rec2.c
@@ -0,0 +1,661 @@
+/* -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static complex c_b1 = {1.f,0.f};
+static int c__1 = 1;
+
+/** CHETRF_ROOK_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method
+ *
+ * This routine is a minor modification of LAPACK's clahef_rook.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
+ * */
+/* Subroutine */ void RELAPACK_chetrf_rook_rec2(char *uplo, int *n,
+ int *nb, int *kb, complex *a, int *lda, int *ipiv,
+ complex *w, int *ldw, int *info, ftnlen uplo_len)
+{
+ /* System generated locals */
+ int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
+ float r__1, r__2;
+ complex q__1, q__2, q__3, q__4, q__5;
+
+ /* Builtin functions */
+ double sqrt(double), r_imag(complex *);
+ void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *);
+
+ /* Local variables */
+ static int j, k, p;
+ static float t, r1;
+ static complex d11, d21, d22;
+ static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
+ static logical done;
+ static int imax, jmax;
+ static float alpha;
+ extern logical lsame_(char *, char *, ftnlen, ftnlen);
+ extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
+ , complex *, int *, complex *, int *, complex *, complex *
+ , int *, ftnlen);
+ static float sfmin;
+ extern /* Subroutine */ int ccopy_(int *, complex *, int *,
+ complex *, int *);
+ static int itemp;
+ extern /* Subroutine */ int cswap_(int *, complex *, int *,
+ complex *, int *);
+ static int kstep;
+ static float stemp, absakk;
+ extern /* Subroutine */ int clacgv_(int *, complex *, int *);
+ extern int icamax_(int *, complex *, int *);
+ extern double slamch_(char *, ftnlen);
+ extern /* Subroutine */ int csscal_(int *, float *, complex *, int
+ *);
+ static float colmax, rowmax;
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+ w_dim1 = *ldw;
+ w_offset = 1 + w_dim1;
+ w -= w_offset;
+
+ /* Function Body */
+ *info = 0;
+ alpha = (sqrt(17.f) + 1.f) / 8.f;
+ sfmin = slamch_("S", (ftnlen)1);
+ if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+ k = *n;
+L10:
+ kw = *nb + k - *n;
+ if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
+ goto L30;
+ }
+ kstep = 1;
+ p = k;
+ if (k > 1) {
+ i__1 = k - 1;
+ ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &
+ c__1);
+ }
+ i__1 = k + kw * w_dim1;
+ i__2 = k + k * a_dim1;
+ r__1 = a[i__2].r;
+ w[i__1].r = r__1, w[i__1].i = 0.f;
+ if (k < *n) {
+ i__1 = *n - k;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
+ lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
+ w_dim1 + 1], &c__1, (ftnlen)12);
+ i__1 = k + kw * w_dim1;
+ i__2 = k + kw * w_dim1;
+ r__1 = w[i__2].r;
+ w[i__1].r = r__1, w[i__1].i = 0.f;
+ }
+ i__1 = k + kw * w_dim1;
+ absakk = (r__1 = w[i__1].r, dabs(r__1));
+ if (k > 1) {
+ i__1 = k - 1;
+ imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+ i__1 = imax + kw * w_dim1;
+ colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ + kw * w_dim1]), dabs(r__2));
+ } else {
+ colmax = 0.f;
+ }
+ if (dmax(absakk,colmax) == 0.f) {
+ if (*info == 0) {
+ *info = k;
+ }
+ kp = k;
+ i__1 = k + k * a_dim1;
+ i__2 = k + kw * w_dim1;
+ r__1 = w[i__2].r;
+ a[i__1].r = r__1, a[i__1].i = 0.f;
+ if (k > 1) {
+ i__1 = k - 1;
+ ccopy_(&i__1, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1],
+ &c__1);
+ }
+ } else {
+ if (! (absakk < alpha * colmax)) {
+ kp = k;
+ } else {
+ done = FALSE_;
+L12:
+ if (imax > 1) {
+ i__1 = imax - 1;
+ ccopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
+ w_dim1 + 1], &c__1);
+ }
+ i__1 = imax + (kw - 1) * w_dim1;
+ i__2 = imax + imax * a_dim1;
+ r__1 = a[i__2].r;
+ w[i__1].r = r__1, w[i__1].i = 0.f;
+ i__1 = k - imax;
+ ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
+ 1 + (kw - 1) * w_dim1], &c__1);
+ i__1 = k - imax;
+ clacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1);
+ if (k < *n) {
+ i__1 = *n - k;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
+ a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
+ ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
+ ftnlen)12);
+ i__1 = imax + (kw - 1) * w_dim1;
+ i__2 = imax + (kw - 1) * w_dim1;
+ r__1 = w[i__2].r;
+ w[i__1].r = r__1, w[i__1].i = 0.f;
+ }
+ if (imax != k) {
+ i__1 = k - imax;
+ jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) *
+ w_dim1], &c__1);
+ i__1 = jmax + (kw - 1) * w_dim1;
+ rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
+ w[jmax + (kw - 1) * w_dim1]), dabs(r__2));
+ } else {
+ rowmax = 0.f;
+ }
+ if (imax > 1) {
+ i__1 = imax - 1;
+ itemp = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
+ i__1 = itemp + (kw - 1) * w_dim1;
+ stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
+ w[itemp + (kw - 1) * w_dim1]), dabs(r__2));
+ if (stemp > rowmax) {
+ rowmax = stemp;
+ jmax = itemp;
+ }
+ }
+ i__1 = imax + (kw - 1) * w_dim1;
+ if (! ((r__1 = w[i__1].r, dabs(r__1)) < alpha * rowmax)) {
+ kp = imax;
+ ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
+ w_dim1 + 1], &c__1);
+ done = TRUE_;
+ } else if (p == jmax || rowmax <= colmax) {
+ kp = imax;
+ kstep = 2;
+ done = TRUE_;
+ } else {
+ p = imax;
+ colmax = rowmax;
+ imax = jmax;
+ ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
+ w_dim1 + 1], &c__1);
+ }
+ if (! done) {
+ goto L12;
+ }
+ }
+ kk = k - kstep + 1;
+ kkw = *nb + kk - *n;
+ if (kstep == 2 && p != k) {
+ i__1 = p + p * a_dim1;
+ i__2 = k + k * a_dim1;
+ r__1 = a[i__2].r;
+ a[i__1].r = r__1, a[i__1].i = 0.f;
+ i__1 = k - 1 - p;
+ ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) *
+ a_dim1], lda);
+ i__1 = k - 1 - p;
+ clacgv_(&i__1, &a[p + (p + 1) * a_dim1], lda);
+ if (p > 1) {
+ i__1 = p - 1;
+ ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 +
+ 1], &c__1);
+ }
+ if (k < *n) {
+ i__1 = *n - k;
+ cswap_(&i__1, &a[k + (k + 1) * a_dim1], lda, &a[p + (k +
+ 1) * a_dim1], lda);
+ }
+ i__1 = *n - kk + 1;
+ cswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1],
+ ldw);
+ }
+ if (kp != kk) {
+ i__1 = kp + kp * a_dim1;
+ i__2 = kk + kk * a_dim1;
+ r__1 = a[i__2].r;
+ a[i__1].r = r__1, a[i__1].i = 0.f;
+ i__1 = kk - 1 - kp;
+ ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
+ 1) * a_dim1], lda);
+ i__1 = kk - 1 - kp;
+ clacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda);
+ if (kp > 1) {
+ i__1 = kp - 1;
+ ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
+ + 1], &c__1);
+ }
+ if (k < *n) {
+ i__1 = *n - k;
+ cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
+ + 1) * a_dim1], lda);
+ }
+ i__1 = *n - kk + 1;
+ cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
+ w_dim1], ldw);
+ }
+ if (kstep == 1) {
+ ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
+ c__1);
+ if (k > 1) {
+ i__1 = k + k * a_dim1;
+ t = a[i__1].r;
+ if (dabs(t) >= sfmin) {
+ r1 = 1.f / t;
+ i__1 = k - 1;
+ csscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
+ } else {
+ i__1 = k - 1;
+ for (ii = 1; ii <= i__1; ++ii) {
+ i__2 = ii + k * a_dim1;
+ i__3 = ii + k * a_dim1;
+ q__1.r = a[i__3].r / t, q__1.i = a[i__3].i / t;
+ a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+/* L14: */
+ }
+ }
+ i__1 = k - 1;
+ clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+ }
+ } else {
+ if (k > 2) {
+ i__1 = k - 1 + kw * w_dim1;
+ d21.r = w[i__1].r, d21.i = w[i__1].i;
+ r_cnjg(&q__2, &d21);
+ c_div(&q__1, &w[k + kw * w_dim1], &q__2);
+ d11.r = q__1.r, d11.i = q__1.i;
+ c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
+ d22.r = q__1.r, d22.i = q__1.i;
+ q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r *
+ d22.i + d11.i * d22.r;
+ t = 1.f / (q__1.r - 1.f);
+ i__1 = k - 2;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j + (k - 1) * a_dim1;
+ i__3 = j + (kw - 1) * w_dim1;
+ q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
+ q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
+ .r;
+ i__4 = j + kw * w_dim1;
+ q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
+ .i;
+ c_div(&q__2, &q__3, &d21);
+ q__1.r = t * q__2.r, q__1.i = t * q__2.i;
+ a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+ i__2 = j + k * a_dim1;
+ i__3 = j + kw * w_dim1;
+ q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
+ q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
+ .r;
+ i__4 = j + (kw - 1) * w_dim1;
+ q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
+ .i;
+ r_cnjg(&q__5, &d21);
+ c_div(&q__2, &q__3, &q__5);
+ q__1.r = t * q__2.r, q__1.i = t * q__2.i;
+ a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+/* L20: */
+ }
+ }
+ i__1 = k - 1 + (k - 1) * a_dim1;
+ i__2 = k - 1 + (kw - 1) * w_dim1;
+ a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+ i__1 = k - 1 + k * a_dim1;
+ i__2 = k - 1 + kw * w_dim1;
+ a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+ i__1 = k + k * a_dim1;
+ i__2 = k + kw * w_dim1;
+ a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+ i__1 = k - 1;
+ clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+ i__1 = k - 2;
+ clacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
+ }
+ }
+ if (kstep == 1) {
+ ipiv[k] = kp;
+ } else {
+ ipiv[k] = -p;
+ ipiv[k - 1] = -kp;
+ }
+ k -= kstep;
+ goto L10;
+L30:
+ j = k + 1;
+L60:
+ kstep = 1;
+ jp1 = 1;
+ jj = j;
+ jp2 = ipiv[j];
+ if (jp2 < 0) {
+ jp2 = -jp2;
+ ++j;
+ jp1 = -ipiv[j];
+ kstep = 2;
+ }
+ ++j;
+ if (jp2 != jj && j <= *n) {
+ i__1 = *n - j + 1;
+ cswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
+ ;
+ }
+ ++jj;
+ if (kstep == 2 && jp1 != jj && j <= *n) {
+ i__1 = *n - j + 1;
+ cswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
+ ;
+ }
+ if (j < *n) {
+ goto L60;
+ }
+ *kb = *n - k;
+ } else {
+ k = 1;
+L70:
+ if ((k >= *nb && *nb < *n) || k > *n) {
+ goto L90;
+ }
+ kstep = 1;
+ p = k;
+ i__1 = k + k * w_dim1;
+ i__2 = k + k * a_dim1;
+ r__1 = a[i__2].r;
+ w[i__1].r = r__1, w[i__1].i = 0.f;
+ if (k < *n) {
+ i__1 = *n - k;
+ ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k *
+ w_dim1], &c__1);
+ }
+ if (k > 1) {
+ i__1 = *n - k + 1;
+ i__2 = k - 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &
+ w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (
+ ftnlen)12);
+ i__1 = k + k * w_dim1;
+ i__2 = k + k * w_dim1;
+ r__1 = w[i__2].r;
+ w[i__1].r = r__1, w[i__1].i = 0.f;
+ }
+ i__1 = k + k * w_dim1;
+ absakk = (r__1 = w[i__1].r, dabs(r__1));
+ if (k < *n) {
+ i__1 = *n - k;
+ imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
+ i__1 = imax + k * w_dim1;
+ colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ + k * w_dim1]), dabs(r__2));
+ } else {
+ colmax = 0.f;
+ }
+ if (dmax(absakk,colmax) == 0.f) {
+ if (*info == 0) {
+ *info = k;
+ }
+ kp = k;
+ i__1 = k + k * a_dim1;
+ i__2 = k + k * w_dim1;
+ r__1 = w[i__2].r;
+ a[i__1].r = r__1, a[i__1].i = 0.f;
+ if (k < *n) {
+ i__1 = *n - k;
+ ccopy_(&i__1, &w[k + 1 + k * w_dim1], &c__1, &a[k + 1 + k *
+ a_dim1], &c__1);
+ }
+ } else {
+ if (! (absakk < alpha * colmax)) {
+ kp = k;
+ } else {
+ done = FALSE_;
+L72:
+ i__1 = imax - k;
+ ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
+ w_dim1], &c__1);
+ i__1 = imax - k;
+ clacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1);
+ i__1 = imax + (k + 1) * w_dim1;
+ i__2 = imax + imax * a_dim1;
+ r__1 = a[i__2].r;
+ w[i__1].r = r__1, w[i__1].i = 0.f;
+ if (imax < *n) {
+ i__1 = *n - imax;
+ ccopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[
+ imax + 1 + (k + 1) * w_dim1], &c__1);
+ }
+ if (k > 1) {
+ i__1 = *n - k + 1;
+ i__2 = k - 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1]
+ , lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k +
+ 1) * w_dim1], &c__1, (ftnlen)12);
+ i__1 = imax + (k + 1) * w_dim1;
+ i__2 = imax + (k + 1) * w_dim1;
+ r__1 = w[i__2].r;
+ w[i__1].r = r__1, w[i__1].i = 0.f;
+ }
+ if (imax != k) {
+ i__1 = imax - k;
+ jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &
+ c__1);
+ i__1 = jmax + (k + 1) * w_dim1;
+ rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
+ w[jmax + (k + 1) * w_dim1]), dabs(r__2));
+ } else {
+ rowmax = 0.f;
+ }
+ if (imax < *n) {
+ i__1 = *n - imax;
+ itemp = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) *
+ w_dim1], &c__1);
+ i__1 = itemp + (k + 1) * w_dim1;
+ stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
+ w[itemp + (k + 1) * w_dim1]), dabs(r__2));
+ if (stemp > rowmax) {
+ rowmax = stemp;
+ jmax = itemp;
+ }
+ }
+ i__1 = imax + (k + 1) * w_dim1;
+ if (! ((r__1 = w[i__1].r, dabs(r__1)) < alpha * rowmax)) {
+ kp = imax;
+ i__1 = *n - k + 1;
+ ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
+ w_dim1], &c__1);
+ done = TRUE_;
+ } else if (p == jmax || rowmax <= colmax) {
+ kp = imax;
+ kstep = 2;
+ done = TRUE_;
+ } else {
+ p = imax;
+ colmax = rowmax;
+ imax = jmax;
+ i__1 = *n - k + 1;
+ ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
+ w_dim1], &c__1);
+ }
+ if (! done) {
+ goto L72;
+ }
+ }
+ kk = k + kstep - 1;
+ if (kstep == 2 && p != k) {
+ i__1 = p + p * a_dim1;
+ i__2 = k + k * a_dim1;
+ r__1 = a[i__2].r;
+ a[i__1].r = r__1, a[i__1].i = 0.f;
+ i__1 = p - k - 1;
+ ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &a[p + (k + 1) *
+ a_dim1], lda);
+ i__1 = p - k - 1;
+ clacgv_(&i__1, &a[p + (k + 1) * a_dim1], lda);
+ if (p < *n) {
+ i__1 = *n - p;
+ ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + 1 + p
+ * a_dim1], &c__1);
+ }
+ if (k > 1) {
+ i__1 = k - 1;
+ cswap_(&i__1, &a[k + a_dim1], lda, &a[p + a_dim1], lda);
+ }
+ cswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw);
+ }
+ if (kp != kk) {
+ i__1 = kp + kp * a_dim1;
+ i__2 = kk + kk * a_dim1;
+ r__1 = a[i__2].r;
+ a[i__1].r = r__1, a[i__1].i = 0.f;
+ i__1 = kp - kk - 1;
+ ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
+ 1) * a_dim1], lda);
+ i__1 = kp - kk - 1;
+ clacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda);
+ if (kp < *n) {
+ i__1 = *n - kp;
+ ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
+ + kp * a_dim1], &c__1);
+ }
+ if (k > 1) {
+ i__1 = k - 1;
+ cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
+ }
+ cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
+ }
+ if (kstep == 1) {
+ i__1 = *n - k + 1;
+ ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
+ c__1);
+ if (k < *n) {
+ i__1 = k + k * a_dim1;
+ t = a[i__1].r;
+ if (dabs(t) >= sfmin) {
+ r1 = 1.f / t;
+ i__1 = *n - k;
+ csscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
+ } else {
+ i__1 = *n;
+ for (ii = k + 1; ii <= i__1; ++ii) {
+ i__2 = ii + k * a_dim1;
+ i__3 = ii + k * a_dim1;
+ q__1.r = a[i__3].r / t, q__1.i = a[i__3].i / t;
+ a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+/* L74: */
+ }
+ }
+ i__1 = *n - k;
+ clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
+ }
+ } else {
+ if (k < *n - 1) {
+ i__1 = k + 1 + k * w_dim1;
+ d21.r = w[i__1].r, d21.i = w[i__1].i;
+ c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
+ d11.r = q__1.r, d11.i = q__1.i;
+ r_cnjg(&q__2, &d21);
+ c_div(&q__1, &w[k + k * w_dim1], &q__2);
+ d22.r = q__1.r, d22.i = q__1.i;
+ q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r *
+ d22.i + d11.i * d22.r;
+ t = 1.f / (q__1.r - 1.f);
+ i__1 = *n;
+ for (j = k + 2; j <= i__1; ++j) {
+ i__2 = j + k * a_dim1;
+ i__3 = j + k * w_dim1;
+ q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
+ q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
+ .r;
+ i__4 = j + (k + 1) * w_dim1;
+ q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
+ .i;
+ r_cnjg(&q__5, &d21);
+ c_div(&q__2, &q__3, &q__5);
+ q__1.r = t * q__2.r, q__1.i = t * q__2.i;
+ a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+ i__2 = j + (k + 1) * a_dim1;
+ i__3 = j + (k + 1) * w_dim1;
+ q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
+ q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
+ .r;
+ i__4 = j + k * w_dim1;
+ q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
+ .i;
+ c_div(&q__2, &q__3, &d21);
+ q__1.r = t * q__2.r, q__1.i = t * q__2.i;
+ a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+/* L80: */
+ }
+ }
+ i__1 = k + k * a_dim1;
+ i__2 = k + k * w_dim1;
+ a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+ i__1 = k + 1 + k * a_dim1;
+ i__2 = k + 1 + k * w_dim1;
+ a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+ i__1 = k + 1 + (k + 1) * a_dim1;
+ i__2 = k + 1 + (k + 1) * w_dim1;
+ a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+ i__1 = *n - k;
+ clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
+ i__1 = *n - k - 1;
+ clacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1);
+ }
+ }
+ if (kstep == 1) {
+ ipiv[k] = kp;
+ } else {
+ ipiv[k] = -p;
+ ipiv[k + 1] = -kp;
+ }
+ k += kstep;
+ goto L70;
+L90:
+ j = k - 1;
+L120:
+ kstep = 1;
+ jp1 = 1;
+ jj = j;
+ jp2 = ipiv[j];
+ if (jp2 < 0) {
+ jp2 = -jp2;
+ --j;
+ jp1 = -ipiv[j];
+ kstep = 2;
+ }
+ --j;
+ if (jp2 != jj && j >= 1) {
+ cswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
+ }
+ --jj;
+ if (kstep == 2 && jp1 != jj && j >= 1) {
+ cswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
+ }
+ if (j > 1) {
+ goto L120;
+ }
+ *kb = k - 1;
+ }
+ return;
+}
diff --git a/relapack/src/clauum.c b/relapack/src/clauum.c
new file mode 100644
index 000000000..36d6297cf
--- /dev/null
+++ b/relapack/src/clauum.c
@@ -0,0 +1,87 @@
+#include "relapack.h"
+
+static void RELAPACK_clauum_rec(const char *, const int *, float *,
+ const int *, int *);
+
+
+/** CLAUUM computes the product U * U**H or L**H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A.
+ *
+ * This routine is functionally equivalent to LAPACK's clauum.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d2/d36/clauum_8f.html
+ * */
+void RELAPACK_clauum(
+ const char *uplo, const int *n,
+ float *A, const int *ldA,
+ int *info
+) {
+
+ // Check arguments
+ const int lower = LAPACK(lsame)(uplo, "L");
+ const int upper = LAPACK(lsame)(uplo, "U");
+ *info = 0;
+ if (!lower && !upper)
+ *info = -1;
+ else if (*n < 0)
+ *info = -2;
+ else if (*ldA < MAX(1, *n))
+ *info = -4;
+ if (*info) {
+ const int minfo = -*info;
+ LAPACK(xerbla)("CLAUUM", &minfo);
+ return;
+ }
+
+ // Clean char * arguments
+ const char cleanuplo = lower ? 'L' : 'U';
+
+ // Recursive kernel
+ RELAPACK_clauum_rec(&cleanuplo, n, A, ldA, info);
+}
+
+
+/** clauum's recursive compute kernel */
+static void RELAPACK_clauum_rec(
+ const char *uplo, const int *n,
+ float *A, const int *ldA,
+ int *info
+) {
+
+ if (*n <= MAX(CROSSOVER_CLAUUM, 1)) {
+ // Unblocked
+ LAPACK(clauu2)(uplo, n, A, ldA, info);
+ return;
+ }
+
+ // Constants
+ const float ONE[] = { 1., 0. };
+
+ // Splitting
+ const int n1 = CREC_SPLIT(*n);
+ const int n2 = *n - n1;
+
+ // A_TL A_TR
+ // A_BL A_BR
+ float *const A_TL = A;
+ float *const A_TR = A + 2 * *ldA * n1;
+ float *const A_BL = A + 2 * n1;
+ float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
+
+ // recursion(A_TL)
+ RELAPACK_clauum_rec(uplo, &n1, A_TL, ldA, info);
+
+ if (*uplo == 'L') {
+ // A_TL = A_TL + A_BL' * A_BL
+ BLAS(cherk)("L", "C", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA);
+ // A_BL = A_BR' * A_BL
+ BLAS(ctrmm)("L", "L", "C", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
+ } else {
+ // A_TL = A_TL + A_TR * A_TR'
+ BLAS(cherk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA);
+ // A_TR = A_TR * A_BR'
+ BLAS(ctrmm)("R", "U", "C", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
+ }
+
+ // recursion(A_BR)
+ RELAPACK_clauum_rec(uplo, &n2, A_BR, ldA, info);
+}
diff --git a/relapack/src/cpbtrf.c b/relapack/src/cpbtrf.c
new file mode 100644
index 000000000..e0ea7b944
--- /dev/null
+++ b/relapack/src/cpbtrf.c
@@ -0,0 +1,157 @@
+#include "relapack.h"
+#include "stdlib.h"
+
+static void RELAPACK_cpbtrf_rec(const char *, const int *, const int *,
+ float *, const int *, float *, const int *, int *);
+
+
+/** CPBTRF computes the Cholesky factorization of a complex Hermitian positive definite band matrix A.
+ *
+ * This routine is functionally equivalent to LAPACK's cpbtrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/de/d2d/cpbtrf_8f.html
+ * */
+void RELAPACK_cpbtrf(
+ const char *uplo, const int *n, const int *kd,
+ float *Ab, const int *ldAb,
+ int *info
+) {
+
+ // Check arguments
+ const int lower = LAPACK(lsame)(uplo, "L");
+ const int upper = LAPACK(lsame)(uplo, "U");
+ *info = 0;
+ if (!lower && !upper)
+ *info = -1;
+ else if (*n < 0)
+ *info = -2;
+ else if (*kd < 0)
+ *info = -3;
+ else if (*ldAb < *kd + 1)
+ *info = -5;
+ if (*info) {
+ const int minfo = -*info;
+ LAPACK(xerbla)("CPBTRF", &minfo);
+ return;
+ }
+
+ // Clean char * arguments
+ const char cleanuplo = lower ? 'L' : 'U';
+
+ // Constant
+ const float ZERO[] = { 0., 0. };
+
+ // Allocate work space
+ const int n1 = CREC_SPLIT(*n);
+ const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd;
+ const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd;
+ float *Work = malloc(mWork * nWork * 2 * sizeof(float));
+ LAPACK(claset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork);
+
+ // Recursive kernel
+ RELAPACK_cpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info);
+
+ // Free work space
+ free(Work);
+}
+
+
/** cpbtrf's recursive compute kernel.
 *
 * Factors the order-n Hermitian positive definite band matrix stored in Ab
 * (LAPACK band storage, bandwidth kd).  Work is an ldWork-leading-dimension
 * scratch buffer whose relevant triangle was zeroed by the caller; it lets
 * the triangular fringe of the off-diagonal block be updated with full-width
 * BLAS calls.  On error, *info is set as in LAPACK's cpbtrf. */
static void RELAPACK_cpbtrf_rec(
    const char *uplo, const int *n, const int *kd,
    float *Ab, const int *ldAb,
    float *Work, const int *ldWork,
    int *info
){

    if (*n <= MAX(CROSSOVER_CPBTRF, 1)) {
        // Unblocked
        LAPACK(cpbtf2)(uplo, n, kd, Ab, ldAb, info);
        return;
    }

    // Constants: complex 1 and -1 as interleaved float pairs
    const float ONE[] = { 1., 0. };
    const float MONE[] = { -1., 0. };

    // Unskew A: with leading dimension ldAb - 1 (offset to the diagonal for
    // the upper case), the band can be addressed like a column-major matrix
    const int ldA[] = { *ldAb - 1 };
    float *const A = Ab + 2 * ((*uplo == 'L') ? 0 : *kd);

    // Splitting: the leading block is capped at the bandwidth
    const int n1 = MIN(CREC_SPLIT(*n), *kd);
    const int n2 = *n - n1;

    // * *
    // * Ab_BR
    // (trailing block in the original skewed band storage)
    float *const Ab_BR = Ab + 2 * *ldAb * n1;

    // A_TL A_TR
    // A_BL A_BR
    float *const A_TL = A;
    float *const A_TR = A + 2 * *ldA * n1;
    float *const A_BL = A + 2 * n1;
    float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;

    // recursion(A_TL) — the leading block is dense, so use the full Cholesky
    RELAPACK_cpotrf(uplo, &n1, A_TL, ldA, info);
    if (*info)
        return;

    // Banded splitting: n21 columns/rows of the off-diagonal block are fully
    // inside the band; n22 form a triangular fringe partly outside it
    const int n21 = MIN(n2, *kd - n1);
    const int n22 = MIN(n2 - n21, *kd);

    // n1 n21 n22
    // n1 * A_TRl A_TRr
    // n21 A_BLt A_BRtl A_BRtr
    // n22 A_BLb A_BRbl A_BRbr
    float *const A_TRl = A_TR;
    float *const A_TRr = A_TR + 2 * *ldA * n21;
    float *const A_BLt = A_BL;
    float *const A_BLb = A_BL + 2 * n21;
    float *const A_BRtl = A_BR;
    float *const A_BRtr = A_BR + 2 * *ldA * n21;
    float *const A_BRbl = A_BR + 2 * n21;
    float *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * n21;

    if (*uplo == 'L') {
        // A_BLt = ABLt / A_TL'
        BLAS(ctrsm)("R", "L", "C", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA);
        // A_BRtl = A_BRtl - A_BLt * A_BLt'
        BLAS(cherk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA);
        // Work = A_BLb   (triangular fringe into the zero-padded workspace)
        LAPACK(clacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork);
        // Work = Work / A_TL'
        BLAS(ctrsm)("R", "L", "C", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork);
        // A_BRbl = A_BRbl - Work * A_BLt'
        BLAS(cgemm)("N", "C", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA);
        // A_BRbr = A_BRbr - Work * Work'
        BLAS(cherk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
        // A_BLb = Work   (copy the updated fringe back into the band)
        LAPACK(clacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA);
    } else {
        // A_TRl = A_TL' \ A_TRl
        BLAS(ctrsm)("L", "U", "C", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
        // A_BRtl = A_BRtl - A_TRl' * A_TRl
        BLAS(cherk)("U", "C", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA);
        // Work = A_TRr   (triangular fringe into the zero-padded workspace)
        LAPACK(clacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork);
        // Work = A_TL' \ Work
        BLAS(ctrsm)("L", "U", "C", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork);
        // A_BRtr = A_BRtr - A_TRl' * Work
        BLAS(cgemm)("C", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA);
        // A_BRbr = A_BRbr - Work' * Work
        BLAS(cherk)("U", "C", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
        // A_TRr = Work   (copy the updated fringe back into the band)
        LAPACK(clacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA);
    }

    // recursion(A_BR)
    // (once the bandwidth exceeds the leading order the remainder is dense)
    if (*kd > n1)
        RELAPACK_cpotrf(uplo, &n2, A_BR, ldA, info);
    else
        RELAPACK_cpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info);
    if (*info)
        *info += n1;
}
diff --git a/relapack/src/cpotrf.c b/relapack/src/cpotrf.c
new file mode 100644
index 000000000..e35caa7fa
--- /dev/null
+++ b/relapack/src/cpotrf.c
@@ -0,0 +1,92 @@
+#include "relapack.h"
+
+static void RELAPACK_cpotrf_rec(const char *, const int *, float *,
+ const int *, int *);
+
+
+/** CPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix A.
+ *
+ * This routine is functionally equivalent to LAPACK's cpotrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/dd/dce/cpotrf_8f.html
+ * */
+void RELAPACK_cpotrf(
+ const char *uplo, const int *n,
+ float *A, const int *ldA,
+ int *info
+) {
+
+ // Check arguments
+ const int lower = LAPACK(lsame)(uplo, "L");
+ const int upper = LAPACK(lsame)(uplo, "U");
+ *info = 0;
+ if (!lower && !upper)
+ *info = -1;
+ else if (*n < 0)
+ *info = -2;
+ else if (*ldA < MAX(1, *n))
+ *info = -4;
+ if (*info) {
+ const int minfo = -*info;
+ LAPACK(xerbla)("CPOTRF", &minfo);
+ return;
+ }
+
+ // Clean char * arguments
+ const char cleanuplo = lower ? 'L' : 'U';
+
+ // Recursive kernel
+ RELAPACK_cpotrf_rec(&cleanuplo, n, A, ldA, info);
+}
+
+
+/** cpotrf's recursive compute kernel */
+static void RELAPACK_cpotrf_rec(
+ const char *uplo, const int *n,
+ float *A, const int *ldA,
+ int *info
+){
+
+ if (*n <= MAX(CROSSOVER_CPOTRF, 1)) {
+ // Unblocked
+ LAPACK(cpotf2)(uplo, n, A, ldA, info);
+ return;
+ }
+
+ // Constants
+ const float ONE[] = { 1., 0. };
+ const float MONE[] = { -1., 0. };
+
+ // Splitting
+ const int n1 = CREC_SPLIT(*n);
+ const int n2 = *n - n1;
+
+ // A_TL A_TR
+ // A_BL A_BR
+ float *const A_TL = A;
+ float *const A_TR = A + 2 * *ldA * n1;
+ float *const A_BL = A + 2 * n1;
+ float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
+
+ // recursion(A_TL)
+ RELAPACK_cpotrf_rec(uplo, &n1, A_TL, ldA, info);
+ if (*info)
+ return;
+
+ if (*uplo == 'L') {
+ // A_BL = A_BL / A_TL'
+ BLAS(ctrsm)("R", "L", "C", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA);
+ // A_BR = A_BR - A_BL * A_BL'
+ BLAS(cherk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA);
+ } else {
+ // A_TR = A_TL' \ A_TR
+ BLAS(ctrsm)("L", "U", "C", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
+ // A_BR = A_BR - A_TR' * A_TR
+ BLAS(cherk)("U", "C", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA);
+ }
+
+ // recursion(A_BR)
+ RELAPACK_cpotrf_rec(uplo, &n2, A_BR, ldA, info);
+ if (*info)
+ *info += n1;
+}
diff --git a/relapack/src/csytrf.c b/relapack/src/csytrf.c
new file mode 100644
index 000000000..01c161d1a
--- /dev/null
+++ b/relapack/src/csytrf.c
@@ -0,0 +1,238 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include <stdlib.h>
+#endif
+
+static void RELAPACK_csytrf_rec(const char *, const int *, const int *, int *,
+ float *, const int *, int *, float *, const int *, int *);
+
+
+/** CSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's csytrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d5/d21/csytrf_8f.html
+ * */
+void RELAPACK_csytrf(
+ const char *uplo, const int *n,
+ float *A, const int *ldA, int *ipiv,
+ float *Work, const int *lWork, int *info
+) {
+
+ // Required work size
+ const int cleanlWork = *n * (*n / 2);
+ int minlWork = cleanlWork;
+#if XSYTRF_ALLOW_MALLOC
+ minlWork = 1;
+#endif
+
+ // Check arguments
+ const int lower = LAPACK(lsame)(uplo, "L");
+ const int upper = LAPACK(lsame)(uplo, "U");
+ *info = 0;
+ if (!lower && !upper)
+ *info = -1;
+ else if (*n < 0)
+ *info = -2;
+ else if (*ldA < MAX(1, *n))
+ *info = -4;
+ else if (*lWork < minlWork && *lWork != -1)
+ *info = -7;
+ else if (*lWork == -1) {
+ // Work size query
+ *Work = cleanlWork;
+ return;
+ }
+
+ // Ensure Work size
+ float *cleanWork = Work;
+#if XSYTRF_ALLOW_MALLOC
+ if (!*info && *lWork < cleanlWork) {
+ cleanWork = malloc(cleanlWork * 2 * sizeof(float));
+ if (!cleanWork)
+ *info = -7;
+ }
+#endif
+
+ if (*info) {
+ const int minfo = -*info;
+ LAPACK(xerbla)("CSYTRF", &minfo);
+ return;
+ }
+
+ // Clean char * arguments
+ const char cleanuplo = lower ? 'L' : 'U';
+
+ // Dummy arguments
+ int nout;
+
+ // Recursive kernel
+ RELAPACK_csytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
+
+#if XSYTRF_ALLOW_MALLOC
+ if (cleanWork != Work)
+ free(cleanWork);
+#endif
+}
+
+
/** csytrf's recursive compute kernel.
 *
 * Factors *n columns of a matrix of extent *n_full (the extra
 * *n_full - *n rows/columns are the not-yet-factored remainder that still
 * receives updates).  *n_out returns how many columns were actually
 * factored — the unblocked kernel may stop one column early when a 2x2
 * pivot straddles the block boundary.  Work/ldWork is the caller's
 * workspace; info is 0 or the first failing column. */
static void RELAPACK_csytrf_rec(
    const char *uplo, const int *n_full, const int *n, int *n_out,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *ldWork, int *info
) {

    // top recursion level?
    const int top = *n_full == *n;

    if (*n <= MAX(CROSSOVER_CSYTRF, 3)) {
        // Unblocked
        if (top) {
            LAPACK(csytf2)(uplo, n, A, ldA, ipiv, info);
            *n_out = *n;
        } else
            RELAPACK_csytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
        return;
    }

    int info1, info2;

    // Constants: complex 1 and -1 (float pairs), integer 1 for BLAS strides
    const float ONE[] = { 1., 0. };
    const float MONE[] = { -1., 0. };
    const int iONE[] = { 1 };

    // Loop iterator
    int i;

    // rows/columns beyond the *n columns handled at this level
    const int n_rest = *n_full - *n;

    if (*uplo == 'L') {
        // Splitting (setup)
        int n1 = CREC_SPLIT(*n);
        int n2 = *n - n1;

        // Work_L *
        float *const Work_L = Work;

        // recursion(A_L)
        int n1_out;
        RELAPACK_csytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
        n1 = n1_out;

        // Splitting (continued) — adjust for a possibly short first factorization
        n2 = *n - n1;
        const int n_full2 = *n_full - n1;

        // * *
        // A_BL A_BR
        // A_BL_B A_BR_B
        float *const A_BL = A + 2 * n1;
        float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
        float *const A_BL_B = A + 2 * *n;
        float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;

        // * *
        // Work_BL Work_BR
        // * *
        // (top recursion level: use Work as Work_BR)
        float *const Work_BL = Work + 2 * n1;
        float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
        const int ldWork_BR = top ? n2 : *ldWork;

        // ipiv_T
        // ipiv_B
        int *const ipiv_B = ipiv + n1;

        // A_BR = A_BR - A_BL Work_BL'
        // (cgemmt updates only the lower triangle; cgemm handles the rows below)
        RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
        BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);

        // recursion(A_BR)
        int n2_out;
        RELAPACK_csytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);

        if (n2_out != n2) {
            // undo 1 column of updates (the deferred column must be restored)
            const int n_restp1 = n_rest + 1;

            // last column of A_BR
            float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;

            // last row of A_BL
            float *const A_BL_b = A_BL + 2 * n2_out;

            // last row of Work_BL
            float *const Work_BL_b = Work_BL + 2 * n2_out;

            // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
            BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
        }
        n2 = n2_out;

        // shift pivots (negative entries mark 2x2 pivot pairs)
        for (i = 0; i < n2; i++)
            if (ipiv_B[i] > 0)
                ipiv_B[i] += n1;
            else
                ipiv_B[i] -= n1;

        *info = info1 || info2;
        *n_out = n1 + n2;
    } else {
        // Splitting (setup)
        int n2 = CREC_SPLIT(*n);
        int n1 = *n - n2;

        // * Work_R
        // (top recursion level: use Work as Work_R)
        float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;

        // recursion(A_R)
        int n2_out;
        RELAPACK_csytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
        const int n2_diff = n2 - n2_out;
        n2 = n2_out;

        // Splitting (continued) — adjust for a possibly short first factorization
        n1 = *n - n2;
        const int n_full1 = *n_full - n2;

        // * A_TL_T A_TR_T
        // * A_TL A_TR
        // * * *
        float *const A_TL_T = A + 2 * *ldA * n_rest;
        float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
        float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
        float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;

        // Work_L *
        // * Work_TR
        // * *
        // (top recursion level: Work_R was Work)
        float *const Work_L = Work;
        float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
        const int ldWork_L = top ? n1 : *ldWork;

        // A_TL = A_TL - A_TR Work_TR'
        // (cgemmt updates only the upper triangle; cgemm handles the rows above)
        RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
        BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);

        // recursion(A_TL)
        int n1_out;
        RELAPACK_csytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);

        if (n1_out != n1) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
            BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
        }
        n1 = n1_out;

        *info = info2 || info1;
        *n_out = n1 + n2;
    }
}
diff --git a/relapack/src/csytrf_rec2.c b/relapack/src/csytrf_rec2.c
new file mode 100644
index 000000000..9d6bd849d
--- /dev/null
+++ b/relapack/src/csytrf_rec2.c
@@ -0,0 +1,451 @@
+/* -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static complex c_b1 = {1.f,0.f};
+static int c__1 = 1;
+
/** CSYTRF_REC2 computes a partial factorization of a complex symmetric matrix using the Bunch-Kaufman diagonal pivoting method.
 *
 * This routine is a minor modification of LAPACK's clasyf.
 * It serves as an unblocked kernel in the recursive algorithms.
 * The blocked BLAS Level 3 updates were removed and moved to the
 * recursive algorithm.
 *
 * On exit *kb is the number of columns actually factored (may be one
 * less than *nb when a 2x2 pivot would straddle the block boundary);
 * w is the nb-column workspace holding the updated columns.
 * */
/* Subroutine */ void RELAPACK_csytrf_rec2(char *uplo, int *n, int *
	nb, int *kb, complex *a, int *lda, int *ipiv, complex *w,
	int *ldw, int *info, ftnlen uplo_len)
{
    /* System generated locals */
    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
    float r__1, r__2, r__3, r__4;
    complex q__1, q__2, q__3;

    /* Builtin functions */
    double sqrt(double), r_imag(complex *);
    void c_div(complex *, complex *, complex *);

    /* Local variables */
    static int j, k;
    static complex t, r1, d11, d21, d22;
    static int jj, kk, jp, kp, kw, kkw, imax, jmax;
    static float alpha;
    extern /* Subroutine */ int cscal_(int *, complex *, complex *,
	    int *);
    extern logical lsame_(char *, char *, ftnlen, ftnlen);
    extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
	    , complex *, int *, complex *, int *, complex *, complex *
	    , int *, ftnlen), ccopy_(int *, complex *, int *,
	    complex *, int *), cswap_(int *, complex *, int *,
	    complex *, int *);
    static int kstep;
    static float absakk;
    extern int icamax_(int *, complex *, int *);
    static float colmax, rowmax;

    /* Parameter adjustments (f2c 1-based indexing) */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    --ipiv;
    w_dim1 = *ldw;
    w_offset = 1 + w_dim1;
    w -= w_offset;

    /* Function Body */
    *info = 0;
/* alpha: Bunch-Kaufman pivot threshold */
    alpha = (sqrt(17.f) + 1.f) / 8.f;
    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
/* Upper case: factor the last nb columns, from column n downwards */
	k = *n;
L10:
	kw = *nb + k - *n;
	if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
	    goto L30;
	}
/* Copy column k into the workspace and apply the pending updates */
	ccopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
	if (k < *n) {
	    i__1 = *n - k;
	    q__1.r = -1.f, q__1.i = -0.f;
	    cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
		     lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
		    w_dim1 + 1], &c__1, (ftnlen)12);
	}
	kstep = 1;
/* absakk: magnitude of the diagonal; colmax: largest off-diagonal */
	i__1 = k + kw * w_dim1;
	absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + kw *
		w_dim1]), dabs(r__2));
	if (k > 1) {
	    i__1 = k - 1;
	    imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
	    i__1 = imax + kw * w_dim1;
	    colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
		    + kw * w_dim1]), dabs(r__2));
	} else {
	    colmax = 0.f;
	}
	if (dmax(absakk,colmax) == 0.f) {
/* Column is exactly zero: record singularity, keep 1x1 pivot */
	    if (*info == 0) {
		*info = k;
	    }
	    kp = k;
	} else {
	    if (absakk >= alpha * colmax) {
		kp = k;
	    } else {
/* Examine candidate row imax for a 1x1 or 2x2 pivot */
		ccopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
			w_dim1 + 1], &c__1);
		i__1 = k - imax;
		ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
			1 + (kw - 1) * w_dim1], &c__1);
		if (k < *n) {
		    i__1 = *n - k;
		    q__1.r = -1.f, q__1.i = -0.f;
		    cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
			    a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
			    ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
			    ftnlen)12);
		}
		i__1 = k - imax;
		jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
			 &c__1);
		i__1 = jmax + (kw - 1) * w_dim1;
		rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
			jmax + (kw - 1) * w_dim1]), dabs(r__2));
		if (imax > 1) {
		    i__1 = imax - 1;
		    jmax = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
/* Computing MAX */
		    i__1 = jmax + (kw - 1) * w_dim1;
		    r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
			    r__2 = r_imag(&w[jmax + (kw - 1) * w_dim1]), dabs(
			    r__2));
		    rowmax = dmax(r__3,r__4);
		}
		if (absakk >= alpha * colmax * (colmax / rowmax)) {
		    kp = k;
		} else /* if(complicated condition) */ {
		    i__1 = imax + (kw - 1) * w_dim1;
		    if ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
			    imax + (kw - 1) * w_dim1]), dabs(r__2)) >= alpha *
			     rowmax) {
			kp = imax;
			ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
				w_dim1 + 1], &c__1);
		    } else {
			kp = imax;
			kstep = 2;
		    }
		}
	    }
/* Interchange rows/columns kk and kp if required */
	    kk = k - kstep + 1;
	    kkw = *nb + kk - *n;
	    if (kp != kk) {
		i__1 = kp + kp * a_dim1;
		i__2 = kk + kk * a_dim1;
		a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
		i__1 = kk - 1 - kp;
		ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
			1) * a_dim1], lda);
		if (kp > 1) {
		    i__1 = kp - 1;
		    ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
			    + 1], &c__1);
		}
		if (k < *n) {
		    i__1 = *n - k;
		    cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
			    + 1) * a_dim1], lda);
		}
		i__1 = *n - kk + 1;
		cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
			w_dim1], ldw);
	    }
	    if (kstep == 1) {
/* 1x1 pivot: store column k and scale it by 1/d */
		ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
			c__1);
		c_div(&q__1, &c_b1, &a[k + k * a_dim1]);
		r1.r = q__1.r, r1.i = q__1.i;
		i__1 = k - 1;
		cscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
	    } else {
/* 2x2 pivot: form the inverse of the 2x2 block and apply it */
		if (k > 2) {
		    i__1 = k - 1 + kw * w_dim1;
		    d21.r = w[i__1].r, d21.i = w[i__1].i;
		    c_div(&q__1, &w[k + kw * w_dim1], &d21);
		    d11.r = q__1.r, d11.i = q__1.i;
		    c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
		    d22.r = q__1.r, d22.i = q__1.i;
		    q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r *
			    d22.i + d11.i * d22.r;
		    q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
		    c_div(&q__1, &c_b1, &q__2);
		    t.r = q__1.r, t.i = q__1.i;
		    c_div(&q__1, &t, &d21);
		    d21.r = q__1.r, d21.i = q__1.i;
		    i__1 = k - 2;
		    for (j = 1; j <= i__1; ++j) {
			i__2 = j + (k - 1) * a_dim1;
			i__3 = j + (kw - 1) * w_dim1;
			q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
				q__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
				.r;
			i__4 = j + kw * w_dim1;
			q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
				.i;
			q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
				d21.r * q__2.i + d21.i * q__2.r;
			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
			i__2 = j + k * a_dim1;
			i__3 = j + kw * w_dim1;
			q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
				q__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
				.r;
			i__4 = j + (kw - 1) * w_dim1;
			q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
				.i;
			q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
				d21.r * q__2.i + d21.i * q__2.r;
			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L20: */
		    }
		}
		i__1 = k - 1 + (k - 1) * a_dim1;
		i__2 = k - 1 + (kw - 1) * w_dim1;
		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
		i__1 = k - 1 + k * a_dim1;
		i__2 = k - 1 + kw * w_dim1;
		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
		i__1 = k + k * a_dim1;
		i__2 = k + kw * w_dim1;
		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
	    }
	}
/* Record the pivot(s); negative entries mark a 2x2 pivot pair */
	if (kstep == 1) {
	    ipiv[k] = kp;
	} else {
	    ipiv[k] = -kp;
	    ipiv[k - 1] = -kp;
	}
	k -= kstep;
	goto L10;
L30:
/* Apply the accumulated row interchanges to the unfactored columns */
	j = k + 1;
L60:
	jj = j;
	jp = ipiv[j];
	if (jp < 0) {
	    jp = -jp;
	    ++j;
	}
	++j;
	if (jp != jj && j <= *n) {
	    i__1 = *n - j + 1;
	    cswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
	}
	if (j < *n) {
	    goto L60;
	}
	*kb = *n - k;
    } else {
/* Lower case: factor the first nb columns, from column 1 upwards */
	k = 1;
L70:
	if ((k >= *nb && *nb < *n) || k > *n) {
	    goto L90;
	}
/* Copy column k into the workspace and apply the pending updates */
	i__1 = *n - k + 1;
	ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
	i__1 = *n - k + 1;
	i__2 = k - 1;
	q__1.r = -1.f, q__1.i = -0.f;
	cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &w[k
		+ w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12);
	kstep = 1;
	i__1 = k + k * w_dim1;
	absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + k *
		w_dim1]), dabs(r__2));
	if (k < *n) {
	    i__1 = *n - k;
	    imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
	    i__1 = imax + k * w_dim1;
	    colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
		    + k * w_dim1]), dabs(r__2));
	} else {
	    colmax = 0.f;
	}
	if (dmax(absakk,colmax) == 0.f) {
/* Column is exactly zero: record singularity, keep 1x1 pivot */
	    if (*info == 0) {
		*info = k;
	    }
	    kp = k;
	} else {
	    if (absakk >= alpha * colmax) {
		kp = k;
	    } else {
/* Examine candidate row imax for a 1x1 or 2x2 pivot */
		i__1 = imax - k;
		ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
			w_dim1], &c__1);
		i__1 = *n - imax + 1;
		ccopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
			1) * w_dim1], &c__1);
		i__1 = *n - k + 1;
		i__2 = k - 1;
		q__1.r = -1.f, q__1.i = -0.f;
		cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1],
			lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) *
			w_dim1], &c__1, (ftnlen)12);
		i__1 = imax - k;
		jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
			;
		i__1 = jmax + (k + 1) * w_dim1;
		rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
			jmax + (k + 1) * w_dim1]), dabs(r__2));
		if (imax < *n) {
		    i__1 = *n - imax;
		    jmax = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) *
			    w_dim1], &c__1);
/* Computing MAX */
		    i__1 = jmax + (k + 1) * w_dim1;
		    r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
			    r__2 = r_imag(&w[jmax + (k + 1) * w_dim1]), dabs(
			    r__2));
		    rowmax = dmax(r__3,r__4);
		}
		if (absakk >= alpha * colmax * (colmax / rowmax)) {
		    kp = k;
		} else /* if(complicated condition) */ {
		    i__1 = imax + (k + 1) * w_dim1;
		    if ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
			    imax + (k + 1) * w_dim1]), dabs(r__2)) >= alpha *
			    rowmax) {
			kp = imax;
			i__1 = *n - k + 1;
			ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k +
				k * w_dim1], &c__1);
		    } else {
			kp = imax;
			kstep = 2;
		    }
		}
	    }
/* Interchange rows/columns kk and kp if required */
	    kk = k + kstep - 1;
	    if (kp != kk) {
		i__1 = kp + kp * a_dim1;
		i__2 = kk + kk * a_dim1;
		a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
		i__1 = kp - kk - 1;
		ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
			1) * a_dim1], lda);
		if (kp < *n) {
		    i__1 = *n - kp;
		    ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
			    + kp * a_dim1], &c__1);
		}
		if (k > 1) {
		    i__1 = k - 1;
		    cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
		}
		cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
	    }
	    if (kstep == 1) {
/* 1x1 pivot: store column k and scale it by 1/d */
		i__1 = *n - k + 1;
		ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
			c__1);
		if (k < *n) {
		    c_div(&q__1, &c_b1, &a[k + k * a_dim1]);
		    r1.r = q__1.r, r1.i = q__1.i;
		    i__1 = *n - k;
		    cscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
		}
	    } else {
/* 2x2 pivot: form the inverse of the 2x2 block and apply it */
		if (k < *n - 1) {
		    i__1 = k + 1 + k * w_dim1;
		    d21.r = w[i__1].r, d21.i = w[i__1].i;
		    c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
		    d11.r = q__1.r, d11.i = q__1.i;
		    c_div(&q__1, &w[k + k * w_dim1], &d21);
		    d22.r = q__1.r, d22.i = q__1.i;
		    q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r *
			    d22.i + d11.i * d22.r;
		    q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
		    c_div(&q__1, &c_b1, &q__2);
		    t.r = q__1.r, t.i = q__1.i;
		    c_div(&q__1, &t, &d21);
		    d21.r = q__1.r, d21.i = q__1.i;
		    i__1 = *n;
		    for (j = k + 2; j <= i__1; ++j) {
			i__2 = j + k * a_dim1;
			i__3 = j + k * w_dim1;
			q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
				q__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
				.r;
			i__4 = j + (k + 1) * w_dim1;
			q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
				.i;
			q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
				d21.r * q__2.i + d21.i * q__2.r;
			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
			i__2 = j + (k + 1) * a_dim1;
			i__3 = j + (k + 1) * w_dim1;
			q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
				q__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
				.r;
			i__4 = j + k * w_dim1;
			q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
				.i;
			q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
				d21.r * q__2.i + d21.i * q__2.r;
			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L80: */
		    }
		}
		i__1 = k + k * a_dim1;
		i__2 = k + k * w_dim1;
		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
		i__1 = k + 1 + k * a_dim1;
		i__2 = k + 1 + k * w_dim1;
		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
		i__1 = k + 1 + (k + 1) * a_dim1;
		i__2 = k + 1 + (k + 1) * w_dim1;
		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
	    }
	}
/* Record the pivot(s); negative entries mark a 2x2 pivot pair */
	if (kstep == 1) {
	    ipiv[k] = kp;
	} else {
	    ipiv[k] = -kp;
	    ipiv[k + 1] = -kp;
	}
	k += kstep;
	goto L70;
L90:
/* Apply the accumulated row interchanges to the unfactored columns */
	j = k - 1;
L120:
	jj = j;
	jp = ipiv[j];
	if (jp < 0) {
	    jp = -jp;
	    --j;
	}
	--j;
	if (jp != jj && j >= 1) {
	    cswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
	}
	if (j > 1) {
	    goto L120;
	}
	*kb = k - 1;
    }
    return;
}
diff --git a/relapack/src/csytrf_rook.c b/relapack/src/csytrf_rook.c
new file mode 100644
index 000000000..aa7dd0e57
--- /dev/null
+++ b/relapack/src/csytrf_rook.c
@@ -0,0 +1,236 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include <stdlib.h>
+#endif
+
+static void RELAPACK_csytrf_rook_rec(const char *, const int *, const int *, int *,
+ float *, const int *, int *, float *, const int *, int *);
+
+
+/** CSYTRF_ROOK computes the factorization of a complex symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's csytrf_rook.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d8/dc8/csytrf__rook_8f.html
+ * */
+void RELAPACK_csytrf_rook(
+ const char *uplo, const int *n,
+ float *A, const int *ldA, int *ipiv,
+ float *Work, const int *lWork, int *info
+) {
+
+ // Required work size
+ const int cleanlWork = *n * (*n / 2);
+ int minlWork = cleanlWork;
+#if XSYTRF_ALLOW_MALLOC
+ minlWork = 1;
+#endif
+
+ // Check arguments
+ const int lower = LAPACK(lsame)(uplo, "L");
+ const int upper = LAPACK(lsame)(uplo, "U");
+ *info = 0;
+ if (!lower && !upper)
+ *info = -1;
+ else if (*n < 0)
+ *info = -2;
+ else if (*ldA < MAX(1, *n))
+ *info = -4;
+ else if (*lWork < minlWork && *lWork != -1)
+ *info = -7;
+ else if (*lWork == -1) {
+ // Work size query
+ *Work = cleanlWork;
+ return;
+ }
+
+ // Ensure Work size
+ float *cleanWork = Work;
+#if XSYTRF_ALLOW_MALLOC
+ if (!*info && *lWork < cleanlWork) {
+ cleanWork = malloc(cleanlWork * 2 * sizeof(float));
+ if (!cleanWork)
+ *info = -7;
+ }
+#endif
+
+ if (*info) {
+ const int minfo = -*info;
+ LAPACK(xerbla)("CSYTRF", &minfo);
+ return;
+ }
+
+ // Clean char * arguments
+ const char cleanuplo = lower ? 'L' : 'U';
+
+ // Dummy argument
+ int nout;
+
+ // Recursive kernel
+ RELAPACK_csytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
+
+#if XSYTRF_ALLOW_MALLOC
+ if (cleanWork != Work)
+ free(cleanWork);
+#endif
+}
+
+
+/** csytrf_rook's recursive compute kernel */
+static void RELAPACK_csytrf_rook_rec(
+ const char *uplo, const int *n_full, const int *n, int *n_out,
+ float *A, const int *ldA, int *ipiv,
+ float *Work, const int *ldWork, int *info
+) {
+
+ // top recursion level?
+ const int top = *n_full == *n;
+
+ if (*n <= MAX(CROSSOVER_CSYTRF_ROOK, 3)) {
+ // Unblocked
+ if (top) {
+ LAPACK(csytf2)(uplo, n, A, ldA, ipiv, info);
+ *n_out = *n;
+ } else
+ RELAPACK_csytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
+ return;
+ }
+
+ int info1, info2;
+
+ // Constants
+ const float ONE[] = { 1., 0. };
+ const float MONE[] = { -1., 0. };
+ const int iONE[] = { 1 };
+
+ const int n_rest = *n_full - *n;
+
+ if (*uplo == 'L') {
+ // Splitting (setup)
+ int n1 = CREC_SPLIT(*n);
+ int n2 = *n - n1;
+
+ // Work_L *
+ float *const Work_L = Work;
+
+ // recursion(A_L)
+ int n1_out;
+ RELAPACK_csytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
+ n1 = n1_out;
+
+ // Splitting (continued)
+ n2 = *n - n1;
+ const int n_full2 = *n_full - n1;
+
+ // * *
+ // A_BL A_BR
+ // A_BL_B A_BR_B
+ float *const A_BL = A + 2 * n1;
+ float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
+ float *const A_BL_B = A + 2 * *n;
+ float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
+
+ // * *
+ // Work_BL Work_BR
+ // * *
+ // (top recursion level: use Work as Work_BR)
+ float *const Work_BL = Work + 2 * n1;
+ float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
+ const int ldWork_BR = top ? n2 : *ldWork;
+
+ // ipiv_T
+ // ipiv_B
+ int *const ipiv_B = ipiv + n1;
+
+ // A_BR = A_BR - A_BL Work_BL'
+ RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
+ BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
+
+ // recursion(A_BR)
+ int n2_out;
+ RELAPACK_csytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
+
+ if (n2_out != n2) {
+ // undo 1 column of updates
+ const int n_restp1 = n_rest + 1;
+
+ // last column of A_BR
+ float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
+
+ // last row of A_BL
+ float *const A_BL_b = A_BL + 2 * n2_out;
+
+ // last row of Work_BL
+ float *const Work_BL_b = Work_BL + 2 * n2_out;
+
+ // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
+ BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
+ }
+ n2 = n2_out;
+
+ // shift pivots
+ int i;
+ for (i = 0; i < n2; i++)
+ if (ipiv_B[i] > 0)
+ ipiv_B[i] += n1;
+ else
+ ipiv_B[i] -= n1;
+
+ *info = info1 || info2;
+ *n_out = n1 + n2;
+ } else {
+ // Splitting (setup)
+ int n2 = CREC_SPLIT(*n);
+ int n1 = *n - n2;
+
+ // * Work_R
+ // (top recursion level: use Work as Work_R)
+ float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
+
+ // recursion(A_R)
+ int n2_out;
+ RELAPACK_csytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
+ const int n2_diff = n2 - n2_out;
+ n2 = n2_out;
+
+ // Splitting (continued)
+ n1 = *n - n2;
+ const int n_full1 = *n_full - n2;
+
+ // * A_TL_T A_TR_T
+ // * A_TL A_TR
+ // * * *
+ float *const A_TL_T = A + 2 * *ldA * n_rest;
+ float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
+ float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
+ float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
+
+ // Work_L *
+ // * Work_TR
+ // * *
+ // (top recursion level: Work_R was Work)
+ float *const Work_L = Work;
+ float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
+ const int ldWork_L = top ? n1 : *ldWork;
+
+ // A_TL = A_TL - A_TR Work_TR'
+ RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
+ BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
+
+ // recursion(A_TL)
+ int n1_out;
+ RELAPACK_csytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
+
+ if (n1_out != n1) {
+ // undo 1 column of updates
+ const int n_restp1 = n_rest + 1;
+
+ // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
+ BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
+ }
+ n1 = n1_out;
+
+ *info = info2 || info1;
+ *n_out = n1 + n2;
+ }
+}
diff --git a/relapack/src/csytrf_rook_rec2.c b/relapack/src/csytrf_rook_rec2.c
new file mode 100644
index 000000000..6638338a6
--- /dev/null
+++ b/relapack/src/csytrf_rook_rec2.c
@@ -0,0 +1,565 @@
+/* -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static complex c_b1 = {1.f,0.f};
+static int c__1 = 1;
+
+/** CSYTRF_ROOK_REC2 computes a partial factorization of a complex symmetric matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+ *
+ * This routine is a minor modification of LAPACK's clasyf_rook.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
+ * */
+/* Subroutine */ void RELAPACK_csytrf_rook_rec2(char *uplo, int *n,
+ int *nb, int *kb, complex *a, int *lda, int *ipiv,
+ complex *w, int *ldw, int *info, ftnlen uplo_len)
+{
+ /* System generated locals */
+ int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
+ float r__1, r__2;
+ complex q__1, q__2, q__3, q__4;
+
+ /* Builtin functions */
+ double sqrt(double), r_imag(complex *);
+ void c_div(complex *, complex *, complex *);
+
+ /* Local variables */
+ static int j, k, p;
+ static complex t, r1, d11, d12, d21, d22;
+ static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
+ static logical done;
+ static int imax, jmax;
+ static float alpha;
+ extern /* Subroutine */ int cscal_(int *, complex *, complex *,
+ int *);
+ extern logical lsame_(char *, char *, ftnlen, ftnlen);
+ extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
+ , complex *, int *, complex *, int *, complex *, complex *
+ , int *, ftnlen);
+ static float sfmin;
+ extern /* Subroutine */ int ccopy_(int *, complex *, int *,
+ complex *, int *);
+ static int itemp;
+ extern /* Subroutine */ int cswap_(int *, complex *, int *,
+ complex *, int *);
+ static int kstep;
+ static float stemp, absakk;
+ extern int icamax_(int *, complex *, int *);
+ extern double slamch_(char *, ftnlen);
+ static float colmax, rowmax;
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+ w_dim1 = *ldw;
+ w_offset = 1 + w_dim1;
+ w -= w_offset;
+
+ /* Function Body */
+ *info = 0;
+ alpha = (sqrt(17.f) + 1.f) / 8.f;
+ sfmin = slamch_("S", (ftnlen)1);
+ if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+ k = *n;
+L10:
+ kw = *nb + k - *n;
+ if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
+ goto L30;
+ }
+ kstep = 1;
+ p = k;
+ ccopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
+ if (k < *n) {
+ i__1 = *n - k;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
+ lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
+ w_dim1 + 1], &c__1, (ftnlen)12);
+ }
+ i__1 = k + kw * w_dim1;
+ absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + kw *
+ w_dim1]), dabs(r__2));
+ if (k > 1) {
+ i__1 = k - 1;
+ imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+ i__1 = imax + kw * w_dim1;
+ colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ + kw * w_dim1]), dabs(r__2));
+ } else {
+ colmax = 0.f;
+ }
+ if (dmax(absakk,colmax) == 0.f) {
+ if (*info == 0) {
+ *info = k;
+ }
+ kp = k;
+ ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
+ } else {
+ if (! (absakk < alpha * colmax)) {
+ kp = k;
+ } else {
+ done = FALSE_;
+L12:
+ ccopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
+ w_dim1 + 1], &c__1);
+ i__1 = k - imax;
+ ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
+ 1 + (kw - 1) * w_dim1], &c__1);
+ if (k < *n) {
+ i__1 = *n - k;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
+ a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
+ ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
+ ftnlen)12);
+ }
+ if (imax != k) {
+ i__1 = k - imax;
+ jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) *
+ w_dim1], &c__1);
+ i__1 = jmax + (kw - 1) * w_dim1;
+ rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
+ w[jmax + (kw - 1) * w_dim1]), dabs(r__2));
+ } else {
+ rowmax = 0.f;
+ }
+ if (imax > 1) {
+ i__1 = imax - 1;
+ itemp = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
+ i__1 = itemp + (kw - 1) * w_dim1;
+ stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
+ w[itemp + (kw - 1) * w_dim1]), dabs(r__2));
+ if (stemp > rowmax) {
+ rowmax = stemp;
+ jmax = itemp;
+ }
+ }
+ i__1 = imax + (kw - 1) * w_dim1;
+ if (! ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
+ imax + (kw - 1) * w_dim1]), dabs(r__2)) < alpha *
+ rowmax)) {
+ kp = imax;
+ ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
+ w_dim1 + 1], &c__1);
+ done = TRUE_;
+ } else if (p == jmax || rowmax <= colmax) {
+ kp = imax;
+ kstep = 2;
+ done = TRUE_;
+ } else {
+ p = imax;
+ colmax = rowmax;
+ imax = jmax;
+ ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
+ w_dim1 + 1], &c__1);
+ }
+ if (! done) {
+ goto L12;
+ }
+ }
+ kk = k - kstep + 1;
+ kkw = *nb + kk - *n;
+ if (kstep == 2 && p != k) {
+ i__1 = k - p;
+ ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) *
+ a_dim1], lda);
+ ccopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], &
+ c__1);
+ i__1 = *n - k + 1;
+ cswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1],
+ lda);
+ i__1 = *n - kk + 1;
+ cswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1],
+ ldw);
+ }
+ if (kp != kk) {
+ i__1 = kp + k * a_dim1;
+ i__2 = kk + k * a_dim1;
+ a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
+ i__1 = k - 1 - kp;
+ ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
+ 1) * a_dim1], lda);
+ ccopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], &
+ c__1);
+ i__1 = *n - kk + 1;
+ cswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1],
+ lda);
+ i__1 = *n - kk + 1;
+ cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
+ w_dim1], ldw);
+ }
+ if (kstep == 1) {
+ ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
+ c__1);
+ if (k > 1) {
+ i__1 = k + k * a_dim1;
+ if ((r__1 = a[i__1].r, dabs(r__1)) + (r__2 = r_imag(&a[k
+ + k * a_dim1]), dabs(r__2)) >= sfmin) {
+ c_div(&q__1, &c_b1, &a[k + k * a_dim1]);
+ r1.r = q__1.r, r1.i = q__1.i;
+ i__1 = k - 1;
+ cscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
+ } else /* if(complicated condition) */ {
+ i__1 = k + k * a_dim1;
+ if (a[i__1].r != 0.f || a[i__1].i != 0.f) {
+ i__1 = k - 1;
+ for (ii = 1; ii <= i__1; ++ii) {
+ i__2 = ii + k * a_dim1;
+ c_div(&q__1, &a[ii + k * a_dim1], &a[k + k *
+ a_dim1]);
+ a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+/* L14: */
+ }
+ }
+ }
+ }
+ } else {
+ if (k > 2) {
+ i__1 = k - 1 + kw * w_dim1;
+ d12.r = w[i__1].r, d12.i = w[i__1].i;
+ c_div(&q__1, &w[k + kw * w_dim1], &d12);
+ d11.r = q__1.r, d11.i = q__1.i;
+ c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d12);
+ d22.r = q__1.r, d22.i = q__1.i;
+ q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r *
+ d22.i + d11.i * d22.r;
+ q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
+ c_div(&q__1, &c_b1, &q__2);
+ t.r = q__1.r, t.i = q__1.i;
+ i__1 = k - 2;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j + (k - 1) * a_dim1;
+ i__3 = j + (kw - 1) * w_dim1;
+ q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
+ q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
+ .r;
+ i__4 = j + kw * w_dim1;
+ q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
+ .i;
+ c_div(&q__2, &q__3, &d12);
+ q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r *
+ q__2.i + t.i * q__2.r;
+ a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+ i__2 = j + k * a_dim1;
+ i__3 = j + kw * w_dim1;
+ q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
+ q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
+ .r;
+ i__4 = j + (kw - 1) * w_dim1;
+ q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
+ .i;
+ c_div(&q__2, &q__3, &d12);
+ q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r *
+ q__2.i + t.i * q__2.r;
+ a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+/* L20: */
+ }
+ }
+ i__1 = k - 1 + (k - 1) * a_dim1;
+ i__2 = k - 1 + (kw - 1) * w_dim1;
+ a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+ i__1 = k - 1 + k * a_dim1;
+ i__2 = k - 1 + kw * w_dim1;
+ a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+ i__1 = k + k * a_dim1;
+ i__2 = k + kw * w_dim1;
+ a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+ }
+ }
+ if (kstep == 1) {
+ ipiv[k] = kp;
+ } else {
+ ipiv[k] = -p;
+ ipiv[k - 1] = -kp;
+ }
+ k -= kstep;
+ goto L10;
+L30:
+ j = k + 1;
+L60:
+ kstep = 1;
+ jp1 = 1;
+ jj = j;
+ jp2 = ipiv[j];
+ if (jp2 < 0) {
+ jp2 = -jp2;
+ ++j;
+ jp1 = -ipiv[j];
+ kstep = 2;
+ }
+ ++j;
+ if (jp2 != jj && j <= *n) {
+ i__1 = *n - j + 1;
+ cswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
+ ;
+ }
+ jj = j - 1;
+ if (jp1 != jj && kstep == 2) {
+ i__1 = *n - j + 1;
+ cswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
+ ;
+ }
+ if (j <= *n) {
+ goto L60;
+ }
+ *kb = *n - k;
+ } else {
+ k = 1;
+L70:
+ if ((k >= *nb && *nb < *n) || k > *n) {
+ goto L90;
+ }
+ kstep = 1;
+ p = k;
+ i__1 = *n - k + 1;
+ ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
+ if (k > 1) {
+ i__1 = *n - k + 1;
+ i__2 = k - 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &
+ w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (
+ ftnlen)12);
+ }
+ i__1 = k + k * w_dim1;
+ absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + k *
+ w_dim1]), dabs(r__2));
+ if (k < *n) {
+ i__1 = *n - k;
+ imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
+ i__1 = imax + k * w_dim1;
+ colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+ + k * w_dim1]), dabs(r__2));
+ } else {
+ colmax = 0.f;
+ }
+ if (dmax(absakk,colmax) == 0.f) {
+ if (*info == 0) {
+ *info = k;
+ }
+ kp = k;
+ i__1 = *n - k + 1;
+ ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
+ c__1);
+ } else {
+ if (! (absakk < alpha * colmax)) {
+ kp = k;
+ } else {
+ done = FALSE_;
+L72:
+ i__1 = imax - k;
+ ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
+ w_dim1], &c__1);
+ i__1 = *n - imax + 1;
+ ccopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
+ 1) * w_dim1], &c__1);
+ if (k > 1) {
+ i__1 = *n - k + 1;
+ i__2 = k - 1;
+ q__1.r = -1.f, q__1.i = -0.f;
+ cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1]
+ , lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k +
+ 1) * w_dim1], &c__1, (ftnlen)12);
+ }
+ if (imax != k) {
+ i__1 = imax - k;
+ jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &
+ c__1);
+ i__1 = jmax + (k + 1) * w_dim1;
+ rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
+ w[jmax + (k + 1) * w_dim1]), dabs(r__2));
+ } else {
+ rowmax = 0.f;
+ }
+ if (imax < *n) {
+ i__1 = *n - imax;
+ itemp = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) *
+ w_dim1], &c__1);
+ i__1 = itemp + (k + 1) * w_dim1;
+ stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&
+ w[itemp + (k + 1) * w_dim1]), dabs(r__2));
+ if (stemp > rowmax) {
+ rowmax = stemp;
+ jmax = itemp;
+ }
+ }
+ i__1 = imax + (k + 1) * w_dim1;
+ if (! ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
+ imax + (k + 1) * w_dim1]), dabs(r__2)) < alpha *
+ rowmax)) {
+ kp = imax;
+ i__1 = *n - k + 1;
+ ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
+ w_dim1], &c__1);
+ done = TRUE_;
+ } else if (p == jmax || rowmax <= colmax) {
+ kp = imax;
+ kstep = 2;
+ done = TRUE_;
+ } else {
+ p = imax;
+ colmax = rowmax;
+ imax = jmax;
+ i__1 = *n - k + 1;
+ ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
+ w_dim1], &c__1);
+ }
+ if (! done) {
+ goto L72;
+ }
+ }
+ kk = k + kstep - 1;
+ if (kstep == 2 && p != k) {
+ i__1 = p - k;
+ ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1],
+ lda);
+ i__1 = *n - p + 1;
+ ccopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], &
+ c__1);
+ cswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda);
+ cswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw);
+ }
+ if (kp != kk) {
+ i__1 = kp + k * a_dim1;
+ i__2 = kk + k * a_dim1;
+ a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
+ i__1 = kp - k - 1;
+ ccopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1)
+ * a_dim1], lda);
+ i__1 = *n - kp + 1;
+ ccopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp *
+ a_dim1], &c__1);
+ cswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
+ cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
+ }
+ if (kstep == 1) {
+ i__1 = *n - k + 1;
+ ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
+ c__1);
+ if (k < *n) {
+ i__1 = k + k * a_dim1;
+ if ((r__1 = a[i__1].r, dabs(r__1)) + (r__2 = r_imag(&a[k
+ + k * a_dim1]), dabs(r__2)) >= sfmin) {
+ c_div(&q__1, &c_b1, &a[k + k * a_dim1]);
+ r1.r = q__1.r, r1.i = q__1.i;
+ i__1 = *n - k;
+ cscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
+ } else /* if(complicated condition) */ {
+ i__1 = k + k * a_dim1;
+ if (a[i__1].r != 0.f || a[i__1].i != 0.f) {
+ i__1 = *n;
+ for (ii = k + 1; ii <= i__1; ++ii) {
+ i__2 = ii + k * a_dim1;
+ c_div(&q__1, &a[ii + k * a_dim1], &a[k + k *
+ a_dim1]);
+ a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+/* L74: */
+ }
+ }
+ }
+ }
+ } else {
+ if (k < *n - 1) {
+ i__1 = k + 1 + k * w_dim1;
+ d21.r = w[i__1].r, d21.i = w[i__1].i;
+ c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
+ d11.r = q__1.r, d11.i = q__1.i;
+ c_div(&q__1, &w[k + k * w_dim1], &d21);
+ d22.r = q__1.r, d22.i = q__1.i;
+ q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r *
+ d22.i + d11.i * d22.r;
+ q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
+ c_div(&q__1, &c_b1, &q__2);
+ t.r = q__1.r, t.i = q__1.i;
+ i__1 = *n;
+ for (j = k + 2; j <= i__1; ++j) {
+ i__2 = j + k * a_dim1;
+ i__3 = j + k * w_dim1;
+ q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
+ q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
+ .r;
+ i__4 = j + (k + 1) * w_dim1;
+ q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
+ .i;
+ c_div(&q__2, &q__3, &d21);
+ q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r *
+ q__2.i + t.i * q__2.r;
+ a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+ i__2 = j + (k + 1) * a_dim1;
+ i__3 = j + (k + 1) * w_dim1;
+ q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
+ q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
+ .r;
+ i__4 = j + k * w_dim1;
+ q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
+ .i;
+ c_div(&q__2, &q__3, &d21);
+ q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r *
+ q__2.i + t.i * q__2.r;
+ a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+/* L80: */
+ }
+ }
+ i__1 = k + k * a_dim1;
+ i__2 = k + k * w_dim1;
+ a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+ i__1 = k + 1 + k * a_dim1;
+ i__2 = k + 1 + k * w_dim1;
+ a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+ i__1 = k + 1 + (k + 1) * a_dim1;
+ i__2 = k + 1 + (k + 1) * w_dim1;
+ a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+ }
+ }
+ if (kstep == 1) {
+ ipiv[k] = kp;
+ } else {
+ ipiv[k] = -p;
+ ipiv[k + 1] = -kp;
+ }
+ k += kstep;
+ goto L70;
+L90:
+ j = k - 1;
+L120:
+ kstep = 1;
+ jp1 = 1;
+ jj = j;
+ jp2 = ipiv[j];
+ if (jp2 < 0) {
+ jp2 = -jp2;
+ --j;
+ jp1 = -ipiv[j];
+ kstep = 2;
+ }
+ --j;
+ if (jp2 != jj && j >= 1) {
+ cswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
+ }
+ jj = j + 1;
+ if (jp1 != jj && kstep == 2) {
+ cswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
+ }
+ if (j >= 1) {
+ goto L120;
+ }
+ *kb = k - 1;
+ }
+ return;
+}
diff --git a/relapack/src/ctgsyl.c b/relapack/src/ctgsyl.c
new file mode 100644
index 000000000..15c738baf
--- /dev/null
+++ b/relapack/src/ctgsyl.c
@@ -0,0 +1,268 @@
+#include "relapack.h"
+#include <math.h>
+
+static void RELAPACK_ctgsyl_rec(const char *, const int *, const int *,
+ const int *, const float *, const int *, const float *, const int *,
+ float *, const int *, const float *, const int *, const float *,
+ const int *, float *, const int *, float *, float *, float *, int *);
+
+
+/** CTGSYL solves the generalized Sylvester equation.
+ *
+ * This routine is functionally equivalent to LAPACK's ctgsyl.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d7/de7/ctgsyl_8f.html
+ * */
+void RELAPACK_ctgsyl(
+ const char *trans, const int *ijob, const int *m, const int *n,
+ const float *A, const int *ldA, const float *B, const int *ldB,
+ float *C, const int *ldC,
+ const float *D, const int *ldD, const float *E, const int *ldE,
+ float *F, const int *ldF,
+ float *scale, float *dif,
+ float *Work, const int *lWork, int *iWork, int *info
+) {
+
+ // Parse arguments
+ const int notran = LAPACK(lsame)(trans, "N");
+ const int tran = LAPACK(lsame)(trans, "C");
+
+ // Compute work buffer size
+ int lwmin = 1;
+ if (notran && (*ijob == 1 || *ijob == 2))
+ lwmin = MAX(1, 2 * *m * *n);
+ *info = 0;
+
+ // Check arguments
+ if (!tran && !notran)
+ *info = -1;
+ else if (notran && (*ijob < 0 || *ijob > 4))
+ *info = -2;
+ else if (*m <= 0)
+ *info = -3;
+ else if (*n <= 0)
+ *info = -4;
+ else if (*ldA < MAX(1, *m))
+ *info = -6;
+ else if (*ldB < MAX(1, *n))
+ *info = -8;
+ else if (*ldC < MAX(1, *m))
+ *info = -10;
+ else if (*ldD < MAX(1, *m))
+ *info = -12;
+ else if (*ldE < MAX(1, *n))
+ *info = -14;
+ else if (*ldF < MAX(1, *m))
+ *info = -16;
+ else if (*lWork < lwmin && *lWork != -1)
+ *info = -20;
+ if (*info) {
+ const int minfo = -*info;
+ LAPACK(xerbla)("CTGSYL", &minfo);
+ return;
+ }
+
+ if (*lWork == -1) {
+ // Work size query
+ *Work = lwmin;
+ return;
+ }
+
+ // Clean char * arguments
+ const char cleantrans = notran ? 'N' : 'C';
+
+ // Constant
+ const float ZERO[] = { 0., 0. };
+
+ int isolve = 1;
+ int ifunc = 0;
+ if (notran) {
+ if (*ijob >= 3) {
+ ifunc = *ijob - 2;
+ LAPACK(claset)("F", m, n, ZERO, ZERO, C, ldC);
+ LAPACK(claset)("F", m, n, ZERO, ZERO, F, ldF);
+ } else if (*ijob >= 1)
+ isolve = 2;
+ }
+
+ float scale2;
+ int iround;
+ for (iround = 1; iround <= isolve; iround++) {
+ *scale = 1;
+ float dscale = 0;
+ float dsum = 1;
+ RELAPACK_ctgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, info);
+ if (dscale != 0) {
+ if (*ijob == 1 || *ijob == 3)
+ *dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum));
+ else
+ *dif = sqrt(*m * *n) / (dscale * sqrt(dsum));
+ }
+ if (isolve == 2) {
+ if (iround == 1) {
+ if (notran)
+ ifunc = *ijob;
+ scale2 = *scale;
+ LAPACK(clacpy)("F", m, n, C, ldC, Work, m);
+ LAPACK(clacpy)("F", m, n, F, ldF, Work + 2 * *m * *n, m);
+ LAPACK(claset)("F", m, n, ZERO, ZERO, C, ldC);
+ LAPACK(claset)("F", m, n, ZERO, ZERO, F, ldF);
+ } else {
+ LAPACK(clacpy)("F", m, n, Work, m, C, ldC);
+ LAPACK(clacpy)("F", m, n, Work + 2 * *m * *n, m, F, ldF);
+ *scale = scale2;
+ }
+ }
+ }
+}
+
+
+/** ctgsyl's recursive compute kernel */
+static void RELAPACK_ctgsyl_rec(
+ const char *trans, const int *ifunc, const int *m, const int *n,
+ const float *A, const int *ldA, const float *B, const int *ldB,
+ float *C, const int *ldC,
+ const float *D, const int *ldD, const float *E, const int *ldE,
+ float *F, const int *ldF,
+ float *scale, float *dsum, float *dscale,
+ int *info
+) {
+
+ if (*m <= MAX(CROSSOVER_CTGSYL, 1) && *n <= MAX(CROSSOVER_CTGSYL, 1)) {
+ // Unblocked
+ LAPACK(ctgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, info);
+ return;
+ }
+
+ // Constants
+ const float ONE[] = { 1., 0. };
+ const float MONE[] = { -1., 0. };
+ const int iONE[] = { 1 };
+
+ // Outputs
+ float scale1[] = { 1., 0. };
+ float scale2[] = { 1., 0. };
+ int info1[] = { 0 };
+ int info2[] = { 0 };
+
+ if (*m > *n) {
+ // Splitting
+ const int m1 = CREC_SPLIT(*m);
+ const int m2 = *m - m1;
+
+ // A_TL A_TR
+ // 0 A_BR
+ const float *const A_TL = A;
+ const float *const A_TR = A + 2 * *ldA * m1;
+ const float *const A_BR = A + 2 * *ldA * m1 + 2 * m1;
+
+ // C_T
+ // C_B
+ float *const C_T = C;
+ float *const C_B = C + 2 * m1;
+
+ // D_TL D_TR
+ // 0 D_BR
+ const float *const D_TL = D;
+ const float *const D_TR = D + 2 * *ldD * m1;
+ const float *const D_BR = D + 2 * *ldD * m1 + 2 * m1;
+
+ // F_T
+ // F_B
+ float *const F_T = F;
+ float *const F_B = F + 2 * m1;
+
+ if (*trans == 'N') {
+ // recursion(A_BR, B, C_B, D_BR, E, F_B)
+ RELAPACK_ctgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, info1);
+ // C_T = C_T - A_TR * C_B
+ BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
+ // F_T = F_T - D_TR * C_B
+ BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF);
+ // recursion(A_TL, B, C_T, D_TL, E, F_T)
+ RELAPACK_ctgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, info2);
+ // apply scale
+ if (scale2[0] != 1) {
+ LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
+ LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info);
+ }
+ } else {
+ // recursion(A_TL, B, C_T, D_TL, E, F_T)
+ RELAPACK_ctgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, info1);
+ // apply scale
+ if (scale1[0] != 1)
+ LAPACK(clascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info);
+ // C_B = C_B - A_TR^H * C_T
+ BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
+ // C_B = C_B - D_TR^H * F_T
+ BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC);
+ // recursion(A_BR, B, C_B, D_BR, E, F_B)
+ RELAPACK_ctgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, info2);
+ // apply scale
+ if (scale2[0] != 1) {
+ LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
+ LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info);
+ }
+ }
+ } else {
+ // Splitting
+ const int n1 = CREC_SPLIT(*n);
+ const int n2 = *n - n1;
+
+ // B_TL B_TR
+ // 0 B_BR
+ const float *const B_TL = B;
+ const float *const B_TR = B + 2 * *ldB * n1;
+ const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
+
+ // C_L C_R
+ float *const C_L = C;
+ float *const C_R = C + 2 * *ldC * n1;
+
+ // E_TL E_TR
+ // 0 E_BR
+ const float *const E_TL = E;
+ const float *const E_TR = E + 2 * *ldE * n1;
+ const float *const E_BR = E + 2 * *ldE * n1 + 2 * n1;
+
+ // F_L F_R
+ float *const F_L = F;
+ float *const F_R = F + 2 * *ldF * n1;
+
+ if (*trans == 'N') {
+ // recursion(A, B_TL, C_L, D, E_TL, F_L)
+ RELAPACK_ctgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, info1);
+ // C_R = C_R + F_L * B_TR
+ BLAS(cgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC);
+ // F_R = F_R + F_L * E_TR
+ BLAS(cgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF);
+ // recursion(A, B_BR, C_R, D, E_BR, F_R)
+ RELAPACK_ctgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, info2);
+ // apply scale
+ if (scale2[0] != 1) {
+ LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
+ LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info);
+ }
+ } else {
+ // recursion(A, B_BR, C_R, D, E_BR, F_R)
+ RELAPACK_ctgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, info1);
+ // apply scale
+ if (scale1[0] != 1)
+ LAPACK(clascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info);
+ // F_L = F_L + C_R * B_TR
+ BLAS(cgemm)("N", "C", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF);
+ // F_L = F_L + F_R * E_TR
+ BLAS(cgemm)("N", "C", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF);
+ // recursion(A, B_TL, C_L, D, E_TL, F_L)
+ RELAPACK_ctgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, info2);
+ // apply scale
+ if (scale2[0] != 1) {
+ LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
+ LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info);
+ }
+ }
+ }
+
+ *scale = scale1[0] * scale2[0];
+ *info = info1[0] || info2[0];
+}
diff --git a/relapack/src/ctrsyl.c b/relapack/src/ctrsyl.c
new file mode 100644
index 000000000..b548d5354
--- /dev/null
+++ b/relapack/src/ctrsyl.c
@@ -0,0 +1,163 @@
+#include "relapack.h"
+
+static void RELAPACK_ctrsyl_rec(const char *, const char *, const int *,
+ const int *, const int *, const float *, const int *, const float *,
+ const int *, float *, const int *, float *, int *);
+
+
+/** CTRSYL solves the complex Sylvester matrix equation.
+ *
+ * This routine is functionally equivalent to LAPACK's ctrsyl.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d8/df4/ctrsyl_8f.html
+ * */
+void RELAPACK_ctrsyl(
+    const char *tranA, const char *tranB, const int *isgn,
+    const int *m, const int *n,
+    const float *A, const int *ldA, const float *B, const int *ldB,
+    float *C, const int *ldC, float *scale,
+    int *info
+) {
+
+    // Decode the transposition flags ('N' = no transpose, 'C' = conjugate transpose)
+    const int notransA = LAPACK(lsame)(tranA, "N");
+    const int ctransA = LAPACK(lsame)(tranA, "C");
+    const int notransB = LAPACK(lsame)(tranB, "N");
+    const int ctransB = LAPACK(lsame)(tranB, "C");
+
+    // Validate the arguments (error codes follow LAPACK's ctrsyl)
+    *info = 0;
+    if (!notransA && !ctransA)
+        *info = -1;
+    else if (!notransB && !ctransB)
+        *info = -2;
+    else if (*isgn != 1 && *isgn != -1)
+        *info = -3;
+    else if (*m < 0)
+        *info = -4;
+    else if (*n < 0)
+        *info = -5;
+    else if (*ldA < MAX(1, *m))
+        *info = -7;
+    else if (*ldB < MAX(1, *n))
+        *info = -9;
+    else if (*ldC < MAX(1, *m))
+        *info = -11;
+    if (*info != 0) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("CTRSYL", &minfo);
+        return;
+    }
+
+    // Canonicalize the flag characters for the recursive kernel
+    const char cleantranA = notransA ? 'N' : 'C';
+    const char cleantranB = notransB ? 'N' : 'C';
+
+    // Invoke the recursive kernel
+    RELAPACK_ctrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
+}
+
+
+/** ctrsyl's recursive compute kernel
+ *
+ * The equation is split along the larger of the two dimensions m and n.
+ * One half is solved recursively, the coupling term is applied to the
+ * other half via cgemm, the second half is solved recursively, and the
+ * scaling factors of the two sub-solves are consolidated at the end.
+ * */
+static void RELAPACK_ctrsyl_rec(
+    const char *tranA, const char *tranB, const int *isgn,
+    const int *m, const int *n,
+    const float *A, const int *ldA, const float *B, const int *ldB,
+    float *C, const int *ldC, float *scale,
+    int *info
+) {
+
+    if (*m <= MAX(CROSSOVER_CTRSYL, 1) && *n <= MAX(CROSSOVER_CTRSYL, 1)) {
+        // Unblocked
+        RELAPACK_ctrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
+        return;
+    }
+
+    // Constants
+    const float ONE[] = { 1., 0. };
+    const float MONE[] = { -1., 0. };
+    const float MSGN[] = { -*isgn, 0. };
+    const int iONE[] = { 1 };
+
+    // Outputs
+    float scale1[] = { 1., 0. };
+    float scale2[] = { 1., 0. };
+    int info1[] = { 0 };
+    int info2[] = { 0 };
+
+    if (*m > *n) {
+        // Splitting
+        const int m1 = CREC_SPLIT(*m);
+        const int m2 = *m - m1;
+
+        // A_TL A_TR
+        // 0    A_BR
+        const float *const A_TL = A;
+        const float *const A_TR = A + 2 * *ldA * m1;
+        const float *const A_BR = A + 2 * *ldA * m1 + 2 * m1;
+
+        // C_T
+        // C_B
+        float *const C_T = C;
+        float *const C_B = C + 2 * m1;
+
+        if (*tranA == 'N') {
+            // recursion(A_BR, B, C_B)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1);
+            // C_T = C_T - A_TR * C_B
+            BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
+            // recursion(A_TL, B, C_T)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
+        } else {
+            // recursion(A_TL, B, C_T)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1);
+            // C_B = C_B - A_TR' * C_T
+            BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
+            // recursion(A_BR, B, C_B)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2);
+            // apply scale to the already solved top block C_T (m1 rows);
+            // the second recursion has already accounted for scale2 in C_B.
+            // (BUGFIX: the previous code rescaled C_B here, and with the
+            // mismatched dimensions &m1 x n.)
+            if (scale2[0] != 1)
+                LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
+        }
+    } else {
+        // Splitting
+        const int n1 = CREC_SPLIT(*n);
+        const int n2 = *n - n1;
+
+        // B_TL B_TR
+        // 0    B_BR
+        const float *const B_TL = B;
+        const float *const B_TR = B + 2 * *ldB * n1;
+        const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
+
+        // C_L C_R
+        float *const C_L = C;
+        float *const C_R = C + 2 * *ldC * n1;
+
+        if (*tranB == 'N') {
+            // recursion(A, B_TL, C_L)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1);
+            // C_R = C_R -/+ C_L * B_TR
+            BLAS(cgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC);
+            // recursion(A, B_BR, C_R)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
+        } else {
+            // recursion(A, B_BR, C_R)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1);
+            // C_L = C_L -/+ C_R * B_TR'
+            BLAS(cgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC);
+            // recursion(A, B_TL, C_L)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
+        }
+    }
+
+    // Combined scaling factor and error flag of the two sub-solves
+    *scale = scale1[0] * scale2[0];
+    *info = info1[0] || info2[0];
+}
diff --git a/relapack/src/ctrsyl_rec2.c b/relapack/src/ctrsyl_rec2.c
new file mode 100644
index 000000000..518574868
--- /dev/null
+++ b/relapack/src/ctrsyl_rec2.c
@@ -0,0 +1,392 @@
+/* -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "../config.h"
+#include "f2c.h"
+
+#if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES
+// Some BLAS binaries implement complex-valued "functions" as subroutines that
+// return the result through a hidden first argument. These wrappers adapt
+// that calling convention back to a by-value return, and the #defines below
+// redirect the f2c-generated calls in this file to the wrappers.
+complex cdotu_fun(int *n, complex *x, int *incx, complex *y, int *incy) {
+    extern void cdotu_(complex *, int *, complex *, int *, complex *, int *);
+    complex result;
+    cdotu_(&result, n, x, incx, y, incy);
+    return result;
+}
+#define cdotu_ cdotu_fun
+
+complex cdotc_fun(int *n, complex *x, int *incx, complex *y, int *incy) {
+    extern void cdotc_(complex *, int *, complex *, int *, complex *, int *);
+    complex result;
+    cdotc_(&result, n, x, incx, y, incy);
+    return result;
+}
+#define cdotc_ cdotc_fun
+#endif
+
+#if LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES
+// Same adaptation for LAPACK's complex division helper cladiv.
+complex cladiv_fun(complex *a, complex *b) {
+    extern void cladiv_(complex *, complex *, complex *);
+    complex result;
+    cladiv_(&result, a, b);
+    return result;
+}
+#define cladiv_ cladiv_fun
+#endif
+
+/* Table of constant values */
+
+static int c__1 = 1;
+
+/** RELAPACK_CTRSYL_REC2 solves the complex Sylvester matrix equation (unblocked algorithm)
+ *
+ * This routine is an exact copy of LAPACK's ctrsyl.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * */
+/* Subroutine */ void RELAPACK_ctrsyl_rec2(char *trana, char *tranb, int
+	*isgn, int *m, int *n, complex *a, int *lda, complex *b,
+	int *ldb, complex *c__, int *ldc, float *scale, int *info,
+	ftnlen trana_len, ftnlen tranb_len)
+{
+    /* System generated locals */
+    int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
+	    i__3, i__4;
+    float r__1, r__2;
+    complex q__1, q__2, q__3, q__4;
+
+    /* Builtin functions */
+    float r_imag(complex *);
+    void r_cnjg(complex *, complex *);
+
+    /* Local variables */
+    /* NOTE(review): f2c kept these locals "static"; the kernel is therefore
+       not reentrant and not thread-safe -- confirm before calling it from
+       multiple threads concurrently. */
+    static int j, k, l;
+    static complex a11;
+    static float db;
+    static complex x11;
+    static float da11;
+    static complex vec;
+    static float dum[1], eps, sgn, smin;
+    static complex suml, sumr;
+    /* Complex */ complex cdotc_(int *, complex *, int
+	    *, complex *, int *);
+    extern int lsame_(char *, char *, ftnlen, ftnlen);
+    /* Complex */ complex cdotu_(int *, complex *, int
+	    *, complex *, int *);
+    extern /* Subroutine */ int slabad_(float *, float *);
+    extern float clange_(char *, int *, int *, complex *,
+	    int *, float *, ftnlen);
+    /* Complex */ complex cladiv_(complex *, complex *);
+    static float scaloc;
+    extern float slamch_(char *, ftnlen);
+    extern /* Subroutine */ int csscal_(int *, float *, complex *, int
+	    *), xerbla_(char *, int *, ftnlen);
+    static float bignum;
+    static int notrna, notrnb;
+    static float smlnum;
+
+    /* Parameter adjustments */
+    /* Shift the base pointers so the 1-based Fortran indexing of the
+       translated code works unchanged. */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1;
+    b -= b_offset;
+    c_dim1 = *ldc;
+    c_offset = 1 + c_dim1;
+    c__ -= c_offset;
+
+    /* Function Body */
+    notrna = lsame_(trana, "N", (ftnlen)1, (ftnlen)1);
+    notrnb = lsame_(tranb, "N", (ftnlen)1, (ftnlen)1);
+    *info = 0;
+    if (! notrna && ! lsame_(trana, "C", (ftnlen)1, (ftnlen)1)) {
+	*info = -1;
+    } else if (! notrnb && ! lsame_(tranb, "C", (ftnlen)1, (ftnlen)1)) {
+	*info = -2;
+    } else if (*isgn != 1 && *isgn != -1) {
+	*info = -3;
+    } else if (*m < 0) {
+	*info = -4;
+    } else if (*n < 0) {
+	*info = -5;
+    } else if (*lda < max(1,*m)) {
+	*info = -7;
+    } else if (*ldb < max(1,*n)) {
+	*info = -9;
+    } else if (*ldc < max(1,*m)) {
+	*info = -11;
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("CTRSY2", &i__1, (ftnlen)6);
+	return;
+    }
+    *scale = 1.f;
+    if (*m == 0 || *n == 0) {
+	return;
+    }
+    /* Set up machine constants and the minimum safe diagonal magnitude. */
+    eps = slamch_("P", (ftnlen)1);
+    smlnum = slamch_("S", (ftnlen)1);
+    bignum = 1.f / smlnum;
+    slabad_(&smlnum, &bignum);
+    smlnum = smlnum * (float) (*m * *n) / eps;
+    bignum = 1.f / smlnum;
+/* Computing MAX */
+    r__1 = smlnum, r__2 = eps * clange_("M", m, m, &a[a_offset], lda, dum, (
+	    ftnlen)1), r__1 = max(r__1,r__2), r__2 = eps * clange_("M", n, n,
+	    &b[b_offset], ldb, dum, (ftnlen)1);
+    smin = dmax(r__1,r__2);
+    sgn = (float) (*isgn);
+    /* Case 1: neither A nor B transposed. */
+    if (notrna && notrnb) {
+	i__1 = *n;
+	for (l = 1; l <= i__1; ++l) {
+	    for (k = *m; k >= 1; --k) {
+		i__2 = *m - k;
+/* Computing MIN */
+		i__3 = k + 1;
+/* Computing MIN */
+		i__4 = k + 1;
+		q__1 = cdotu_(&i__2, &a[k + min(i__3,*m) * a_dim1], lda, &c__[
+			min(i__4,*m) + l * c_dim1], &c__1);
+		suml.r = q__1.r, suml.i = q__1.i;
+		i__2 = l - 1;
+		q__1 = cdotu_(&i__2, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1]
+			, &c__1);
+		sumr.r = q__1.r, sumr.i = q__1.i;
+		i__2 = k + l * c_dim1;
+		q__3.r = sgn * sumr.r, q__3.i = sgn * sumr.i;
+		q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
+		q__1.r = c__[i__2].r - q__2.r, q__1.i = c__[i__2].i - q__2.i;
+		vec.r = q__1.r, vec.i = q__1.i;
+		scaloc = 1.f;
+		i__2 = k + k * a_dim1;
+		i__3 = l + l * b_dim1;
+		q__2.r = sgn * b[i__3].r, q__2.i = sgn * b[i__3].i;
+		q__1.r = a[i__2].r + q__2.r, q__1.i = a[i__2].i + q__2.i;
+		a11.r = q__1.r, a11.i = q__1.i;
+		da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
+			dabs(r__2));
+		if (da11 <= smin) {
+		    /* Near-singular pivot: perturb it and flag info = 1. */
+		    a11.r = smin, a11.i = 0.f;
+		    da11 = smin;
+		    *info = 1;
+		}
+		db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
+			r__2));
+		if (da11 < 1.f && db > 1.f) {
+		    if (db > bignum * da11) {
+			scaloc = 1.f / db;
+		    }
+		}
+		q__3.r = scaloc, q__3.i = 0.f;
+		q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
+			q__3.i + vec.i * q__3.r;
+		q__1 = cladiv_(&q__2, &a11);
+		x11.r = q__1.r, x11.i = q__1.i;
+		if (scaloc != 1.f) {
+		    i__2 = *n;
+		    for (j = 1; j <= i__2; ++j) {
+			csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L10: */
+		    }
+		    *scale *= scaloc;
+		}
+		i__2 = k + l * c_dim1;
+		c__[i__2].r = x11.r, c__[i__2].i = x11.i;
+/* L20: */
+	    }
+/* L30: */
+	}
+    /* Case 2: A conjugate-transposed, B not transposed. */
+    } else if (! notrna && notrnb) {
+	i__1 = *n;
+	for (l = 1; l <= i__1; ++l) {
+	    i__2 = *m;
+	    for (k = 1; k <= i__2; ++k) {
+		i__3 = k - 1;
+		q__1 = cdotc_(&i__3, &a[k * a_dim1 + 1], &c__1, &c__[l *
+			c_dim1 + 1], &c__1);
+		suml.r = q__1.r, suml.i = q__1.i;
+		i__3 = l - 1;
+		q__1 = cdotu_(&i__3, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1]
+			, &c__1);
+		sumr.r = q__1.r, sumr.i = q__1.i;
+		i__3 = k + l * c_dim1;
+		q__3.r = sgn * sumr.r, q__3.i = sgn * sumr.i;
+		q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
+		q__1.r = c__[i__3].r - q__2.r, q__1.i = c__[i__3].i - q__2.i;
+		vec.r = q__1.r, vec.i = q__1.i;
+		scaloc = 1.f;
+		r_cnjg(&q__2, &a[k + k * a_dim1]);
+		i__3 = l + l * b_dim1;
+		q__3.r = sgn * b[i__3].r, q__3.i = sgn * b[i__3].i;
+		q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+		a11.r = q__1.r, a11.i = q__1.i;
+		da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
+			dabs(r__2));
+		if (da11 <= smin) {
+		    a11.r = smin, a11.i = 0.f;
+		    da11 = smin;
+		    *info = 1;
+		}
+		db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
+			r__2));
+		if (da11 < 1.f && db > 1.f) {
+		    if (db > bignum * da11) {
+			scaloc = 1.f / db;
+		    }
+		}
+		q__3.r = scaloc, q__3.i = 0.f;
+		q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
+			q__3.i + vec.i * q__3.r;
+		q__1 = cladiv_(&q__2, &a11);
+		x11.r = q__1.r, x11.i = q__1.i;
+		if (scaloc != 1.f) {
+		    i__3 = *n;
+		    for (j = 1; j <= i__3; ++j) {
+			csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L40: */
+		    }
+		    *scale *= scaloc;
+		}
+		i__3 = k + l * c_dim1;
+		c__[i__3].r = x11.r, c__[i__3].i = x11.i;
+/* L50: */
+	    }
+/* L60: */
+	}
+    /* Case 3: both A and B conjugate-transposed. */
+    } else if (! notrna && ! notrnb) {
+	for (l = *n; l >= 1; --l) {
+	    i__1 = *m;
+	    for (k = 1; k <= i__1; ++k) {
+		i__2 = k - 1;
+		q__1 = cdotc_(&i__2, &a[k * a_dim1 + 1], &c__1, &c__[l *
+			c_dim1 + 1], &c__1);
+		suml.r = q__1.r, suml.i = q__1.i;
+		i__2 = *n - l;
+/* Computing MIN */
+		i__3 = l + 1;
+/* Computing MIN */
+		i__4 = l + 1;
+		q__1 = cdotc_(&i__2, &c__[k + min(i__3,*n) * c_dim1], ldc, &b[
+			l + min(i__4,*n) * b_dim1], ldb);
+		sumr.r = q__1.r, sumr.i = q__1.i;
+		i__2 = k + l * c_dim1;
+		r_cnjg(&q__4, &sumr);
+		q__3.r = sgn * q__4.r, q__3.i = sgn * q__4.i;
+		q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
+		q__1.r = c__[i__2].r - q__2.r, q__1.i = c__[i__2].i - q__2.i;
+		vec.r = q__1.r, vec.i = q__1.i;
+		scaloc = 1.f;
+		i__2 = k + k * a_dim1;
+		i__3 = l + l * b_dim1;
+		q__3.r = sgn * b[i__3].r, q__3.i = sgn * b[i__3].i;
+		q__2.r = a[i__2].r + q__3.r, q__2.i = a[i__2].i + q__3.i;
+		r_cnjg(&q__1, &q__2);
+		a11.r = q__1.r, a11.i = q__1.i;
+		da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
+			dabs(r__2));
+		if (da11 <= smin) {
+		    a11.r = smin, a11.i = 0.f;
+		    da11 = smin;
+		    *info = 1;
+		}
+		db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
+			r__2));
+		if (da11 < 1.f && db > 1.f) {
+		    if (db > bignum * da11) {
+			scaloc = 1.f / db;
+		    }
+		}
+		q__3.r = scaloc, q__3.i = 0.f;
+		q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
+			q__3.i + vec.i * q__3.r;
+		q__1 = cladiv_(&q__2, &a11);
+		x11.r = q__1.r, x11.i = q__1.i;
+		if (scaloc != 1.f) {
+		    i__2 = *n;
+		    for (j = 1; j <= i__2; ++j) {
+			csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L70: */
+		    }
+		    *scale *= scaloc;
+		}
+		i__2 = k + l * c_dim1;
+		c__[i__2].r = x11.r, c__[i__2].i = x11.i;
+/* L80: */
+	    }
+/* L90: */
+	}
+    /* Case 4: A not transposed, B conjugate-transposed. */
+    } else if (notrna && ! notrnb) {
+	for (l = *n; l >= 1; --l) {
+	    for (k = *m; k >= 1; --k) {
+		i__1 = *m - k;
+/* Computing MIN */
+		i__2 = k + 1;
+/* Computing MIN */
+		i__3 = k + 1;
+		q__1 = cdotu_(&i__1, &a[k + min(i__2,*m) * a_dim1], lda, &c__[
+			min(i__3,*m) + l * c_dim1], &c__1);
+		suml.r = q__1.r, suml.i = q__1.i;
+		i__1 = *n - l;
+/* Computing MIN */
+		i__2 = l + 1;
+/* Computing MIN */
+		i__3 = l + 1;
+		q__1 = cdotc_(&i__1, &c__[k + min(i__2,*n) * c_dim1], ldc, &b[
+			l + min(i__3,*n) * b_dim1], ldb);
+		sumr.r = q__1.r, sumr.i = q__1.i;
+		i__1 = k + l * c_dim1;
+		r_cnjg(&q__4, &sumr);
+		q__3.r = sgn * q__4.r, q__3.i = sgn * q__4.i;
+		q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
+		q__1.r = c__[i__1].r - q__2.r, q__1.i = c__[i__1].i - q__2.i;
+		vec.r = q__1.r, vec.i = q__1.i;
+		scaloc = 1.f;
+		i__1 = k + k * a_dim1;
+		r_cnjg(&q__3, &b[l + l * b_dim1]);
+		q__2.r = sgn * q__3.r, q__2.i = sgn * q__3.i;
+		q__1.r = a[i__1].r + q__2.r, q__1.i = a[i__1].i + q__2.i;
+		a11.r = q__1.r, a11.i = q__1.i;
+		da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
+			dabs(r__2));
+		if (da11 <= smin) {
+		    a11.r = smin, a11.i = 0.f;
+		    da11 = smin;
+		    *info = 1;
+		}
+		db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
+			r__2));
+		if (da11 < 1.f && db > 1.f) {
+		    if (db > bignum * da11) {
+			scaloc = 1.f / db;
+		    }
+		}
+		q__3.r = scaloc, q__3.i = 0.f;
+		q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
+			q__3.i + vec.i * q__3.r;
+		q__1 = cladiv_(&q__2, &a11);
+		x11.r = q__1.r, x11.i = q__1.i;
+		if (scaloc != 1.f) {
+		    i__1 = *n;
+		    for (j = 1; j <= i__1; ++j) {
+			csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L100: */
+		    }
+		    *scale *= scaloc;
+		}
+		i__1 = k + l * c_dim1;
+		c__[i__1].r = x11.r, c__[i__1].i = x11.i;
+/* L110: */
+	    }
+/* L120: */
+	}
+    }
+    return;
+}
diff --git a/relapack/src/ctrtri.c b/relapack/src/ctrtri.c
new file mode 100644
index 000000000..0262cb59d
--- /dev/null
+++ b/relapack/src/ctrtri.c
@@ -0,0 +1,107 @@
+#include "relapack.h"
+
+static void RELAPACK_ctrtri_rec(const char *, const char *, const int *,
+ float *, const int *, int *);
+
+
+/** CTRTRI computes the inverse of a complex upper or lower triangular matrix A.
+ *
+ * This routine is functionally equivalent to LAPACK's ctrtri.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/df/df8/ctrtri_8f.html
+ * */
+void RELAPACK_ctrtri(
+    const char *uplo, const char *diag, const int *n,
+    float *A, const int *ldA,
+    int *info
+) {
+
+    // Check arguments
+    const int lower = LAPACK(lsame)(uplo, "L");
+    const int upper = LAPACK(lsame)(uplo, "U");
+    const int nounit = LAPACK(lsame)(diag, "N");
+    const int unit = LAPACK(lsame)(diag, "U");
+    *info = 0;
+    if (!lower && !upper)
+        *info = -1;
+    else if (!nounit && !unit)
+        *info = -2;
+    else if (*n < 0)
+        *info = -3;
+    else if (*ldA < MAX(1, *n))
+        *info = -5;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("CTRTRI", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleanuplo = lower ? 'L' : 'U';
+    const char cleandiag = nounit ? 'N' : 'U';
+
+    // Check for singularity: a zero diagonal entry makes A non-invertible.
+    // LAPACK convention: info is the 1-based index of the zero entry.
+    // (BUGFIX: the previous code returned the 0-based index, so a zero in
+    // the first diagonal position was reported as info == 0, i.e. success.)
+    if (nounit) {
+        int i;
+        for (i = 0; i < *n; i++)
+            if (A[2 * (i + *ldA * i)] == 0 && A[2 * (i + *ldA * i) + 1] == 0) {
+                *info = i + 1;
+                return;
+            }
+    }
+
+    // Recursive kernel
+    RELAPACK_ctrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info);
+}
+
+
+/** ctrtri's recursive compute kernel
+ *
+ * Inverts the leading n1 x n1 block in place, uses it (now holding its
+ * inverse) together with a triangular solve against the trailing block to
+ * update the off-diagonal panel, then inverts the trailing block.
+ * */
+static void RELAPACK_ctrtri_rec(
+    const char *uplo, const char *diag, const int *n,
+    float *A, const int *ldA,
+    int *info
+){
+
+    if (*n <= MAX(CROSSOVER_CTRTRI, 1)) {
+        // Unblocked
+        LAPACK(ctrti2)(uplo, diag, n, A, ldA, info);
+        return;
+    }
+
+    // Constants
+    const float ONE[] = { 1., 0. };
+    const float MONE[] = { -1., 0. };
+
+    // Splitting
+    // (factor 2 on all offsets: complex entries are stored as float pairs)
+    const int n1 = CREC_SPLIT(*n);
+    const int n2 = *n - n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    float *const A_TL = A;
+    float *const A_TR = A + 2 * *ldA * n1;
+    float *const A_BL = A + 2 * n1;
+    float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
+
+    // recursion(A_TL): afterwards A_TL holds its own inverse
+    RELAPACK_ctrtri_rec(uplo, diag, &n1, A_TL, ldA, info);
+    if (*info)
+        return;
+
+    if (*uplo == 'L') {
+        // A_BL = - A_BL * A_TL   (A_TL already inverted)
+        BLAS(ctrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA);
+        // A_BL = A_BR \ A_BL     (A_BR not yet inverted)
+        BLAS(ctrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
+    } else {
+        // A_TR = - A_TL * A_TR   (A_TL already inverted)
+        BLAS(ctrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA);
+        // A_TR = A_TR / A_BR     (A_BR not yet inverted)
+        BLAS(ctrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
+    }
+
+    // recursion(A_BR)
+    RELAPACK_ctrtri_rec(uplo, diag, &n2, A_BR, ldA, info);
+    if (*info)
+        // indices reported by the trailing block are offset by n1
+        *info += n1;
+}
diff --git a/relapack/src/dgbtrf.c b/relapack/src/dgbtrf.c
new file mode 100644
index 000000000..1a1757d31
--- /dev/null
+++ b/relapack/src/dgbtrf.c
@@ -0,0 +1,227 @@
+#include "relapack.h"
+#include "stdlib.h"
+
+static void RELAPACK_dgbtrf_rec(const int *, const int *, const int *,
+ const int *, double *, const int *, int *, double *, const int *, double *,
+ const int *, int *);
+
+
+/** DGBTRF computes an LU factorization of a real m-by-n band matrix A using partial pivoting with row interchanges.
+ *
+ * This routine is functionally equivalent to LAPACK's dgbtrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/da/d87/dgbtrf_8f.html
+ * */
+void RELAPACK_dgbtrf(
+    const int *m, const int *n, const int *kl, const int *ku,
+    double *Ab, const int *ldAb, int *ipiv,
+    int *info
+) {
+
+    // Check arguments
+    *info = 0;
+    if (*m < 0)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*kl < 0)
+        *info = -3;
+    else if (*ku < 0)
+        *info = -4;
+    else if (*ldAb < 2 * *kl + *ku + 1)
+        *info = -6;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("DGBTRF", &minfo);
+        return;
+    }
+
+    // Constant
+    const double ZERO[] = { 0. };
+
+    // Result upper band width: ku superdiagonals plus kl fill-in rows
+    const int kv = *ku + *kl;
+
+    // Unskew A: view the band storage as a dense matrix A with one fewer
+    // row per column step (ldA = ldAb - 1), starting past the kv rows
+    // reserved for fill-in
+    const int ldA[] = { *ldAb - 1 };
+    double *const A = Ab + kv;
+
+    // Zero upper diagonal fill-in elements
+    int i, j;
+    for (j = 0; j < *n; j++) {
+        double *const A_j = A + *ldA * j;
+        for (i = MAX(0, j - kv); i < j - *ku; i++)
+            A_j[i] = 0.;
+    }
+
+    // Allocate work space
+    // NOTE(review): malloc results are not checked; a failed allocation
+    // would crash in dlaset below -- confirm whether an OOM policy exists.
+    const int n1 = DREC_SPLIT(*n);
+    const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
+    const int nWorkl = (kv > n1) ? n1 : kv;
+    const int mWorku = (*kl > n1) ? n1 : *kl;
+    const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
+    double *Workl = malloc(mWorkl * nWorkl * sizeof(double));
+    double *Worku = malloc(mWorku * nWorku * sizeof(double));
+    LAPACK(dlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
+    LAPACK(dlaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku);
+
+    // Recursive kernel
+    RELAPACK_dgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info);
+
+    // Free work space
+    free(Workl);
+    free(Worku);
+}
+
+
+/** dgbtrf's recursive compute kernel
+ *
+ * Factorizes the left n1 columns recursively, applies the resulting row
+ * pivots and triangular solves to the right part (using the Workl/Worku
+ * buffers to stand in for entries that fall outside the band storage),
+ * performs the Schur-complement GEMM updates, and recurses on the
+ * trailing part.
+ * */
+static void RELAPACK_dgbtrf_rec(
+    const int *m, const int *n, const int *kl, const int *ku,
+    double *Ab, const int *ldAb, int *ipiv,
+    double *Workl, const int *ldWorkl, double *Worku, const int *ldWorku,
+    int *info
+) {
+
+    if (*n <= MAX(CROSSOVER_DGBTRF, 1)) {
+        // Unblocked
+        LAPACK(dgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[] = { 1. };
+    const double MONE[] = { -1. };
+    const int iONE[] = { 1 };
+
+    // Loop iterators
+    int i, j;
+
+    // Output upper band width
+    const int kv = *ku + *kl;
+
+    // Unskew A
+    const int ldA[] = { *ldAb - 1 };
+    double *const A = Ab + kv;
+
+    // Splitting
+    const int n1 = MIN(DREC_SPLIT(*n), *kl);
+    const int n2 = *n - n1;
+    const int m1 = MIN(n1, *m);
+    const int m2 = *m - m1;
+    const int mn1 = MIN(m1, n1);
+    const int mn2 = MIN(m2, n2);
+
+    // Ab_L *
+    //      Ab_BR
+    double *const Ab_L = Ab;
+    double *const Ab_BR = Ab + *ldAb * n1;
+
+    // A_L A_R
+    double *const A_L = A;
+    double *const A_R = A + *ldA * n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    double *const A_TL = A;
+    double *const A_TR = A + *ldA * n1;
+    double *const A_BL = A + m1;
+    double *const A_BR = A + *ldA * n1 + m1;
+
+    // ipiv_T
+    // ipiv_B
+    int *const ipiv_T = ipiv;
+    int *const ipiv_B = ipiv + n1;
+
+    // Banded splitting
+    const int n21 = MIN(n2, kv - n1);
+    const int n22 = MIN(n2 - n21, n1);
+    const int m21 = MIN(m2, *kl - m1);
+    const int m22 = MIN(m2 - m21, m1);
+
+    //     n1    n21    n22
+    // m   *     A_Rl   A_Rr
+    double *const A_Rl = A_R;
+    double *const A_Rr = A_R + *ldA * n21;
+
+    //     n1    n21    n22
+    // m1  *     A_TRl  A_TRr
+    // m21 A_BLt A_BRtl A_BRtr
+    // m22 A_BLb A_BRbl A_BRbr
+    double *const A_TRl = A_TR;
+    double *const A_TRr = A_TR + *ldA * n21;
+    double *const A_BLt = A_BL;
+    double *const A_BLb = A_BL + m21;
+    double *const A_BRtl = A_BR;
+    double *const A_BRtr = A_BR + *ldA * n21;
+    double *const A_BRbl = A_BR + m21;
+    double *const A_BRbr = A_BR + *ldA * n21 + m21;
+
+    // recursion(Ab_L, ipiv_T)
+    RELAPACK_dgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info);
+
+    // Workl = A_BLb
+    LAPACK(dlacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl);
+
+    // partially redo swaps in A_L
+    for (i = 0; i < mn1; i++) {
+        const int ip = ipiv_T[i] - 1;
+        if (ip != i) {
+            if (ip < *kl)
+                BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA);
+            else
+                BLAS(dswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl);
+        }
+    }
+
+    // apply pivots to A_Rl
+    LAPACK(dlaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE);
+
+    // apply pivots to A_Rr columnwise
+    for (j = 0; j < n22; j++) {
+        double *const A_Rrj = A_Rr + *ldA * j;
+        for (i = j; i < mn1; i++) {
+            const int ip = ipiv_T[i] - 1;
+            if (ip != i) {
+                // exchange rows i and ip within column j
+                // (BUGFIX: the previous code read A_Rr[ip], i.e. column 0
+                // of A_Rr, instead of column j, so rows were not swapped)
+                const double tmp = A_Rrj[i];
+                A_Rrj[i] = A_Rrj[ip];
+                A_Rrj[ip] = tmp;
+            }
+        }
+    }
+
+    // A_TRl = A_TL \ A_TRl
+    BLAS(dtrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
+    // Worku = A_TRr
+    LAPACK(dlacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku);
+    // Worku = A_TL \ Worku
+    BLAS(dtrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku);
+    // A_TRr = Worku
+    LAPACK(dlacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA);
+    // A_BRtl = A_BRtl - A_BLt * A_TRl
+    BLAS(dgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA);
+    // A_BRbl = A_BRbl - Workl * A_TRl
+    BLAS(dgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA);
+    // A_BRtr = A_BRtr - A_BLt * Worku
+    BLAS(dgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA);
+    // A_BRbr = A_BRbr - Workl * Worku
+    BLAS(dgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA);
+
+    // partially undo swaps in A_L
+    for (i = mn1 - 1; i >= 0; i--) {
+        const int ip = ipiv_T[i] - 1;
+        if (ip != i) {
+            if (ip < *kl)
+                BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA);
+            else
+                BLAS(dswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl);
+        }
+    }
+
+    // recursion(Ab_BR, ipiv_B)
+    RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
+    if (*info)
+        *info += n1;
+    // shift pivots to global (full-matrix) row indices
+    for (i = 0; i < mn2; i++)
+        ipiv_B[i] += n1;
+}
diff --git a/relapack/src/dgemmt.c b/relapack/src/dgemmt.c
new file mode 100644
index 000000000..9c925b586
--- /dev/null
+++ b/relapack/src/dgemmt.c
@@ -0,0 +1,165 @@
+#include "relapack.h"
+
+static void RELAPACK_dgemmt_rec(const char *, const char *, const char *,
+ const int *, const int *, const double *, const double *, const int *,
+ const double *, const int *, const double *, double *, const int *);
+
+static void RELAPACK_dgemmt_rec2(const char *, const char *, const char *,
+ const int *, const int *, const double *, const double *, const int *,
+ const double *, const int *, const double *, double *, const int *);
+
+
+/** DGEMMT computes a matrix-matrix product with general matrices but updates
+ * only the upper or lower triangular part of the result matrix.
+ *
+ * This routine performs the same operation as the BLAS routine
+ * dgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC)
+ * but only updates the triangular part of C specified by uplo:
+ * If (*uplo == 'L'), only the lower triangular part of C is updated,
+ * otherwise the upper triangular part is updated.
+ * */
+void RELAPACK_dgemmt(
+    const char *uplo, const char *transA, const char *transB,
+    const int *n, const int *k,
+    const double *alpha, const double *A, const int *ldA,
+    const double *B, const int *ldB,
+    const double *beta, double *C, const int *ldC
+) {
+
+#if HAVE_XGEMMT
+    // The linked BLAS provides dgemmt natively; forward to it directly.
+    BLAS(dgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
+    return;
+#else
+
+    // Check arguments (BLAS-style: positive argument index, xerbla, return)
+    const int lower = LAPACK(lsame)(uplo, "L");
+    const int upper = LAPACK(lsame)(uplo, "U");
+    const int notransA = LAPACK(lsame)(transA, "N");
+    const int tranA = LAPACK(lsame)(transA, "T");
+    const int notransB = LAPACK(lsame)(transB, "N");
+    const int tranB = LAPACK(lsame)(transB, "T");
+    int info = 0;
+    if (!lower && !upper)
+        info = 1;
+    else if (!tranA && !notransA)
+        info = 2;
+    else if (!tranB && !notransB)
+        info = 3;
+    else if (*n < 0)
+        info = 4;
+    else if (*k < 0)
+        info = 5;
+    else if (*ldA < MAX(1, notransA ? *n : *k))
+        info = 8;
+    else if (*ldB < MAX(1, notransB ? *k : *n))
+        info = 10;
+    else if (*ldC < MAX(1, *n))
+        info = 13;
+    if (info) {
+        LAPACK(xerbla)("DGEMMT", &info);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleanuplo = lower ? 'L' : 'U';
+    const char cleantransA = notransA ? 'N' : 'T';
+    const char cleantransB = notransB ? 'N' : 'T';
+
+    // Recursive kernel
+    RELAPACK_dgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
+#endif
+}
+
+
+/** dgemmt's recursive compute kernel
+ *
+ * Splits C into a 2x2 partition: the two diagonal blocks are updated
+ * recursively (triangular updates), the off-diagonal block on the stored
+ * side is updated with a single full dgemm.
+ * */
+static void RELAPACK_dgemmt_rec(
+    const char *uplo, const char *transA, const char *transB,
+    const int *n, const int *k,
+    const double *alpha, const double *A, const int *ldA,
+    const double *B, const int *ldB,
+    const double *beta, double *C, const int *ldC
+) {
+
+    // Small enough? Switch to the unblocked kernel.
+    if (*n <= MAX(CROSSOVER_DGEMMT, 1)) {
+        RELAPACK_dgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
+        return;
+    }
+
+    // Split the n x n result at row/column n1
+    const int n1 = DREC_SPLIT(*n);
+    const int n2 = *n - n1;
+
+    // Top and bottom panels of A (offset depends on the storage of op(A))
+    const double *const A_T = A;
+    const double *const A_B = (*transA == 'N') ? (A + n1) : (A + *ldA * n1);
+
+    // Left and right panels of B (offset depends on the storage of op(B))
+    const double *const B_L = B;
+    const double *const B_R = (*transB == 'N') ? (B + *ldB * n1) : (B + n1);
+
+    // 2x2 partition of C
+    double *const C_TL = C;
+    double *const C_TR = C + *ldC * n1;
+    double *const C_BL = C + n1;
+    double *const C_BR = C + *ldC * n1 + n1;
+
+    // Triangular update of the top-left block
+    RELAPACK_dgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC);
+
+    // Full update of the stored off-diagonal block
+    if (*uplo == 'L')
+        // C_BL = alpha op(A_B) op(B_L) + beta C_BL
+        BLAS(dgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC);
+    else
+        // C_TR = alpha op(A_T) op(B_R) + beta C_TR
+        BLAS(dgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC);
+
+    // Triangular update of the bottom-right block
+    RELAPACK_dgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC);
+}
+
+
+/** dgemmt's unblocked compute kernel
+ *
+ * Updates C one column at a time with dgemv: for uplo == 'L' only the
+ * subdiagonal part C(i:n-1, i) of column i is computed, otherwise only
+ * the superdiagonal part C(0:i, i).
+ * */
+static void RELAPACK_dgemmt_rec2(
+    const char *uplo, const char *transA, const char *transB,
+    const int *n, const int *k,
+    const double *alpha, const double *A, const int *ldA,
+    const double *B, const int *ldB,
+    const double *beta, double *C, const int *ldC
+) {
+
+    // stride between consecutive elements of a column of op(B)
+    const int incB = (*transB == 'N') ? 1 : *ldB;
+    const int incC = 1;
+
+    int i;
+    for (i = 0; i < *n; i++) {
+        // A_0
+        // A_i: rows (or columns, if transposed) i..n-1 of A
+        const double *const A_0 = A;
+        const double *const A_i = A + ((*transA == 'N') ? i : *ldA * i);
+
+        // * B_i * : column i of op(B)
+        const double *const B_i = B + ((*transB == 'N') ? *ldB * i : i);
+
+        // * C_0i * : top part of column i of C (rows 0..i)
+        // * C_ii * : bottom part of column i of C (rows i..n-1)
+        double *const C_0i = C + *ldC * i;
+        double *const C_ii = C + *ldC * i + i;
+
+        if (*uplo == 'L') {
+            // C(i:n-1, i) = alpha op(A_i) B_i + beta C(i:n-1, i)
+            const int nmi = *n - i;
+            if (*transA == 'N')
+                BLAS(dgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
+            else
+                BLAS(dgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
+        } else {
+            // C(0:i, i) = alpha op(A_0) B_i + beta C(0:i, i)
+            const int ip1 = i + 1;
+            if (*transA == 'N')
+                BLAS(dgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
+            else
+                BLAS(dgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
+        }
+    }
+}
diff --git a/relapack/src/dgetrf.c b/relapack/src/dgetrf.c
new file mode 100644
index 000000000..07f5472fd
--- /dev/null
+++ b/relapack/src/dgetrf.c
@@ -0,0 +1,117 @@
+#include "relapack.h"
+
+static void RELAPACK_dgetrf_rec(const int *, const int *, double *,
+ const int *, int *, int *);
+
+
+/** DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
+ *
+ * This routine is functionally equivalent to LAPACK's dgetrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d3/d6a/dgetrf_8f.html
+ * */
+void RELAPACK_dgetrf(
+    const int *m, const int *n,
+    double *A, const int *ldA, int *ipiv,
+    int *info
+) {
+
+    // Check arguments.
+    // A is m x n, so its leading dimension must cover the m rows
+    // (LAPACK dgetrf: INFO = -4 when LDA < max(1,M)).
+    *info = 0;
+    if (*m < 0)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*ldA < MAX(1, *m))
+        *info = -4;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("DGETRF", &minfo);
+        return;
+    }
+
+    // Factor the leading m x min(m,n) panel recursively.
+    const int sn = MIN(*m, *n);
+
+    RELAPACK_dgetrf_rec(m, &sn, A, ldA, ipiv, info);
+
+    // Right remainder: when A is wider than tall, the trailing m x (n-m)
+    // columns still need the row interchanges and the triangular solve.
+    if (*m < *n) {
+        // Constants
+        const double ONE[] = { 1. };
+        const int iONE[] = { 1 };
+
+        // Splitting
+        const int rn = *n - *m;
+
+        // A_L A_R
+        const double *const A_L = A;
+        double *const A_R = A + *ldA * *m;
+
+        // A_R = apply(ipiv, A_R)
+        LAPACK(dlaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE);
+        // A_R = A_L \ A_R  (unit lower triangular solve with the factored panel)
+        BLAS(dtrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA);
+    }
+}
+
+
+/** dgetrf's recursive compute kernel */
+// Splits the n columns in two, factors the left panel, applies its pivots
+// and triangular solve to the right part, updates the trailing matrix, and
+// recurses on it. On entry n <= m is assumed by the caller (RELAPACK_dgetrf).
+static void RELAPACK_dgetrf_rec(
+    const int *m, const int *n,
+    double *A, const int *ldA, int *ipiv,
+    int *info
+) {
+
+    if (*n <= MAX(CROSSOVER_DGETRF, 1)) {
+        // Unblocked: delegate small panels to LAPACK's level-2 kernel.
+        LAPACK(dgetf2)(m, n, A, ldA, ipiv, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[] = { 1. };
+    const double MONE[] = { -1. };
+    const int iONE[] = { 1 };
+
+    // Splitting
+    const int n1 = DREC_SPLIT(*n);
+    const int n2 = *n - n1;
+    const int m2 = *m - n1;
+
+    // A_L A_R
+    double *const A_L = A;
+    double *const A_R = A + *ldA * n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    double *const A_TL = A;
+    double *const A_TR = A + *ldA * n1;
+    double *const A_BL = A + n1;
+    double *const A_BR = A + *ldA * n1 + n1;
+
+    // ipiv_T
+    // ipiv_B
+    int *const ipiv_T = ipiv;
+    int *const ipiv_B = ipiv + n1;
+
+    // recursion(A_L, ipiv_T)
+    RELAPACK_dgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info);
+    // apply pivots to A_R
+    LAPACK(dlaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE);
+
+    // A_TR = A_TL \ A_TR
+    BLAS(dtrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
+    // A_BR = A_BR - A_BL * A_TR
+    BLAS(dgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA);
+
+    // recursion(A_BR, ipiv_B)
+    // NOTE(review): this call overwrites *info, so a singularity reported by
+    // the first recursion is lost if the second succeeds — confirm whether
+    // LAPACK's "first zero pivot" semantics are required here.
+    RELAPACK_dgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info);
+    if (*info)
+        *info += n1;
+    // apply pivots to A_BL
+    LAPACK(dlaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE);
+    // shift pivots: ipiv_B indices are relative to the A_BR block
+    int i;
+    for (i = 0; i < n2; i++)
+        ipiv_B[i] += n1;
+}
diff --git a/relapack/src/dlauum.c b/relapack/src/dlauum.c
new file mode 100644
index 000000000..d722ea809
--- /dev/null
+++ b/relapack/src/dlauum.c
@@ -0,0 +1,87 @@
+#include "relapack.h"
+
+static void RELAPACK_dlauum_rec(const char *, const int *, double *,
+ const int *, int *);
+
+
+/** DLAUUM computes the product U * U**T or L**T * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A.
+ *
+ * This routine is functionally equivalent to LAPACK's dlauum.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d0/dc2/dlauum_8f.html
+ * */
+void RELAPACK_dlauum(
+    const char *uplo, const int *n,
+    double *A, const int *ldA,
+    int *info
+) {
+
+    // Decode the uplo flag
+    const int is_lower = LAPACK(lsame)(uplo, "L");
+    const int is_upper = LAPACK(lsame)(uplo, "U");
+
+    // Argument checks (LAPACK convention: -i flags the i-th argument)
+    *info = 0;
+    if (!is_lower && !is_upper)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*ldA < MAX(1, *n))
+        *info = -4;
+    if (*info != 0) {
+        const int earg = -*info;
+        LAPACK(xerbla)("DLAUUM", &earg);
+        return;
+    }
+
+    // Pass a normalized single-character uplo to the recursive kernel
+    const char canonical_uplo = is_lower ? 'L' : 'U';
+
+    RELAPACK_dlauum_rec(&canonical_uplo, n, A, ldA, info);
+}
+
+
+/** dlauum's recursive compute kernel */
+// Computes U*U' (upper) or L'*L (lower) in place by 2x2 blocking:
+// finish the top-left block, fold in the off-diagonal contribution,
+// then recurse on the bottom-right block.
+static void RELAPACK_dlauum_rec(
+    const char *uplo, const int *n,
+    double *A, const int *ldA,
+    int *info
+) {
+
+    if (*n <= MAX(CROSSOVER_DLAUUM, 1)) {
+        // Unblocked: LAPACK's level-2 kernel
+        LAPACK(dlauu2)(uplo, n, A, ldA, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[] = { 1. };
+
+    // Splitting
+    const int n1 = DREC_SPLIT(*n);
+    const int n2 = *n - n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    double *const A_TL = A;
+    double *const A_TR = A + *ldA * n1;
+    double *const A_BL = A + n1;
+    double *const A_BR = A + *ldA * n1 + n1;
+
+    // recursion(A_TL)
+    RELAPACK_dlauum_rec(uplo, &n1, A_TL, ldA, info);
+
+    if (*uplo == 'L') {
+        // A_TL = A_TL + A_BL' * A_BL
+        BLAS(dsyrk)("L", "T", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA);
+        // A_BL = A_BR' * A_BL
+        BLAS(dtrmm)("L", "L", "T", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
+    } else {
+        // A_TL = A_TL + A_TR * A_TR'
+        BLAS(dsyrk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA);
+        // A_TR = A_TR * A_BR'
+        BLAS(dtrmm)("R", "U", "T", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
+    }
+
+    // recursion(A_BR)
+    RELAPACK_dlauum_rec(uplo, &n2, A_BR, ldA, info);
+}
diff --git a/relapack/src/dpbtrf.c b/relapack/src/dpbtrf.c
new file mode 100644
index 000000000..6fd0ebe48
--- /dev/null
+++ b/relapack/src/dpbtrf.c
@@ -0,0 +1,157 @@
+#include "relapack.h"
+#include "stdlib.h"
+
+static void RELAPACK_dpbtrf_rec(const char *, const int *, const int *,
+ double *, const int *, double *, const int *, int *);
+
+
+/** DPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A.
+ *
+ * This routine is functionally equivalent to LAPACK's dpbtrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/df/da9/dpbtrf_8f.html
+ * */
+void RELAPACK_dpbtrf(
+    const char *uplo, const int *n, const int *kd,
+    double *Ab, const int *ldAb,
+    int *info
+) {
+
+    // Check arguments
+    const int lower = LAPACK(lsame)(uplo, "L");
+    const int upper = LAPACK(lsame)(uplo, "U");
+    *info = 0;
+    if (!lower && !upper)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*kd < 0)
+        *info = -3;
+    else if (*ldAb < *kd + 1)
+        *info = -5;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("DPBTRF", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleanuplo = lower ? 'L' : 'U';
+
+    // Constant
+    const double ZERO[] = { 0. };
+
+    // Allocate zero-initialized work space for the fill-in blocks of the
+    // banded updates; its shape depends on uplo and the bandwidth.
+    const int n1 = DREC_SPLIT(*n);
+    const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd;
+    const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd;
+    double *Work = malloc(mWork * nWork * sizeof(double));
+    if (!Work) {
+        // Allocation failed: fall back to LAPACK's unblocked algorithm,
+        // which needs no auxiliary buffer, instead of dereferencing NULL.
+        LAPACK(dpbtf2)(&cleanuplo, n, kd, Ab, ldAb, info);
+        return;
+    }
+    LAPACK(dlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork);
+
+    // Recursive kernel
+    RELAPACK_dpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info);
+
+    // Free work space
+    free(Work);
+}
+
+
+/** dpbtrf's recursive compute kernel */
+// Cholesky-factors a banded matrix by viewing the band storage Ab as a
+// "skewed" full matrix A with leading dimension ldAb-1, factoring the
+// top-left block with the full (unbanded) dpotrf, and updating only the
+// parts of the trailing matrix that lie inside the band.
+static void RELAPACK_dpbtrf_rec(
+    const char *uplo, const int *n, const int *kd,
+    double *Ab, const int *ldAb,
+    double *Work, const int *ldWork,
+    int *info
+){
+
+    if (*n <= MAX(CROSSOVER_DPBTRF, 1)) {
+        // Unblocked
+        LAPACK(dpbtf2)(uplo, n, kd, Ab, ldAb, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[] = { 1. };
+    const double MONE[] = { -1. };
+
+    // Unskew A: with ldA = ldAb-1, A(i,j) addresses the band entry of the
+    // logical element (i,j); the diagonal starts at row *kd for 'U'.
+    const int ldA[] = { *ldAb - 1 };
+    double *const A = Ab + ((*uplo == 'L') ? 0 : *kd);
+
+    // Splitting: the first block cannot exceed the bandwidth.
+    const int n1 = MIN(DREC_SPLIT(*n), *kd);
+    const int n2 = *n - n1;
+
+    // *    *
+    // * Ab_BR
+    double *const Ab_BR = Ab + *ldAb * n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    double *const A_TL = A;
+    double *const A_TR = A + *ldA * n1;
+    double *const A_BL = A + n1;
+    double *const A_BR = A + *ldA * n1 + n1;
+
+    // recursion(A_TL)
+    RELAPACK_dpbtrf_rec(uplo, &n1, kd, Ab, ldAb, Work, ldWork, info);
+    if (*info)
+        return;
+
+    // Banded splitting: n21 columns are fully inside the band, the next n22
+    // columns intersect it only partially (triangular fill-in).
+    const int n21 = MIN(n2, *kd - n1);
+    const int n22 = MIN(n2 - n21, n1);
+
+    //     n1    n21    n22
+    // n1  *     A_TRl  A_TRr
+    // n21 A_BLt A_BRtl A_BRtr
+    // n22 A_BLb A_BRbl A_BRbr
+    double *const A_TRl = A_TR;
+    double *const A_TRr = A_TR + *ldA * n21;
+    double *const A_BLt = A_BL;
+    double *const A_BLb = A_BL + n21;
+    double *const A_BRtl = A_BR;
+    double *const A_BRtr = A_BR + *ldA * n21;
+    double *const A_BRbl = A_BR + n21;
+    double *const A_BRbr = A_BR + *ldA * n21 + n21;
+
+    if (*uplo == 'L') {
+        // A_BLt = ABLt / A_TL'
+        BLAS(dtrsm)("R", "L", "T", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA);
+        // A_BRtl = A_BRtl - A_BLt * A_BLt'
+        BLAS(dsyrk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA);
+        // Work = A_BLb  (triangular part copied into the zero-padded buffer
+        // so full-matrix BLAS can be used on it)
+        LAPACK(dlacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork);
+        // Work = Work / A_TL'
+        BLAS(dtrsm)("R", "L", "T", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork);
+        // A_BRbl = A_BRbl - Work * A_BLt'
+        BLAS(dgemm)("N", "T", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA);
+        // A_BRbr = A_BRbr - Work * Work'
+        BLAS(dsyrk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
+        // A_BLb = Work  (copy the transformed triangle back into the band)
+        LAPACK(dlacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA);
+    } else {
+        // A_TRl = A_TL' \ A_TRl
+        BLAS(dtrsm)("L", "U", "T", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
+        // A_BRtl = A_BRtl - A_TRl' * A_TRl
+        BLAS(dsyrk)("U", "T", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA);
+        // Work = A_TRr
+        LAPACK(dlacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork);
+        // Work = A_TL' \ Work
+        BLAS(dtrsm)("L", "U", "T", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork);
+        // A_BRtr = A_BRtr - A_TRl' * Work
+        BLAS(dgemm)("T", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA);
+        // A_BRbr = A_BRbr - Work' * Work
+        BLAS(dsyrk)("U", "T", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
+        // A_TRr = Work
+        LAPACK(dlacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA);
+    }
+
+    // recursion(A_BR): once the remainder fits inside the band it is dense,
+    // so the full dpotrf takes over; otherwise stay banded.
+    if (*kd > n1)
+        RELAPACK_dpotrf(uplo, &n2, A_BR, ldA, info);
+    else
+        RELAPACK_dpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info);
+    if (*info)
+        *info += n1;
+}
diff --git a/relapack/src/dpotrf.c b/relapack/src/dpotrf.c
new file mode 100644
index 000000000..c14fb3d71
--- /dev/null
+++ b/relapack/src/dpotrf.c
@@ -0,0 +1,92 @@
+#include "relapack.h"
+
+static void RELAPACK_dpotrf_rec(const char *, const int *, double *,
+ const int *, int *);
+
+
+/** DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A.
+ *
+ * This routine is functionally equivalent to LAPACK's dpotrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d0/d8a/dpotrf_8f.html
+ * */
+void RELAPACK_dpotrf(
+    const char *uplo, const int *n,
+    double *A, const int *ldA,
+    int *info
+) {
+
+    // Decode the uplo flag
+    const int is_lower = LAPACK(lsame)(uplo, "L");
+    const int is_upper = LAPACK(lsame)(uplo, "U");
+
+    // Argument checks (LAPACK convention: -i flags the i-th argument)
+    *info = 0;
+    if (!is_lower && !is_upper)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*ldA < MAX(1, *n))
+        *info = -4;
+    if (*info != 0) {
+        const int earg = -*info;
+        LAPACK(xerbla)("DPOTRF", &earg);
+        return;
+    }
+
+    // Pass a normalized single-character uplo to the recursive kernel
+    const char canonical_uplo = is_lower ? 'L' : 'U';
+
+    RELAPACK_dpotrf_rec(&canonical_uplo, n, A, ldA, info);
+}
+
+
+/** dpotrf's recursive compute kernel */
+// Recursive Cholesky: factor the top-left block, solve for the coupling
+// block, downdate the trailing block with a rank-n1 update, recurse.
+static void RELAPACK_dpotrf_rec(
+    const char *uplo, const int *n,
+    double *A, const int *ldA,
+    int *info
+){
+
+    if (*n <= MAX(CROSSOVER_DPOTRF, 1)) {
+        // Unblocked: LAPACK's level-2 kernel
+        LAPACK(dpotf2)(uplo, n, A, ldA, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[] = { 1. };
+    const double MONE[] = { -1. };
+
+    // Splitting
+    const int n1 = DREC_SPLIT(*n);
+    const int n2 = *n - n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    double *const A_TL = A;
+    double *const A_TR = A + *ldA * n1;
+    double *const A_BL = A + n1;
+    double *const A_BR = A + *ldA * n1 + n1;
+
+    // recursion(A_TL)
+    RELAPACK_dpotrf_rec(uplo, &n1, A_TL, ldA, info);
+    if (*info)
+        // Not positive definite: report and stop.
+        return;
+
+    if (*uplo == 'L') {
+        // A_BL = A_BL / A_TL'
+        BLAS(dtrsm)("R", "L", "T", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA);
+        // A_BR = A_BR - A_BL * A_BL'
+        BLAS(dsyrk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA);
+    } else {
+        // A_TR = A_TL' \ A_TR
+        BLAS(dtrsm)("L", "U", "T", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
+        // A_BR = A_BR - A_TR' * A_TR
+        BLAS(dsyrk)("U", "T", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA);
+    }
+
+    // recursion(A_BR)
+    RELAPACK_dpotrf_rec(uplo, &n2, A_BR, ldA, info);
+    if (*info)
+        // Offset the failure index into the global matrix.
+        *info += n1;
+}
diff --git a/relapack/src/dsygst.c b/relapack/src/dsygst.c
new file mode 100644
index 000000000..0228068ce
--- /dev/null
+++ b/relapack/src/dsygst.c
@@ -0,0 +1,212 @@
+#include "relapack.h"
+#if XSYGST_ALLOW_MALLOC
+#include "stdlib.h"
+#endif
+
+static void RELAPACK_dsygst_rec(const int *, const char *, const int *,
+ double *, const int *, const double *, const int *,
+ double *, const int *, int *);
+
+
+/** DSYGST reduces a real symmetric-definite generalized eigenproblem to standard form.
+ *
+ * This routine is functionally equivalent to LAPACK's dsygst.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/dc/d04/dsygst_8f.html
+ * */
+void RELAPACK_dsygst(
+    const int *itype, const char *uplo, const int *n,
+    double *A, const int *ldA, const double *B, const int *ldB,
+    int *info
+) {
+
+    // Check arguments
+    const int lower = LAPACK(lsame)(uplo, "L");
+    const int upper = LAPACK(lsame)(uplo, "U");
+    *info = 0;
+    if (*itype < 1 || *itype > 3)
+        *info = -1;
+    else if (!lower && !upper)
+        *info = -2;
+    else if (*n < 0)
+        *info = -3;
+    else if (*ldA < MAX(1, *n))
+        *info = -5;
+    else if (*ldB < MAX(1, *n))
+        *info = -7;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("DSYGST", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleanuplo = lower ? 'L' : 'U';
+
+    // Allocate optional work space. The buffer is a pure optimization: on
+    // allocation failure lWork stays 0 and the kernel uses its buffer-free
+    // code paths instead.
+    double *Work = NULL;
+    int lWork = 0;
+#if XSYGST_ALLOW_MALLOC
+    const int n1 = DREC_SPLIT(*n);
+    lWork = n1 * (*n - n1);
+    Work = malloc(lWork * sizeof(double));
+    if (!Work)
+        lWork = 0;
+#endif
+
+    // recursive kernel
+    RELAPACK_dsygst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info);
+
+    // Free work space
+#if XSYGST_ALLOW_MALLOC
+    if (Work)
+        free(Work);
+#endif
+}
+
+
+/** dsygst's recursive compute kernel */
+// Reduces the generalized problem to standard form blockwise. When the
+// optional Work buffer is large enough, the "1/2 * symm" correction term T
+// is computed once and added twice (before and after the syr2k); otherwise
+// the symm is simply executed twice, which is mathematically equivalent.
+static void RELAPACK_dsygst_rec(
+    const int *itype, const char *uplo, const int *n,
+    double *A, const int *ldA, const double *B, const int *ldB,
+    double *Work, const int *lWork, int *info
+) {
+
+    // Use the double-precision crossover (was CROSSOVER_SSYGST, the
+    // single-precision constant, by copy-paste).
+    if (*n <= MAX(CROSSOVER_DSYGST, 1)) {
+        // Unblocked
+        LAPACK(dsygs2)(itype, uplo, n, A, ldA, B, ldB, info);
+        return;
+    }
+
+    // Constants
+    const double ZERO[]  = { 0. };
+    const double ONE[]   = { 1. };
+    const double MONE[]  = { -1. };
+    const double HALF[]  = { .5 };
+    const double MHALF[] = { -.5 };
+    const int    iONE[]  = { 1 };
+
+    // Loop iterator
+    int i;
+
+    // Splitting
+    const int n1 = DREC_SPLIT(*n);
+    const int n2 = *n - n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    double *const A_TL = A;
+    double *const A_TR = A + *ldA * n1;
+    double *const A_BL = A + n1;
+    double *const A_BR = A + *ldA * n1 + n1;
+
+    // B_TL B_TR
+    // B_BL B_BR
+    const double *const B_TL = B;
+    const double *const B_TR = B + *ldB * n1;
+    const double *const B_BL = B + n1;
+    const double *const B_BR = B + *ldB * n1 + n1;
+
+    // recursion(A_TL, B_TL)
+    RELAPACK_dsygst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info);
+
+    // NOTE(review): the buffered path requires *lWork > n2 * n1, yet the
+    // wrapper allocates exactly n1 * n2 — confirm whether ">=" was intended.
+    if (*itype == 1)
+        if (*uplo == 'L') {
+            // A_BL = A_BL / B_TL'
+            BLAS(dtrsm)("R", "L", "T", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
+            if (*lWork > n2 * n1) {
+                // T = -1/2 * B_BL * A_TL
+                BLAS(dsymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2);
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
+            } else
+                // A_BL = A_BL - 1/2 B_BL * A_TL
+                BLAS(dsymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL'
+            BLAS(dsyr2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA);
+            if (*lWork > n2 * n1)
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
+            else
+                // A_BL = A_BL - 1/2 B_BL * A_TL
+                BLAS(dsymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_BL = B_BR \ A_BL
+            BLAS(dtrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
+        } else {
+            // A_TR = B_TL' \ A_TR
+            BLAS(dtrsm)("L", "U", "T", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
+            if (*lWork > n2 * n1) {
+                // T = -1/2 * A_TL * B_TR
+                BLAS(dsymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1);
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
+            } else
+                // A_TR = A_TR - 1/2 A_TL * B_TR
+                BLAS(dsymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR
+            BLAS(dsyr2k)("U", "T", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA);
+            if (*lWork > n2 * n1)
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
+            else
+                // A_TR = A_TR - 1/2 A_TL * B_TR
+                BLAS(dsymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TR = A_TR / B_BR
+            BLAS(dtrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
+        }
+    else
+        if (*uplo == 'L') {
+            // A_BL = A_BL * B_TL
+            BLAS(dtrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
+            if (*lWork > n2 * n1) {
+                // T = 1/2 * A_BR * B_BL
+                BLAS(dsymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2);
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
+            } else
+                // A_BL = A_BL + 1/2 A_BR * B_BL
+                BLAS(dsymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL
+            BLAS(dsyr2k)("L", "T", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA);
+            if (*lWork > n2 * n1)
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
+            else
+                // A_BL = A_BL + 1/2 A_BR * B_BL
+                BLAS(dsymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_BL = B_BR * A_BL
+            BLAS(dtrmm)("L", "L", "T", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
+        } else {
+            // A_TR = B_TL * A_TR
+            BLAS(dtrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
+            if (*lWork > n2 * n1) {
+                // T = 1/2 * B_TR * A_BR
+                BLAS(dsymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1);
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
+            } else
+                // A_TR = A_TR + 1/2 B_TR A_BR
+                BLAS(dsymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR'
+            BLAS(dsyr2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA);
+            if (*lWork > n2 * n1)
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
+            else
+                // A_TR = A_TR + 1/2 B_TR * A_BR
+                BLAS(dsymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TR = A_TR * B_BR
+            BLAS(dtrmm)("R", "U", "T", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
+        }
+
+    // recursion(A_BR, B_BR)
+    RELAPACK_dsygst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info);
+}
diff --git a/relapack/src/dsytrf.c b/relapack/src/dsytrf.c
new file mode 100644
index 000000000..80b119336
--- /dev/null
+++ b/relapack/src/dsytrf.c
@@ -0,0 +1,238 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include <stdlib.h>
+#endif
+
+static void RELAPACK_dsytrf_rec(const char *, const int *, const int *, int *,
+ double *, const int *, int *, double *, const int *, int *);
+
+
+/** DSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's dsytrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/dd/df4/dsytrf_8f.html
+ * */
+void RELAPACK_dsytrf(
+    const char *uplo, const int *n,
+    double *A, const int *ldA, int *ipiv,
+    double *Work, const int *lWork, int *info
+) {
+
+    // Required work size: n * n/2 doubles. With XSYGST-style malloc enabled,
+    // a caller-provided buffer may be smaller (down to 1) and a temporary
+    // buffer is allocated instead.
+    const int cleanlWork = *n * (*n / 2);
+    int minlWork = cleanlWork;
+#if XSYTRF_ALLOW_MALLOC
+    minlWork = 1;
+#endif
+
+    // Check arguments
+    const int lower = LAPACK(lsame)(uplo, "L");
+    const int upper = LAPACK(lsame)(uplo, "U");
+    *info = 0;
+    if (!lower && !upper)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*ldA < MAX(1, *n))
+        *info = -4;
+    else if (*lWork < minlWork && *lWork != -1)
+        *info = -7;
+    else if (*lWork == -1) {
+        // Work size query: report the required size in Work[0]
+        // (stored as a double, per the LAPACK workspace-query convention).
+        *Work = cleanlWork;
+        return;
+    }
+
+    // Ensure Work size
+    double *cleanWork = Work;
+#if XSYTRF_ALLOW_MALLOC
+    if (!*info && *lWork < cleanlWork) {
+        cleanWork = malloc(cleanlWork * sizeof(double));
+        if (!cleanWork)
+            *info = -7;
+    }
+#endif
+
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("DSYTRF", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleanuplo = lower ? 'L' : 'U';
+
+    // Dummy arguments
+    int nout;
+
+    // Recursive kernel
+    RELAPACK_dsytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
+
+#if XSYTRF_ALLOW_MALLOC
+    if (cleanWork != Work)
+        free(cleanWork);
+#endif
+}
+
+
+/** dsytrf's recursive compute kernel */
+// Recursive Bunch-Kaufman LDL' factorization. Because 2x2 pivots may not fit
+// at a block boundary, each recursion can factor fewer columns than asked
+// (n_out <= n); the caller re-derives its splitting from n_out and, if a
+// column was left over, undoes one column of updates.
+static void RELAPACK_dsytrf_rec(
+    const char *uplo, const int *n_full, const int *n, int *n_out,
+    double *A, const int *ldA, int *ipiv,
+    double *Work, const int *ldWork, int *info
+) {
+
+    // top recursion level?
+    const int top = *n_full == *n;
+
+    if (*n <= MAX(CROSSOVER_DSYTRF, 3)) {
+        // Unblocked
+        if (top) {
+            LAPACK(dsytf2)(uplo, n, A, ldA, ipiv, info);
+            *n_out = *n;
+        } else
+            RELAPACK_dsytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
+        return;
+    }
+
+    int info1, info2;
+
+    // Constants
+    const double ONE[] = { 1. };
+    const double MONE[] = { -1. };
+    const int iONE[] = { 1 };
+
+    // Loop iterator
+    int i;
+
+    // Rows/columns of the full matrix outside the current panel
+    const int n_rest = *n_full - *n;
+
+    if (*uplo == 'L') {
+        // Splitting (setup)
+        int n1 = DREC_SPLIT(*n);
+        int n2 = *n - n1;
+
+        // Work_L *
+        double *const Work_L = Work;
+
+        // recursion(A_L)
+        int n1_out;
+        RELAPACK_dsytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
+        n1 = n1_out;
+
+        // Splitting (continued): recompute from the columns actually factored
+        n2 = *n - n1;
+        const int n_full2 = *n_full - n1;
+
+        // *      *
+        // A_BL   A_BR
+        // A_BL_B A_BR_B
+        double *const A_BL = A + n1;
+        double *const A_BR = A + *ldA * n1 + n1;
+        double *const A_BL_B = A + *n;
+        double *const A_BR_B = A + *ldA * n1 + *n;
+
+        // *       *
+        // Work_BL Work_BR
+        // *       *
+        // (top recursion level: use Work as Work_BR)
+        double *const Work_BL = Work + n1;
+        double *const Work_BR = top ? Work : Work + *ldWork * n1 + n1;
+        const int ldWork_BR = top ? n2 : *ldWork;
+
+        // ipiv_T
+        // ipiv_B
+        int *const ipiv_B = ipiv + n1;
+
+        // A_BR = A_BR - A_BL Work_BL'
+        RELAPACK_dgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
+        BLAS(dgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
+
+        // recursion(A_BR)
+        int n2_out;
+        RELAPACK_dsytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
+
+        if (n2_out != n2) {
+            // undo 1 column of updates
+            const int n_restp1 = n_rest + 1;
+
+            // last column of A_BR
+            double *const A_BR_r = A_BR + *ldA * n2_out + n2_out;
+
+            // last row of A_BL
+            double *const A_BL_b = A_BL + n2_out;
+
+            // last row of Work_BL
+            double *const Work_BL_b = Work_BL + n2_out;
+
+            // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
+            BLAS(dgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
+        }
+        n2 = n2_out;
+
+        // shift pivots (preserving the sign that marks 2x2 pivot pairs)
+        for (i = 0; i < n2; i++)
+            if (ipiv_B[i] > 0)
+                ipiv_B[i] += n1;
+            else
+                ipiv_B[i] -= n1;
+
+        *info = info1 || info2;
+        *n_out = n1 + n2;
+    } else {
+        // Splitting (setup)
+        int n2 = DREC_SPLIT(*n);
+        int n1 = *n - n2;
+
+        // * Work_R
+        // (top recursion level: use Work as Work_R)
+        double *const Work_R = top ? Work : Work + *ldWork * n1;
+
+        // recursion(A_R)
+        int n2_out;
+        RELAPACK_dsytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
+        const int n2_diff = n2 - n2_out;
+        n2 = n2_out;
+
+        // Splitting (continued)
+        n1 = *n - n2;
+        const int n_full1 = *n_full - n2;
+
+        // * A_TL_T A_TR_T
+        // * A_TL   A_TR
+        // * *      *
+        double *const A_TL_T = A + *ldA * n_rest;
+        double *const A_TR_T = A + *ldA * (n_rest + n1);
+        double *const A_TL = A + *ldA * n_rest + n_rest;
+        double *const A_TR = A + *ldA * (n_rest + n1) + n_rest;
+
+        // Work_L *
+        // *      Work_TR
+        // *      *
+        // (top recursion level: Work_R was Work)
+        double *const Work_L = Work;
+        double *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest;
+        const int ldWork_L = top ? n1 : *ldWork;
+
+        // A_TL = A_TL - A_TR Work_TR'
+        RELAPACK_dgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
+        BLAS(dgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
+
+        // recursion(A_TL)
+        int n1_out;
+        RELAPACK_dsytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
+
+        if (n1_out != n1) {
+            // undo 1 column of updates
+            const int n_restp1 = n_rest + 1;
+
+            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
+            BLAS(dgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
+        }
+        n1 = n1_out;
+
+        *info = info2 || info1;
+        *n_out = n1 + n2;
+    }
+}
diff --git a/relapack/src/dsytrf_rec2.c b/relapack/src/dsytrf_rec2.c
new file mode 100644
index 000000000..72ef827b1
--- /dev/null
+++ b/relapack/src/dsytrf_rec2.c
@@ -0,0 +1,352 @@
+/* -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static int c__1 = 1;
+static double c_b8 = -1.;
+static double c_b9 = 1.;
+
+/** DSYTRF_REC2 computes a partial factorization of a real symmetric matrix using the Bunch-Kaufman diagonal pivoting method.
+ *
+ * This routine is a minor modification of LAPACK's dlasyf.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
+ * */
+/* Subroutine */ void RELAPACK_dsytrf_rec2(char *uplo, int *n, int *
+	nb, int *kb, double *a, int *lda, int *ipiv,
+	double *w, int *ldw, int *info, ftnlen uplo_len)
+{
+    /* System generated locals */
+    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2;
+    double d__1, d__2, d__3;
+
+    /* Builtin functions */
+    double sqrt(double);
+
+    /* Local variables */
+    static int j, k;
+    static double t, r1, d11, d21, d22;
+    static int jj, kk, jp, kp, kw, kkw, imax, jmax;
+    static double alpha;
+    extern /* Subroutine */ int dscal_(int *, double *, double *,
+	    int *);
+    extern logical lsame_(char *, char *, ftnlen, ftnlen);
+    extern /* Subroutine */ int dgemv_(char *, int *, int *,
+	    double *, double *, int *, double *, int *,
+	    double *, double *, int *, ftnlen), dcopy_(int *,
+	    double *, int *, double *, int *), dswap_(int
+	    *, double *, int *, double *, int *);
+    static int kstep;
+    static double absakk;
+    extern int idamax_(int *, double *, int *);
+    static double colmax, rowmax;
+
+    /* Parameter adjustments: shift to 1-based Fortran-style indexing. */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --ipiv;
+    w_dim1 = *ldw;
+    w_offset = 1 + w_dim1;
+    w -= w_offset;
+
+    /* Function Body */
+    *info = 0;
+    /* alpha = (1+sqrt(17))/8: the Bunch-Kaufman pivot threshold. */
+    alpha = (sqrt(17.) + 1.) / 8.;
+    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	/* Upper case: factor the trailing nb columns, right to left. */
+	k = *n;
+L10:
+	kw = *nb + k - *n;
+	if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
+	    goto L30;
+	}
+	dcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
+	if (k < *n) {
+	    i__1 = *n - k;
+	    dgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) * a_dim1 + 1],
+		     lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b9, &w[kw *
+		    w_dim1 + 1], &c__1, (ftnlen)12);
+	}
+	kstep = 1;
+	absakk = (d__1 = w[k + kw * w_dim1], abs(d__1));
+	if (k > 1) {
+	    i__1 = k - 1;
+	    imax = idamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+	    colmax = (d__1 = w[imax + kw * w_dim1], abs(d__1));
+	} else {
+	    colmax = 0.;
+	}
+	if (max(absakk,colmax) == 0.) {
+	    /* Zero column: record the first such index in info. */
+	    if (*info == 0) {
+		*info = k;
+	    }
+	    kp = k;
+	} else {
+	    if (absakk >= alpha * colmax) {
+		kp = k;
+	    } else {
+		dcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
+			w_dim1 + 1], &c__1);
+		i__1 = k - imax;
+		dcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
+			1 + (kw - 1) * w_dim1], &c__1);
+		if (k < *n) {
+		    i__1 = *n - k;
+		    dgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) *
+			    a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
+			    ldw, &c_b9, &w[(kw - 1) * w_dim1 + 1], &c__1, (
+			    ftnlen)12);
+		}
+		i__1 = k - imax;
+		jmax = imax + idamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
+			 &c__1);
+		rowmax = (d__1 = w[jmax + (kw - 1) * w_dim1], abs(d__1));
+		if (imax > 1) {
+		    i__1 = imax - 1;
+		    jmax = idamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
+/* Computing MAX */
+		    d__2 = rowmax, d__3 = (d__1 = w[jmax + (kw - 1) * w_dim1],
+			     abs(d__1));
+		    rowmax = max(d__2,d__3);
+		}
+		if (absakk >= alpha * colmax * (colmax / rowmax)) {
+		    kp = k;
+		} else if ((d__1 = w[imax + (kw - 1) * w_dim1], abs(d__1)) >=
+			alpha * rowmax) {
+		    kp = imax;
+		    dcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
+			    w_dim1 + 1], &c__1);
+		} else {
+		    /* 2x2 pivot block */
+		    kp = imax;
+		    kstep = 2;
+		}
+	    }
+	    kk = k - kstep + 1;
+	    kkw = *nb + kk - *n;
+	    if (kp != kk) {
+		/* Interchange rows/columns kk and kp in A and W. */
+		a[kp + kp * a_dim1] = a[kk + kk * a_dim1];
+		i__1 = kk - 1 - kp;
+		dcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
+			1) * a_dim1], lda);
+		if (kp > 1) {
+		    i__1 = kp - 1;
+		    dcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
+			    + 1], &c__1);
+		}
+		if (k < *n) {
+		    i__1 = *n - k;
+		    dswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
+			    + 1) * a_dim1], lda);
+		}
+		i__1 = *n - kk + 1;
+		dswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
+			w_dim1], ldw);
+	    }
+	    if (kstep == 1) {
+		/* 1x1 pivot: column k of W becomes column k of A, scaled. */
+		dcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
+			c__1);
+		r1 = 1. / a[k + k * a_dim1];
+		i__1 = k - 1;
+		dscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
+	    } else {
+		/* 2x2 pivot: solve against the 2x2 diagonal block. */
+		if (k > 2) {
+		    d21 = w[k - 1 + kw * w_dim1];
+		    d11 = w[k + kw * w_dim1] / d21;
+		    d22 = w[k - 1 + (kw - 1) * w_dim1] / d21;
+		    t = 1. / (d11 * d22 - 1.);
+		    d21 = t / d21;
+		    i__1 = k - 2;
+		    for (j = 1; j <= i__1; ++j) {
+			a[j + (k - 1) * a_dim1] = d21 * (d11 * w[j + (kw - 1)
+				* w_dim1] - w[j + kw * w_dim1]);
+			a[j + k * a_dim1] = d21 * (d22 * w[j + kw * w_dim1] -
+				w[j + (kw - 1) * w_dim1]);
+/* L20: */
+		    }
+		}
+		a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1];
+		a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1];
+		a[k + k * a_dim1] = w[k + kw * w_dim1];
+	    }
+	}
+	if (kstep == 1) {
+	    ipiv[k] = kp;
+	} else {
+	    /* Negative entries mark the two columns of a 2x2 pivot. */
+	    ipiv[k] = -kp;
+	    ipiv[k - 1] = -kp;
+	}
+	k -= kstep;
+	goto L10;
+L30:
+	/* Apply the remaining interchanges to the unfactored columns. */
+	j = k + 1;
+L60:
+	jj = j;
+	jp = ipiv[j];
+	if (jp < 0) {
+	    jp = -jp;
+	    ++j;
+	}
+	++j;
+	if (jp != jj && j <= *n) {
+	    i__1 = *n - j + 1;
+	    dswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
+	}
+	if (j < *n) {
+	    goto L60;
+	}
+	*kb = *n - k;
+    } else {
+	/* Lower case: factor the leading nb columns, left to right. */
+	k = 1;
+L70:
+	if ((k >= *nb && *nb < *n) || k > *n) {
+	    goto L90;
+	}
+	i__1 = *n - k + 1;
+	dcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
+	i__1 = *n - k + 1;
+	i__2 = k - 1;
+	dgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1], lda, &w[k
+		+ w_dim1], ldw, &c_b9, &w[k + k * w_dim1], &c__1, (ftnlen)12);
+	kstep = 1;
+	absakk = (d__1 = w[k + k * w_dim1], abs(d__1));
+	if (k < *n) {
+	    i__1 = *n - k;
+	    imax = k + idamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
+	    colmax = (d__1 = w[imax + k * w_dim1], abs(d__1));
+	} else {
+	    colmax = 0.;
+	}
+	if (max(absakk,colmax) == 0.) {
+	    if (*info == 0) {
+		*info = k;
+	    }
+	    kp = k;
+	} else {
+	    if (absakk >= alpha * colmax) {
+		kp = k;
+	    } else {
+		i__1 = imax - k;
+		dcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
+			w_dim1], &c__1);
+		i__1 = *n - imax + 1;
+		dcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
+			1) * w_dim1], &c__1);
+		i__1 = *n - k + 1;
+		i__2 = k - 1;
+		dgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1],
+			lda, &w[imax + w_dim1], ldw, &c_b9, &w[k + (k + 1) *
+			w_dim1], &c__1, (ftnlen)12);
+		i__1 = imax - k;
+		jmax = k - 1 + idamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
+			;
+		rowmax = (d__1 = w[jmax + (k + 1) * w_dim1], abs(d__1));
+		if (imax < *n) {
+		    i__1 = *n - imax;
+		    jmax = imax + idamax_(&i__1, &w[imax + 1 + (k + 1) *
+			    w_dim1], &c__1);
+/* Computing MAX */
+		    d__2 = rowmax, d__3 = (d__1 = w[jmax + (k + 1) * w_dim1],
+			    abs(d__1));
+		    rowmax = max(d__2,d__3);
+		}
+		if (absakk >= alpha * colmax * (colmax / rowmax)) {
+		    kp = k;
+		} else if ((d__1 = w[imax + (k + 1) * w_dim1], abs(d__1)) >=
+			alpha * rowmax) {
+		    kp = imax;
+		    i__1 = *n - k + 1;
+		    dcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
+			    w_dim1], &c__1);
+		} else {
+		    kp = imax;
+		    kstep = 2;
+		}
+	    }
+	    kk = k + kstep - 1;
+	    if (kp != kk) {
+		a[kp + kp * a_dim1] = a[kk + kk * a_dim1];
+		i__1 = kp - kk - 1;
+		dcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
+			1) * a_dim1], lda);
+		if (kp < *n) {
+		    i__1 = *n - kp;
+		    dcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
+			    + kp * a_dim1], &c__1);
+		}
+		if (k > 1) {
+		    i__1 = k - 1;
+		    dswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
+		}
+		dswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
+	    }
+	    if (kstep == 1) {
+		i__1 = *n - k + 1;
+		dcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
+			c__1);
+		if (k < *n) {
+		    r1 = 1. / a[k + k * a_dim1];
+		    i__1 = *n - k;
+		    dscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
+		}
+	    } else {
+		if (k < *n - 1) {
+		    d21 = w[k + 1 + k * w_dim1];
+		    d11 = w[k + 1 + (k + 1) * w_dim1] / d21;
+		    d22 = w[k + k * w_dim1] / d21;
+		    t = 1. / (d11 * d22 - 1.);
+		    d21 = t / d21;
+		    i__1 = *n;
+		    for (j = k + 2; j <= i__1; ++j) {
+			a[j + k * a_dim1] = d21 * (d11 * w[j + k * w_dim1] -
+				w[j + (k + 1) * w_dim1]);
+			a[j + (k + 1) * a_dim1] = d21 * (d22 * w[j + (k + 1) *
+				 w_dim1] - w[j + k * w_dim1]);
+/* L80: */
+		    }
+		}
+		a[k + k * a_dim1] = w[k + k * w_dim1];
+		a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1];
+		a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1];
+	    }
+	}
+	if (kstep == 1) {
+	    ipiv[k] = kp;
+	} else {
+	    ipiv[k] = -kp;
+	    ipiv[k + 1] = -kp;
+	}
+	k += kstep;
+	goto L70;
+L90:
+	j = k - 1;
+L120:
+	jj = j;
+	jp = ipiv[j];
+	if (jp < 0) {
+	    jp = -jp;
+	    --j;
+	}
+	--j;
+	if (jp != jj && j >= 1) {
+	    dswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
+	}
+	if (j > 1) {
+	    goto L120;
+	}
+	*kb = k - 1;
+    }
+    return;
+}
diff --git a/relapack/src/dsytrf_rook.c b/relapack/src/dsytrf_rook.c
new file mode 100644
index 000000000..19a875c7a
--- /dev/null
+++ b/relapack/src/dsytrf_rook.c
@@ -0,0 +1,236 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include <stdlib.h>
+#endif
+
+static void RELAPACK_dsytrf_rook_rec(const char *, const int *, const int *, int *,
+ double *, const int *, int *, double *, const int *, int *);
+
+
+/** DSYTRF_ROOK computes the factorization of a real symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's dsytrf_rook.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/db/df4/dsytrf__rook_8f.html
+ * */
+void RELAPACK_dsytrf_rook(
+ const char *uplo, const int *n,
+ double *A, const int *ldA, int *ipiv,
+ double *Work, const int *lWork, int *info
+) {
+
+ // Minimum required workspace: n * (n/2) doubles (the recursive kernel's buffer)
+ const int cleanlWork = *n * (*n / 2);
+ int minlWork = cleanlWork;
+#if XSYTRF_ALLOW_MALLOC
+ minlWork = 1;
+#endif
+
+ // Check arguments (error numbering mirrors LAPACK's dsytrf_rook)
+ const int lower = LAPACK(lsame)(uplo, "L");
+ const int upper = LAPACK(lsame)(uplo, "U");
+ *info = 0;
+ if (!lower && !upper)
+ *info = -1;
+ else if (*n < 0)
+ *info = -2;
+ else if (*ldA < MAX(1, *n))
+ *info = -4;
+ else if (*lWork < minlWork && *lWork != -1)
+ *info = -7;
+ else if (*lWork == -1) {
+ // Workspace query (lWork == -1): report the required size in Work[0]
+ *Work = cleanlWork;
+ return;
+ }
+
+ // Ensure a sufficiently large Work buffer, heap-allocating one if permitted
+ double *cleanWork = Work;
+#if XSYTRF_ALLOW_MALLOC
+ if (!*info && *lWork < cleanlWork) {
+ cleanWork = malloc(cleanlWork * sizeof(double));
+ if (!cleanWork)
+ *info = -7;
+ }
+#endif
+
+ if (*info) {
+ const int minfo = -*info;
+ LAPACK(xerbla)("DSYTRF", &minfo); // NOTE(review): reports "DSYTRF", not "DSYTRF_ROOK" -- confirm intended
+ return;
+ }
+
+ // Canonicalize uplo to a single clean character for the kernel
+ const char cleanuplo = lower ? 'L' : 'U';
+
+ // Dummy output: number of columns actually factored (unused at the top level)
+ int nout;
+
+ // Recursive kernel; Work is used as an n-row buffer (leading dimension n)
+ RELAPACK_dsytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
+
+#if XSYTRF_ALLOW_MALLOC
+ if (cleanWork != Work)
+ free(cleanWork);
+#endif
+}
+
+
+/** dsytrf_rook's recursive compute kernel */
+static void RELAPACK_dsytrf_rook_rec(
+ const char *uplo, const int *n_full, const int *n, int *n_out,
+ double *A, const int *ldA, int *ipiv,
+ double *Work, const int *ldWork, int *info
+) {
+
+ // top recursion level? (then n is the whole matrix and Work may be reused freely)
+ const int top = *n_full == *n;
+
+ if (*n <= MAX(CROSSOVER_DSYTRF_ROOK, 3)) {
+ // Unblocked: LAPACK's dsytf2 at the top level, the rec2 kernel otherwise
+ if (top) {
+ LAPACK(dsytf2)(uplo, n, A, ldA, ipiv, info);
+ *n_out = *n;
+ } else
+ RELAPACK_dsytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
+ return;
+ }
+
+ int info1, info2;
+
+ // BLAS call constants
+ const double ONE[] = { 1. };
+ const double MONE[] = { -1. };
+ const int iONE[] = { 1 };
+
+ const int n_rest = *n_full - *n;
+
+ if (*uplo == 'L') {
+ // Splitting (setup)
+ int n1 = DREC_SPLIT(*n);
+ int n2 = *n - n1;
+
+ // Work_L *
+ double *const Work_L = Work;
+
+ // recursion(A_L): factor the left block column
+ int n1_out;
+ RELAPACK_dsytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
+ n1 = n1_out;
+
+ // Splitting (continued): n1 may have shrunk by one (2x2 pivot straddling the split)
+ n2 = *n - n1;
+ const int n_full2 = *n_full - n1;
+
+ // * *
+ // A_BL A_BR
+ // A_BL_B A_BR_B
+ double *const A_BL = A + n1;
+ double *const A_BR = A + *ldA * n1 + n1;
+ double *const A_BL_B = A + *n;
+ double *const A_BR_B = A + *ldA * n1 + *n;
+
+ // * *
+ // Work_BL Work_BR
+ // * *
+ // (top recursion level: use Work as Work_BR)
+ double *const Work_BL = Work + n1;
+ double *const Work_BR = top ? Work : Work + *ldWork * n1 + n1;
+ const int ldWork_BR = top ? n2 : *ldWork;
+
+ // ipiv_T
+ // ipiv_B
+ int *const ipiv_B = ipiv + n1;
+
+ // A_BR = A_BR - A_BL Work_BL'
+ RELAPACK_dgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
+ BLAS(dgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
+
+ // recursion(A_BR)
+ int n2_out;
+ RELAPACK_dsytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
+
+ if (n2_out != n2) {
+ // undo 1 column of updates (a 2x2 pivot crossed the recursion boundary)
+ const int n_restp1 = n_rest + 1;
+
+ // last column of A_BR
+ double *const A_BR_r = A_BR + *ldA * n2_out + n2_out;
+
+ // last row of A_BL
+ double *const A_BL_b = A_BL + n2_out;
+
+ // last row of Work_BL
+ double *const Work_BL_b = Work_BL + n2_out;
+
+ // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
+ BLAS(dgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
+ }
+ n2 = n2_out;
+
+ // shift the bottom-right block's pivots to global (whole-matrix) indices
+ int i;
+ for (i = 0; i < n2; i++)
+ if (ipiv_B[i] > 0)
+ ipiv_B[i] += n1;
+ else
+ ipiv_B[i] -= n1;
+
+ *info = info1 || info2; // NOTE(review): collapses the failing pivot index to 0/1 -- confirm intended
+ *n_out = n1 + n2;
+ } else {
+ // Splitting (setup)
+ int n2 = DREC_SPLIT(*n);
+ int n1 = *n - n2;
+
+ // * Work_R
+ // (top recursion level: use Work as Work_R)
+ double *const Work_R = top ? Work : Work + *ldWork * n1;
+
+ // recursion(A_R): factor the right block column first (upper case)
+ int n2_out;
+ RELAPACK_dsytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
+ const int n2_diff = n2 - n2_out;
+ n2 = n2_out;
+
+ // Splitting (continued): n2 may have shrunk by one (2x2 pivot straddling the split)
+ n1 = *n - n2;
+ const int n_full1 = *n_full - n2;
+
+ // * A_TL_T A_TR_T
+ // * A_TL A_TR
+ // * * *
+ double *const A_TL_T = A + *ldA * n_rest;
+ double *const A_TR_T = A + *ldA * (n_rest + n1);
+ double *const A_TL = A + *ldA * n_rest + n_rest;
+ double *const A_TR = A + *ldA * (n_rest + n1) + n_rest;
+
+ // Work_L *
+ // * Work_TR
+ // * *
+ // (top recursion level: Work_R was Work)
+ double *const Work_L = Work;
+ double *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest;
+ const int ldWork_L = top ? n1 : *ldWork;
+
+ // A_TL = A_TL - A_TR Work_TR'
+ RELAPACK_dgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
+ BLAS(dgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
+
+ // recursion(A_TL)
+ int n1_out;
+ RELAPACK_dsytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
+
+ if (n1_out != n1) {
+ // undo 1 column of updates (a 2x2 pivot crossed the recursion boundary)
+ const int n_restp1 = n_rest + 1;
+
+ // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
+ BLAS(dgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
+ }
+ n1 = n1_out;
+
+ *info = info2 || info1; // NOTE(review): collapses the failing pivot index to 0/1 -- confirm intended
+ *n_out = n1 + n2;
+ }
+}
diff --git a/relapack/src/dsytrf_rook_rec2.c b/relapack/src/dsytrf_rook_rec2.c
new file mode 100644
index 000000000..105ef5ed3
--- /dev/null
+++ b/relapack/src/dsytrf_rook_rec2.c
@@ -0,0 +1,451 @@
+/* -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static int c__1 = 1;
+static double c_b9 = -1.;
+static double c_b10 = 1.;
+
+/** DSYTRF_ROOK_REC2 computes a partial factorization of a real symmetric matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+ *
+ * This routine is a minor modification of LAPACK's dlasyf.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
+ * */
+/* Subroutine */ void RELAPACK_dsytrf_rook_rec2(char *uplo, int *n,
+ int *nb, int *kb, double *a, int *lda, int *ipiv,
+ double *w, int *ldw, int *info, ftnlen uplo_len)
+{
+ /* System generated locals */
+ int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2;
+ double d__1;
+
+ /* Builtin functions */
+ double sqrt(double);
+
+ /* Local variables */
+ static int j, k, p;
+ static double t, r1, d11, d12, d21, d22;
+ static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
+ static logical done;
+ static int imax, jmax;
+ static double alpha;
+ extern /* Subroutine */ int dscal_(int *, double *, double *,
+ int *);
+ extern logical lsame_(char *, char *, ftnlen, ftnlen);
+ extern /* Subroutine */ int dgemv_(char *, int *, int *,
+ double *, double *, int *, double *, int *,
+ double *, double *, int *, ftnlen);
+ static double dtemp, sfmin;
+ static int itemp;
+ extern /* Subroutine */ int dcopy_(int *, double *, int *,
+ double *, int *), dswap_(int *, double *, int
+ *, double *, int *);
+ static int kstep;
+ extern double dlamch_(char *, ftnlen);
+ static double absakk;
+ extern int idamax_(int *, double *, int *);
+ static double colmax, rowmax;
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+ w_dim1 = *ldw;
+ w_offset = 1 + w_dim1;
+ w -= w_offset;
+
+ /* Function Body */
+ *info = 0;
+ alpha = (sqrt(17.) + 1.) / 8.;
+ sfmin = dlamch_("S", (ftnlen)1);
+ if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+ k = *n;
+L10:
+ kw = *nb + k - *n;
+ if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
+ goto L30;
+ }
+ kstep = 1;
+ p = k;
+ dcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
+ if (k < *n) {
+ i__1 = *n - k;
+ dgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) * a_dim1 + 1],
+ lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b10, &w[kw *
+ w_dim1 + 1], &c__1, (ftnlen)12);
+ }
+ absakk = (d__1 = w[k + kw * w_dim1], abs(d__1));
+ if (k > 1) {
+ i__1 = k - 1;
+ imax = idamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+ colmax = (d__1 = w[imax + kw * w_dim1], abs(d__1));
+ } else {
+ colmax = 0.;
+ }
+ if (max(absakk,colmax) == 0.) {
+ if (*info == 0) {
+ *info = k;
+ }
+ kp = k;
+ dcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
+ } else {
+ if (! (absakk < alpha * colmax)) {
+ kp = k;
+ } else {
+ done = FALSE_;
+L12:
+ dcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
+ w_dim1 + 1], &c__1);
+ i__1 = k - imax;
+ dcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
+ 1 + (kw - 1) * w_dim1], &c__1);
+ if (k < *n) {
+ i__1 = *n - k;
+ dgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) *
+ a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
+ ldw, &c_b10, &w[(kw - 1) * w_dim1 + 1], &c__1, (
+ ftnlen)12);
+ }
+ if (imax != k) {
+ i__1 = k - imax;
+ jmax = imax + idamax_(&i__1, &w[imax + 1 + (kw - 1) *
+ w_dim1], &c__1);
+ rowmax = (d__1 = w[jmax + (kw - 1) * w_dim1], abs(d__1));
+ } else {
+ rowmax = 0.;
+ }
+ if (imax > 1) {
+ i__1 = imax - 1;
+ itemp = idamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
+ dtemp = (d__1 = w[itemp + (kw - 1) * w_dim1], abs(d__1));
+ if (dtemp > rowmax) {
+ rowmax = dtemp;
+ jmax = itemp;
+ }
+ }
+ if (! ((d__1 = w[imax + (kw - 1) * w_dim1], abs(d__1)) <
+ alpha * rowmax)) {
+ kp = imax;
+ dcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
+ w_dim1 + 1], &c__1);
+ done = TRUE_;
+ } else if (p == jmax || rowmax <= colmax) {
+ kp = imax;
+ kstep = 2;
+ done = TRUE_;
+ } else {
+ p = imax;
+ colmax = rowmax;
+ imax = jmax;
+ dcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
+ w_dim1 + 1], &c__1);
+ }
+ if (! done) {
+ goto L12;
+ }
+ }
+ kk = k - kstep + 1;
+ kkw = *nb + kk - *n;
+ if (kstep == 2 && p != k) {
+ i__1 = k - p;
+ dcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) *
+ a_dim1], lda);
+ dcopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], &
+ c__1);
+ i__1 = *n - k + 1;
+ dswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1],
+ lda);
+ i__1 = *n - kk + 1;
+ dswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1],
+ ldw);
+ }
+ if (kp != kk) {
+ a[kp + k * a_dim1] = a[kk + k * a_dim1];
+ i__1 = k - 1 - kp;
+ dcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
+ 1) * a_dim1], lda);
+ dcopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], &
+ c__1);
+ i__1 = *n - kk + 1;
+ dswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1],
+ lda);
+ i__1 = *n - kk + 1;
+ dswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
+ w_dim1], ldw);
+ }
+ if (kstep == 1) {
+ dcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
+ c__1);
+ if (k > 1) {
+ if ((d__1 = a[k + k * a_dim1], abs(d__1)) >= sfmin) {
+ r1 = 1. / a[k + k * a_dim1];
+ i__1 = k - 1;
+ dscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
+ } else if (a[k + k * a_dim1] != 0.) {
+ i__1 = k - 1;
+ for (ii = 1; ii <= i__1; ++ii) {
+ a[ii + k * a_dim1] /= a[k + k * a_dim1];
+/* L14: */
+ }
+ }
+ }
+ } else {
+ if (k > 2) {
+ d12 = w[k - 1 + kw * w_dim1];
+ d11 = w[k + kw * w_dim1] / d12;
+ d22 = w[k - 1 + (kw - 1) * w_dim1] / d12;
+ t = 1. / (d11 * d22 - 1.);
+ i__1 = k - 2;
+ for (j = 1; j <= i__1; ++j) {
+ a[j + (k - 1) * a_dim1] = t * ((d11 * w[j + (kw - 1) *
+ w_dim1] - w[j + kw * w_dim1]) / d12);
+ a[j + k * a_dim1] = t * ((d22 * w[j + kw * w_dim1] -
+ w[j + (kw - 1) * w_dim1]) / d12);
+/* L20: */
+ }
+ }
+ a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1];
+ a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1];
+ a[k + k * a_dim1] = w[k + kw * w_dim1];
+ }
+ }
+ if (kstep == 1) {
+ ipiv[k] = kp;
+ } else {
+ ipiv[k] = -p;
+ ipiv[k - 1] = -kp;
+ }
+ k -= kstep;
+ goto L10;
+L30:
+ j = k + 1;
+L60:
+ kstep = 1;
+ jp1 = 1;
+ jj = j;
+ jp2 = ipiv[j];
+ if (jp2 < 0) {
+ jp2 = -jp2;
+ ++j;
+ jp1 = -ipiv[j];
+ kstep = 2;
+ }
+ ++j;
+ if (jp2 != jj && j <= *n) {
+ i__1 = *n - j + 1;
+ dswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
+ ;
+ }
+ jj = j - 1;
+ if (jp1 != jj && kstep == 2) {
+ i__1 = *n - j + 1;
+ dswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
+ ;
+ }
+ if (j <= *n) {
+ goto L60;
+ }
+ *kb = *n - k;
+ } else {
+ k = 1;
+L70:
+ if ((k >= *nb && *nb < *n) || k > *n) {
+ goto L90;
+ }
+ kstep = 1;
+ p = k;
+ i__1 = *n - k + 1;
+ dcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
+ if (k > 1) {
+ i__1 = *n - k + 1;
+ i__2 = k - 1;
+ dgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1], lda, &
+ w[k + w_dim1], ldw, &c_b10, &w[k + k * w_dim1], &c__1, (
+ ftnlen)12);
+ }
+ absakk = (d__1 = w[k + k * w_dim1], abs(d__1));
+ if (k < *n) {
+ i__1 = *n - k;
+ imax = k + idamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
+ colmax = (d__1 = w[imax + k * w_dim1], abs(d__1));
+ } else {
+ colmax = 0.;
+ }
+ if (max(absakk,colmax) == 0.) {
+ if (*info == 0) {
+ *info = k;
+ }
+ kp = k;
+ i__1 = *n - k + 1;
+ dcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
+ c__1);
+ } else {
+ if (! (absakk < alpha * colmax)) {
+ kp = k;
+ } else {
+ done = FALSE_;
+L72:
+ i__1 = imax - k;
+ dcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
+ w_dim1], &c__1);
+ i__1 = *n - imax + 1;
+ dcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
+ 1) * w_dim1], &c__1);
+ if (k > 1) {
+ i__1 = *n - k + 1;
+ i__2 = k - 1;
+ dgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1]
+ , lda, &w[imax + w_dim1], ldw, &c_b10, &w[k + (k
+ + 1) * w_dim1], &c__1, (ftnlen)12);
+ }
+ if (imax != k) {
+ i__1 = imax - k;
+ jmax = k - 1 + idamax_(&i__1, &w[k + (k + 1) * w_dim1], &
+ c__1);
+ rowmax = (d__1 = w[jmax + (k + 1) * w_dim1], abs(d__1));
+ } else {
+ rowmax = 0.;
+ }
+ if (imax < *n) {
+ i__1 = *n - imax;
+ itemp = imax + idamax_(&i__1, &w[imax + 1 + (k + 1) *
+ w_dim1], &c__1);
+ dtemp = (d__1 = w[itemp + (k + 1) * w_dim1], abs(d__1));
+ if (dtemp > rowmax) {
+ rowmax = dtemp;
+ jmax = itemp;
+ }
+ }
+ if (! ((d__1 = w[imax + (k + 1) * w_dim1], abs(d__1)) < alpha
+ * rowmax)) {
+ kp = imax;
+ i__1 = *n - k + 1;
+ dcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
+ w_dim1], &c__1);
+ done = TRUE_;
+ } else if (p == jmax || rowmax <= colmax) {
+ kp = imax;
+ kstep = 2;
+ done = TRUE_;
+ } else {
+ p = imax;
+ colmax = rowmax;
+ imax = jmax;
+ i__1 = *n - k + 1;
+ dcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
+ w_dim1], &c__1);
+ }
+ if (! done) {
+ goto L72;
+ }
+ }
+ kk = k + kstep - 1;
+ if (kstep == 2 && p != k) {
+ i__1 = p - k;
+ dcopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1],
+ lda);
+ i__1 = *n - p + 1;
+ dcopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], &
+ c__1);
+ dswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda);
+ dswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw);
+ }
+ if (kp != kk) {
+ a[kp + k * a_dim1] = a[kk + k * a_dim1];
+ i__1 = kp - k - 1;
+ dcopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1)
+ * a_dim1], lda);
+ i__1 = *n - kp + 1;
+ dcopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp *
+ a_dim1], &c__1);
+ dswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
+ dswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
+ }
+ if (kstep == 1) {
+ i__1 = *n - k + 1;
+ dcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
+ c__1);
+ if (k < *n) {
+ if ((d__1 = a[k + k * a_dim1], abs(d__1)) >= sfmin) {
+ r1 = 1. / a[k + k * a_dim1];
+ i__1 = *n - k;
+ dscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
+ } else if (a[k + k * a_dim1] != 0.) {
+ i__1 = *n;
+ for (ii = k + 1; ii <= i__1; ++ii) {
+ a[ii + k * a_dim1] /= a[k + k * a_dim1];
+/* L74: */
+ }
+ }
+ }
+ } else {
+ if (k < *n - 1) {
+ d21 = w[k + 1 + k * w_dim1];
+ d11 = w[k + 1 + (k + 1) * w_dim1] / d21;
+ d22 = w[k + k * w_dim1] / d21;
+ t = 1. / (d11 * d22 - 1.);
+ i__1 = *n;
+ for (j = k + 2; j <= i__1; ++j) {
+ a[j + k * a_dim1] = t * ((d11 * w[j + k * w_dim1] - w[
+ j + (k + 1) * w_dim1]) / d21);
+ a[j + (k + 1) * a_dim1] = t * ((d22 * w[j + (k + 1) *
+ w_dim1] - w[j + k * w_dim1]) / d21);
+/* L80: */
+ }
+ }
+ a[k + k * a_dim1] = w[k + k * w_dim1];
+ a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1];
+ a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1];
+ }
+ }
+ if (kstep == 1) {
+ ipiv[k] = kp;
+ } else {
+ ipiv[k] = -p;
+ ipiv[k + 1] = -kp;
+ }
+ k += kstep;
+ goto L70;
+L90:
+ j = k - 1;
+L120:
+ kstep = 1;
+ jp1 = 1;
+ jj = j;
+ jp2 = ipiv[j];
+ if (jp2 < 0) {
+ jp2 = -jp2;
+ --j;
+ jp1 = -ipiv[j];
+ kstep = 2;
+ }
+ --j;
+ if (jp2 != jj && j >= 1) {
+ dswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
+ }
+ jj = j + 1;
+ if (jp1 != jj && kstep == 2) {
+ dswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
+ }
+ if (j >= 1) {
+ goto L120;
+ }
+ *kb = k - 1;
+ }
+ return;
+}
diff --git a/relapack/src/dtgsyl.c b/relapack/src/dtgsyl.c
new file mode 100644
index 000000000..c506926af
--- /dev/null
+++ b/relapack/src/dtgsyl.c
@@ -0,0 +1,274 @@
+#include "relapack.h"
+#include <math.h>
+
+static void RELAPACK_dtgsyl_rec(const char *, const int *, const int *,
+ const int *, const double *, const int *, const double *, const int *,
+ double *, const int *, const double *, const int *, const double *,
+ const int *, double *, const int *, double *, double *, double *, int *,
+ int *, int *);
+
+
+/** DTGSYL solves the generalized Sylvester equation.
+ *
+ * This routine is functionally equivalent to LAPACK's dtgsyl.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/db/d88/dtgsyl_8f.html
+ * */
+void RELAPACK_dtgsyl(
+ const char *trans, const int *ijob, const int *m, const int *n,
+ const double *A, const int *ldA, const double *B, const int *ldB,
+ double *C, const int *ldC,
+ const double *D, const int *ldD, const double *E, const int *ldE,
+ double *F, const int *ldF,
+ double *scale, double *dif,
+ double *Work, const int *lWork, int *iWork, int *info
+) {
+
+ // Parse arguments
+ const int notran = LAPACK(lsame)(trans, "N");
+ const int tran = LAPACK(lsame)(trans, "T");
+
+ // Compute work buffer size: 2*m*n only for notran with ijob 1 or 2 (Dif estimate)
+ int lwmin = 1;
+ if (notran && (*ijob == 1 || *ijob == 2))
+ lwmin = MAX(1, 2 * *m * *n);
+ *info = 0;
+
+ // Check arguments (error numbering mirrors LAPACK's dtgsyl)
+ if (!tran && !notran)
+ *info = -1;
+ else if (notran && (*ijob < 0 || *ijob > 4))
+ *info = -2;
+ else if (*m <= 0)
+ *info = -3;
+ else if (*n <= 0)
+ *info = -4;
+ else if (*ldA < MAX(1, *m))
+ *info = -6;
+ else if (*ldB < MAX(1, *n))
+ *info = -8;
+ else if (*ldC < MAX(1, *m))
+ *info = -10;
+ else if (*ldD < MAX(1, *m))
+ *info = -12;
+ else if (*ldE < MAX(1, *n))
+ *info = -14;
+ else if (*ldF < MAX(1, *m))
+ *info = -16;
+ else if (*lWork < lwmin && *lWork != -1)
+ *info = -20;
+ if (*info) {
+ const int minfo = -*info;
+ LAPACK(xerbla)("DTGSYL", &minfo);
+ return;
+ }
+
+ if (*lWork == -1) {
+ // Workspace query: report the required size in Work[0]
+ *Work = lwmin;
+ return;
+ }
+
+ // Canonicalize trans to a single clean character
+ const char cleantrans = notran ? 'N' : 'T';
+
+ // Constant for dlaset (zero both the diagonal and off-diagonal fill)
+ const double ZERO[] = { 0. };
+
+ int isolve = 1;
+ int ifunc = 0;
+ if (notran) {
+ if (*ijob >= 3) {
+ ifunc = *ijob - 2;
+ LAPACK(dlaset)("F", m, n, ZERO, ZERO, C, ldC);
+ LAPACK(dlaset)("F", m, n, ZERO, ZERO, F, ldF);
+ } else if (*ijob >= 1)
+ isolve = 2;
+ }
+
+ double scale2;
+ int iround;
+ for (iround = 1; iround <= isolve; iround++) {
+ *scale = 1;
+ double dscale = 0;
+ double dsum = 1;
+ int pq;
+ RELAPACK_dtgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, iWork, &pq, info);
+ if (dscale != 0) {
+ // Dif estimate: Frobenius-based for ijob 1/3, column-count-based otherwise
+ if (*ijob == 1 || *ijob == 3)
+ *dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum));
+ else
+ *dif = sqrt(pq) / (dscale * sqrt(dsum));
+ }
+ if (isolve == 2) {
+ if (iround == 1) {
+ // First pass solved for Dif only; stash C and F, then zero them for the real solve
+ if (notran)
+ ifunc = *ijob;
+ scale2 = *scale;
+ LAPACK(dlacpy)("F", m, n, C, ldC, Work, m);
+ LAPACK(dlacpy)("F", m, n, F, ldF, Work + *m * *n, m);
+ LAPACK(dlaset)("F", m, n, ZERO, ZERO, C, ldC);
+ LAPACK(dlaset)("F", m, n, ZERO, ZERO, F, ldF);
+ } else {
+ // Second pass done: restore the stashed solution and its scale
+ LAPACK(dlacpy)("F", m, n, Work, m, C, ldC);
+ LAPACK(dlacpy)("F", m, n, Work + *m * *n, m, F, ldF);
+ *scale = scale2;
+ }
+ }
+ }
+}
+
+
+/** dtgsyl's recursive compute kernel */
+static void RELAPACK_dtgsyl_rec(
+ const char *trans, const int *ifunc, const int *m, const int *n,
+ const double *A, const int *ldA, const double *B, const int *ldB,
+ double *C, const int *ldC,
+ const double *D, const int *ldD, const double *E, const int *ldE,
+ double *F, const int *ldF,
+ double *scale, double *dsum, double *dscale,
+ int *iWork, int *pq, int *info
+) {
+
+ if (*m <= MAX(CROSSOVER_DTGSYL, 1) && *n <= MAX(CROSSOVER_DTGSYL, 1)) {
+ // Unblocked: defer to LAPACK's level-2 solver
+ LAPACK(dtgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, iWork, pq, info);
+ return;
+ }
+
+ // BLAS call constants
+ const double ONE[] = { 1. };
+ const double MONE[] = { -1. };
+ const int iONE[] = { 1 };
+
+ // Outputs of the two recursive half-solves
+ double scale1[] = { 1. };
+ double scale2[] = { 1. };
+ int info1[] = { 0 };
+ int info2[] = { 0 };
+
+ if (*m > *n) {
+ // Splitting; never split a 2x2 diagonal block of the quasi-triangular A
+ int m1 = DREC_SPLIT(*m);
+ if (A[m1 + *ldA * (m1 - 1)])
+ m1++;
+ const int m2 = *m - m1;
+
+ // A_TL A_TR
+ // 0 A_BR
+ const double *const A_TL = A;
+ const double *const A_TR = A + *ldA * m1;
+ const double *const A_BR = A + *ldA * m1 + m1;
+
+ // C_T
+ // C_B
+ double *const C_T = C;
+ double *const C_B = C + m1;
+
+ // D_TL D_TR
+ // 0 D_BR
+ const double *const D_TL = D;
+ const double *const D_TR = D + *ldD * m1;
+ const double *const D_BR = D + *ldD * m1 + m1;
+
+ // F_T
+ // F_B
+ double *const F_T = F;
+ double *const F_B = F + m1;
+
+ if (*trans == 'N') {
+ // recursion(A_BR, B, C_B, D_BR, E, F_B)
+ RELAPACK_dtgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, iWork, pq, info1);
+ // C_T = C_T - A_TR * C_B
+ BLAS(dgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
+ // F_T = F_T - D_TR * C_B
+ BLAS(dgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF);
+ // recursion(A_TL, B, C_T, D_TL, E, F_T)
+ RELAPACK_dtgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, iWork, pq, info2);
+ // apply scale of the second solve to the already-solved bottom part
+ if (scale2[0] != 1) {
+ LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
+ LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info);
+ }
+ } else {
+ // recursion(A_TL, B, C_T, D_TL, E, F_T)
+ RELAPACK_dtgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, iWork, pq, info1);
+ // apply scale to the not-yet-consumed bottom of F
+ if (scale1[0] != 1)
+ LAPACK(dlascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info);
+ // C_B = C_B - A_TR^H * C_T
+ BLAS(dgemm)("T", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
+ // C_B = C_B - D_TR^H * F_T
+ BLAS(dgemm)("T", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldF, ONE, C_B, ldC); // FIX: F_T lives in F, so its leading dimension is ldF (was ldC)
+ // recursion(A_BR, B, C_B, D_BR, E, F_B)
+ RELAPACK_dtgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, iWork, pq, info2);
+ // apply scale of the second solve to the already-solved top part
+ if (scale2[0] != 1) {
+ LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
+ LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info);
+ }
+ }
+ } else {
+ // Splitting; never split a 2x2 diagonal block of the quasi-triangular B
+ int n1 = DREC_SPLIT(*n);
+ if (B[n1 + *ldB * (n1 - 1)])
+ n1++;
+ const int n2 = *n - n1;
+
+ // B_TL B_TR
+ // 0 B_BR
+ const double *const B_TL = B;
+ const double *const B_TR = B + *ldB * n1;
+ const double *const B_BR = B + *ldB * n1 + n1;
+
+ // C_L C_R
+ double *const C_L = C;
+ double *const C_R = C + *ldC * n1;
+
+ // E_TL E_TR
+ // 0 E_BR
+ const double *const E_TL = E;
+ const double *const E_TR = E + *ldE * n1;
+ const double *const E_BR = E + *ldE * n1 + n1;
+
+ // F_L F_R
+ double *const F_L = F;
+ double *const F_R = F + *ldF * n1;
+
+ if (*trans == 'N') {
+ // recursion(A, B_TL, C_L, D, E_TL, F_L)
+ RELAPACK_dtgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, iWork, pq, info1);
+ // C_R = C_R + F_L * B_TR
+ BLAS(dgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC);
+ // F_R = F_R + F_L * E_TR
+ BLAS(dgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF);
+ // recursion(A, B_BR, C_R, D, E_BR, F_R)
+ RELAPACK_dtgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, iWork, pq, info2);
+ // apply scale of the second solve to the already-solved left part
+ if (scale2[0] != 1) {
+ LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
+ LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info);
+ }
+ } else {
+ // recursion(A, B_BR, C_R, D, E_BR, F_R)
+ RELAPACK_dtgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, iWork, pq, info1);
+ // apply scale to the not-yet-solved left part of C
+ if (scale1[0] != 1)
+ LAPACK(dlascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info);
+ // F_L = F_L + C_R * B_TR'
+ BLAS(dgemm)("N", "T", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF);
+ // F_L = F_L + F_R * E_TR'
+ BLAS(dgemm)("N", "T", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldE, ONE, F_L, ldF); // FIX: E_TR lives in E, so its leading dimension is ldE (was ldB)
+ // recursion(A, B_TL, C_L, D, E_TL, F_L)
+ RELAPACK_dtgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, iWork, pq, info2);
+ // apply scale of the second solve to the already-solved right part
+ if (scale2[0] != 1) {
+ LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
+ LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info);
+ }
+ }
+ }
+
+ *scale = scale1[0] * scale2[0];
+ *info = info1[0] || info2[0];
+}
diff --git a/relapack/src/dtrsyl.c b/relapack/src/dtrsyl.c
new file mode 100644
index 000000000..c87b53ae5
--- /dev/null
+++ b/relapack/src/dtrsyl.c
@@ -0,0 +1,169 @@
+#include "relapack.h"
+
+static void RELAPACK_dtrsyl_rec(const char *, const char *, const int *,
+ const int *, const int *, const double *, const int *, const double *,
+ const int *, double *, const int *, double *, int *);
+
+
+/** DTRSYL solves the real Sylvester matrix equation.
+ *
+ * This routine is functionally equivalent to LAPACK's dtrsyl.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d6/d43/dtrsyl_8f.html
+ * */
+void RELAPACK_dtrsyl(
+ const char *tranA, const char *tranB, const int *isgn,
+ const int *m, const int *n,
+ const double *A, const int *ldA, const double *B, const int *ldB,
+ double *C, const int *ldC, double *scale,
+ int *info
+) {
+
+ // Check arguments (error numbering mirrors LAPACK's dtrsyl)
+ const int notransA = LAPACK(lsame)(tranA, "N");
+ const int transA = LAPACK(lsame)(tranA, "T");
+ const int ctransA = LAPACK(lsame)(tranA, "C");
+ const int notransB = LAPACK(lsame)(tranB, "N");
+ const int transB = LAPACK(lsame)(tranB, "T");
+ const int ctransB = LAPACK(lsame)(tranB, "C");
+ *info = 0;
+ if (!transA && !ctransA && !notransA)
+ *info = -1;
+ else if (!transB && !ctransB && !notransB)
+ *info = -2;
+ else if (*isgn != 1 && *isgn != -1)
+ *info = -3;
+ else if (*m < 0)
+ *info = -4;
+ else if (*n < 0)
+ *info = -5;
+ else if (*ldA < MAX(1, *m))
+ *info = -7;
+ else if (*ldB < MAX(1, *n))
+ *info = -9;
+ else if (*ldC < MAX(1, *m))
+ *info = -11;
+ if (*info) {
+ const int minfo = -*info;
+ LAPACK(xerbla)("DTRSYL", &minfo);
+ return;
+ }
+
+ // Canonicalize each trans flag to a single clean character
+ const char cleantranA = notransA ? 'N' : (transA ? 'T' : 'C');
+ const char cleantranB = notransB ? 'N' : (transB ? 'T' : 'C');
+
+ // Recursive kernel does the actual solve
+ RELAPACK_dtrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
+}
+
+
+/** dtrsyl's recursive compute kernel */
+static void RELAPACK_dtrsyl_rec(
+ const char *tranA, const char *tranB, const int *isgn,
+ const int *m, const int *n,
+ const double *A, const int *ldA, const double *B, const int *ldB,
+ double *C, const int *ldC, double *scale,
+ int *info
+) {
+
+ if (*m <= MAX(CROSSOVER_DTRSYL, 1) && *n <= MAX(CROSSOVER_DTRSYL, 1)) {
+ // Unblocked: defer to the f2c-translated level-2 kernel
+ RELAPACK_dtrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
+ return;
+ }
+
+ // BLAS call constants
+ const double ONE[] = { 1. };
+ const double MONE[] = { -1. };
+ const double MSGN[] = { -*isgn };
+ const int iONE[] = { 1 };
+
+ // Outputs of the two recursive half-solves
+ double scale1[] = { 1. };
+ double scale2[] = { 1. };
+ int info1[] = { 0 };
+ int info2[] = { 0 };
+
+ if (*m > *n) {
+ // Splitting; never split a 2x2 diagonal block of the quasi-triangular A
+ int m1 = DREC_SPLIT(*m);
+ if (A[m1 + *ldA * (m1 - 1)])
+ m1++;
+ const int m2 = *m - m1;
+
+ // A_TL A_TR
+ // 0 A_BR
+ const double *const A_TL = A;
+ const double *const A_TR = A + *ldA * m1;
+ const double *const A_BR = A + *ldA * m1 + m1;
+
+ // C_T
+ // C_B
+ double *const C_T = C;
+ double *const C_B = C + m1;
+
+ if (*tranA == 'N') {
+ // recursion(A_BR, B, C_B)
+ RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1);
+ // C_T = C_T - A_TR * C_B
+ BLAS(dgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
+ // recursion(A_TL, B, C_T)
+ RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2);
+ // apply scale of the second solve to the already-solved bottom part
+ if (scale2[0] != 1)
+ LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
+ } else {
+ // recursion(A_TL, B, C_T)
+ RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1);
+ // C_B = C_B - A_TR' * C_T
+ BLAS(dgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
+ // recursion(A_BR, B, C_B)
+ RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2);
+ // apply scale of the second solve to the already-solved top part
+ if (scale2[0] != 1)
+ LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info); // FIX: rescale the previously solved C_T (m1 rows), not C_B
+ }
+ } else {
+ // Splitting; never split a 2x2 diagonal block of the quasi-triangular B
+ int n1 = DREC_SPLIT(*n);
+ if (B[n1 + *ldB * (n1 - 1)])
+ n1++;
+ const int n2 = *n - n1;
+
+ // B_TL B_TR
+ // 0 B_BR
+ const double *const B_TL = B;
+ const double *const B_TR = B + *ldB * n1;
+ const double *const B_BR = B + *ldB * n1 + n1;
+
+ // C_L C_R
+ double *const C_L = C;
+ double *const C_R = C + *ldC * n1;
+
+ if (*tranB == 'N') {
+ // recursion(A, B_TL, C_L)
+ RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1);
+ // C_R = C_R -/+ C_L * B_TR
+ BLAS(dgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC);
+ // recursion(A, B_BR, C_R)
+ RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2);
+ // apply scale of the second solve to the already-solved left part
+ if (scale2[0] != 1)
+ LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
+ } else {
+ // recursion(A, B_BR, C_R)
+ RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1);
+ // C_L = C_L -/+ C_R * B_TR'
+ BLAS(dgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC);
+ // recursion(A, B_TL, C_L)
+ RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2);
+ // apply scale of the second solve to the already-solved right part
+ if (scale2[0] != 1)
+ LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
+ }
+ }
+
+ *scale = scale1[0] * scale2[0];
+ *info = info1[0] || info2[0];
+}
diff --git a/relapack/src/dtrsyl_rec2.c b/relapack/src/dtrsyl_rec2.c
new file mode 100644
index 000000000..479c7f340
--- /dev/null
+++ b/relapack/src/dtrsyl_rec2.c
@@ -0,0 +1,1034 @@
+/* -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static int c__1 = 1; /* unit stride / dimension 1 for BLAS and LAPACK calls */
+static int c_false = FALSE_; /* Fortran logical .FALSE. */
+static int c__2 = 2; /* dimension 2 for the 2x2 subproblems (dlaln2/dlasy2) */
+static double c_b26 = 1.; /* double constant 1.0 */
+static double c_b30 = 0.; /* double constant 0.0 */
+static int c_true = TRUE_; /* Fortran logical .TRUE. */
+
+int RELAPACK_dtrsyl_rec2(char *trana, char *tranb, int *isgn, int
+    *m, int *n, double *a, int *lda, double *b, int *
+    ldb, double *c__, int *ldc, double *scale, int *info,
+    ftnlen trana_len, ftnlen tranb_len)
+{ /* f2c-translated unblocked DTRSYL: solves op(A)*X + ISGN*X*op(B) = scale*C */
+    /* System generated locals */
+    int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
+            i__3, i__4;
+    double d__1, d__2;
+
+    /* Local variables */
+    static int j, k, l;
+    static double x[4]	/* was [2][2] */;
+    static int k1, k2, l1, l2;
+    static double a11, db, da11, vec[4]	/* was [2][2] */, dum[1], eps,
+             sgn;
+    extern double ddot_(int *, double *, int *, double *,
+            int *);
+    static int ierr;
+    static double smin, suml, sumr;
+    extern /* Subroutine */ int dscal_(int *, double *, double *,
+            int *);
+    extern int lsame_(char *, char *, ftnlen, ftnlen);
+    static int knext, lnext;
+    static double xnorm;
+    extern /* Subroutine */ int dlaln2_(int *, int *, int *,
+            double *, double *, double *, int *, double *,
+            double *, double *, int *, double *, double *
+            , double *, int *, double *, double *, int *),
+             dlasy2_(int *, int *, int *, int *, int *,
+            double *, int *, double *, int *, double *,
+            int *, double *, double *, int *, double *,
+            int *), dlabad_(double *, double *);
+    extern double dlamch_(char *, ftnlen), dlange_(char *, int *,
+            int *, double *, int *, double *, ftnlen);
+    static double scaloc;
+    extern /* Subroutine */ int xerbla_(char *, int *, ftnlen);
+    static double bignum;
+    static int notrna, notrnb;
+    static double smlnum;
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1;
+    b -= b_offset;
+    c_dim1 = *ldc;
+    c_offset = 1 + c_dim1;
+    c__ -= c_offset;
+
+    /* Function Body */
+    notrna = lsame_(trana, "N", (ftnlen)1, (ftnlen)1);
+    notrnb = lsame_(tranb, "N", (ftnlen)1, (ftnlen)1);
+    *info = 0;
+    if (! notrna && ! lsame_(trana, "T", (ftnlen)1, (ftnlen)1) && ! lsame_(
+            trana, "C", (ftnlen)1, (ftnlen)1)) {
+        *info = -1;
+    } else if (! notrnb && ! lsame_(tranb, "T", (ftnlen)1, (ftnlen)1) && !
+            lsame_(tranb, "C", (ftnlen)1, (ftnlen)1)) {
+        *info = -2;
+    } else if (*isgn != 1 && *isgn != -1) {
+        *info = -3;
+    } else if (*m < 0) {
+        *info = -4;
+    } else if (*n < 0) {
+        *info = -5;
+    } else if (*lda < max(1,*m)) {
+        *info = -7;
+    } else if (*ldb < max(1,*n)) {
+        *info = -9;
+    } else if (*ldc < max(1,*m)) {
+        *info = -11;
+    }
+    if (*info != 0) {
+        i__1 = -(*info);
+        xerbla_("DTRSYL", &i__1, (ftnlen)6);
+        return 0;
+    }
+    *scale = 1.; /* default: no scaling applied */
+    if (*m == 0 || *n == 0) {
+        return 0;
+    }
+    eps = dlamch_("P", (ftnlen)1);
+    smlnum = dlamch_("S", (ftnlen)1);
+    bignum = 1. / smlnum;
+    dlabad_(&smlnum, &bignum);
+    smlnum = smlnum * (double) (*m * *n) / eps;
+    bignum = 1. / smlnum;
+/* Computing MAX */
+    d__1 = smlnum, d__2 = eps * dlange_("M", m, m, &a[a_offset], lda, dum, (
+            ftnlen)1), d__1 = max(d__1,d__2), d__2 = eps * dlange_("M", n, n,
+            &b[b_offset], ldb, dum, (ftnlen)1);
+    smin = max(d__1,d__2);
+    sgn = (double) (*isgn);
+    if (notrna && notrnb) { /* Case 1: A*X + ISGN*X*B = scale*C */
+        lnext = 1;
+        i__1 = *n;
+        for (l = 1; l <= i__1; ++l) {
+            if (l < lnext) {
+                goto L60;
+            }
+            if (l == *n) {
+                l1 = l;
+                l2 = l;
+            } else {
+                if (b[l + 1 + l * b_dim1] != 0.) {
+                    l1 = l;
+                    l2 = l + 1;
+                    lnext = l + 2;
+                } else {
+                    l1 = l;
+                    l2 = l;
+                    lnext = l + 1;
+                }
+            }
+            knext = *m;
+            for (k = *m; k >= 1; --k) {
+                if (k > knext) {
+                    goto L50;
+                }
+                if (k == 1) {
+                    k1 = k;
+                    k2 = k;
+                } else {
+                    if (a[k + (k - 1) * a_dim1] != 0.) {
+                        k1 = k - 1;
+                        k2 = k;
+                        knext = k - 2;
+                    } else {
+                        k1 = k;
+                        k2 = k;
+                        knext = k - 1;
+                    }
+                }
+                if (l1 == l2 && k1 == k2) {
+                    i__2 = *m - k1;
+/* Computing MIN */
+                    i__3 = k1 + 1;
+/* Computing MIN */
+                    i__4 = k1 + 1;
+                    suml = ddot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, &
+                            c__[min(i__4,*m) + l1 * c_dim1], &c__1);
+                    i__2 = l1 - 1;
+                    sumr = ddot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 *
+                            b_dim1 + 1], &c__1);
+                    vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr);
+                    scaloc = 1.;
+                    a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1];
+                    da11 = abs(a11);
+                    if (da11 <= smin) {
+                        a11 = smin;
+                        da11 = smin;
+                        *info = 1;
+                    }
+                    db = abs(vec[0]);
+                    if (da11 < 1. && db > 1.) {
+                        if (db > bignum * da11) {
+                            scaloc = 1. / db;
+                        }
+                    }
+                    x[0] = vec[0] * scaloc / a11;
+                    if (scaloc != 1.) {
+                        i__2 = *n;
+                        for (j = 1; j <= i__2; ++j) {
+                            dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L10: */
+                        }
+                        *scale *= scaloc;
+                    }
+                    c__[k1 + l1 * c_dim1] = x[0];
+                } else if (l1 == l2 && k1 != k2) {
+                    i__2 = *m - k2;
+/* Computing MIN */
+                    i__3 = k2 + 1;
+/* Computing MIN */
+                    i__4 = k2 + 1;
+                    suml = ddot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, &
+                            c__[min(i__4,*m) + l1 * c_dim1], &c__1);
+                    i__2 = l1 - 1;
+                    sumr = ddot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 *
+                            b_dim1 + 1], &c__1);
+                    vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr);
+                    i__2 = *m - k2;
+/* Computing MIN */
+                    i__3 = k2 + 1;
+/* Computing MIN */
+                    i__4 = k2 + 1;
+                    suml = ddot_(&i__2, &a[k2 + min(i__3,*m) * a_dim1], lda, &
+                            c__[min(i__4,*m) + l1 * c_dim1], &c__1);
+                    i__2 = l1 - 1;
+                    sumr = ddot_(&i__2, &c__[k2 + c_dim1], ldc, &b[l1 *
+                            b_dim1 + 1], &c__1);
+                    vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr);
+                    d__1 = -sgn * b[l1 + l1 * b_dim1];
+                    dlaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1
+                            * a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &d__1,
+                             &c_b30, x, &c__2, &scaloc, &xnorm, &ierr);
+                    if (ierr != 0) {
+                        *info = 1;
+                    }
+                    if (scaloc != 1.) {
+                        i__2 = *n;
+                        for (j = 1; j <= i__2; ++j) {
+                            dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L20: */
+                        }
+                        *scale *= scaloc;
+                    }
+                    c__[k1 + l1 * c_dim1] = x[0];
+                    c__[k2 + l1 * c_dim1] = x[1];
+                } else if (l1 != l2 && k1 == k2) {
+                    i__2 = *m - k1;
+/* Computing MIN */
+                    i__3 = k1 + 1;
+/* Computing MIN */
+                    i__4 = k1 + 1;
+                    suml = ddot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, &
+                            c__[min(i__4,*m) + l1 * c_dim1], &c__1);
+                    i__2 = l1 - 1;
+                    sumr = ddot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 *
+                            b_dim1 + 1], &c__1);
+                    vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn *
+                            sumr));
+                    i__2 = *m - k1;
+/* Computing MIN */
+                    i__3 = k1 + 1;
+/* Computing MIN */
+                    i__4 = k1 + 1;
+                    suml = ddot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, &
+                            c__[min(i__4,*m) + l2 * c_dim1], &c__1);
+                    i__2 = l1 - 1;
+                    sumr = ddot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l2 *
+                            b_dim1 + 1], &c__1);
+                    vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn *
+                            sumr));
+                    d__1 = -sgn * a[k1 + k1 * a_dim1];
+                    dlaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1 *
+                             b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &d__1,
+                            &c_b30, x, &c__2, &scaloc, &xnorm, &ierr);
+                    if (ierr != 0) {
+                        *info = 1;
+                    }
+                    if (scaloc != 1.) {
+                        i__2 = *n;
+                        for (j = 1; j <= i__2; ++j) {
+                            dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L30: */
+                        }
+                        *scale *= scaloc;
+                    }
+                    c__[k1 + l1 * c_dim1] = x[0];
+                    c__[k1 + l2 * c_dim1] = x[1];
+                } else if (l1 != l2 && k1 != k2) {
+                    i__2 = *m - k2;
+/* Computing MIN */
+                    i__3 = k2 + 1;
+/* Computing MIN */
+                    i__4 = k2 + 1;
+                    suml = ddot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, &
+                            c__[min(i__4,*m) + l1 * c_dim1], &c__1);
+                    i__2 = l1 - 1;
+                    sumr = ddot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 *
+                            b_dim1 + 1], &c__1);
+                    vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr);
+                    i__2 = *m - k2;
+/* Computing MIN */
+                    i__3 = k2 + 1;
+/* Computing MIN */
+                    i__4 = k2 + 1;
+                    suml = ddot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, &
+                            c__[min(i__4,*m) + l2 * c_dim1], &c__1);
+                    i__2 = l1 - 1;
+                    sumr = ddot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l2 *
+                            b_dim1 + 1], &c__1);
+                    vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr);
+                    i__2 = *m - k2;
+/* Computing MIN */
+                    i__3 = k2 + 1;
+/* Computing MIN */
+                    i__4 = k2 + 1;
+                    suml = ddot_(&i__2, &a[k2 + min(i__3,*m) * a_dim1], lda, &
+                            c__[min(i__4,*m) + l1 * c_dim1], &c__1);
+                    i__2 = l1 - 1;
+                    sumr = ddot_(&i__2, &c__[k2 + c_dim1], ldc, &b[l1 *
+                            b_dim1 + 1], &c__1);
+                    vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr);
+                    i__2 = *m - k2;
+/* Computing MIN */
+                    i__3 = k2 + 1;
+/* Computing MIN */
+                    i__4 = k2 + 1;
+                    suml = ddot_(&i__2, &a[k2 + min(i__3,*m) * a_dim1], lda, &
+                            c__[min(i__4,*m) + l2 * c_dim1], &c__1);
+                    i__2 = l1 - 1;
+                    sumr = ddot_(&i__2, &c__[k2 + c_dim1], ldc, &b[l2 *
+                            b_dim1 + 1], &c__1);
+                    vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr);
+                    dlasy2_(&c_false, &c_false, isgn, &c__2, &c__2, &a[k1 +
+                            k1 * a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec,
+                             &c__2, &scaloc, x, &c__2, &xnorm, &ierr);
+                    if (ierr != 0) {
+                        *info = 1;
+                    }
+                    if (scaloc != 1.) {
+                        i__2 = *n;
+                        for (j = 1; j <= i__2; ++j) {
+                            dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L40: */
+                        }
+                        *scale *= scaloc;
+                    }
+                    c__[k1 + l1 * c_dim1] = x[0];
+                    c__[k1 + l2 * c_dim1] = x[2];
+                    c__[k2 + l1 * c_dim1] = x[1];
+                    c__[k2 + l2 * c_dim1] = x[3];
+                }
+L50:
+                ;
+            }
+L60:
+            ;
+        }
+    } else if (! notrna && notrnb) { /* Case 2: A'*X + ISGN*X*B = scale*C */
+        lnext = 1;
+        i__1 = *n;
+        for (l = 1; l <= i__1; ++l) {
+            if (l < lnext) {
+                goto L120;
+            }
+            if (l == *n) {
+                l1 = l;
+                l2 = l;
+            } else {
+                if (b[l + 1 + l * b_dim1] != 0.) {
+                    l1 = l;
+                    l2 = l + 1;
+                    lnext = l + 2;
+                } else {
+                    l1 = l;
+                    l2 = l;
+                    lnext = l + 1;
+                }
+            }
+            knext = 1;
+            i__2 = *m;
+            for (k = 1; k <= i__2; ++k) {
+                if (k < knext) {
+                    goto L110;
+                }
+                if (k == *m) {
+                    k1 = k;
+                    k2 = k;
+                } else {
+                    if (a[k + 1 + k * a_dim1] != 0.) {
+                        k1 = k;
+                        k2 = k + 1;
+                        knext = k + 2;
+                    } else {
+                        k1 = k;
+                        k2 = k;
+                        knext = k + 1;
+                    }
+                }
+                if (l1 == l2 && k1 == k2) {
+                    i__3 = k1 - 1;
+                    suml = ddot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 *
+                            c_dim1 + 1], &c__1);
+                    i__3 = l1 - 1;
+                    sumr = ddot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 *
+                            b_dim1 + 1], &c__1);
+                    vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr);
+                    scaloc = 1.;
+                    a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1];
+                    da11 = abs(a11);
+                    if (da11 <= smin) {
+                        a11 = smin;
+                        da11 = smin;
+                        *info = 1;
+                    }
+                    db = abs(vec[0]);
+                    if (da11 < 1. && db > 1.) {
+                        if (db > bignum * da11) {
+                            scaloc = 1. / db;
+                        }
+                    }
+                    x[0] = vec[0] * scaloc / a11;
+                    if (scaloc != 1.) {
+                        i__3 = *n;
+                        for (j = 1; j <= i__3; ++j) {
+                            dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L70: */
+                        }
+                        *scale *= scaloc;
+                    }
+                    c__[k1 + l1 * c_dim1] = x[0];
+                } else if (l1 == l2 && k1 != k2) {
+                    i__3 = k1 - 1;
+                    suml = ddot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 *
+                            c_dim1 + 1], &c__1);
+                    i__3 = l1 - 1;
+                    sumr = ddot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 *
+                            b_dim1 + 1], &c__1);
+                    vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr);
+                    i__3 = k1 - 1;
+                    suml = ddot_(&i__3, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 *
+                            c_dim1 + 1], &c__1);
+                    i__3 = l1 - 1;
+                    sumr = ddot_(&i__3, &c__[k2 + c_dim1], ldc, &b[l1 *
+                            b_dim1 + 1], &c__1);
+                    vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr);
+                    d__1 = -sgn * b[l1 + l1 * b_dim1];
+                    dlaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1 *
+                             a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &d__1,
+                            &c_b30, x, &c__2, &scaloc, &xnorm, &ierr);
+                    if (ierr != 0) {
+                        *info = 1;
+                    }
+                    if (scaloc != 1.) {
+                        i__3 = *n;
+                        for (j = 1; j <= i__3; ++j) {
+                            dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L80: */
+                        }
+                        *scale *= scaloc;
+                    }
+                    c__[k1 + l1 * c_dim1] = x[0];
+                    c__[k2 + l1 * c_dim1] = x[1];
+                } else if (l1 != l2 && k1 == k2) {
+                    i__3 = k1 - 1;
+                    suml = ddot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 *
+                            c_dim1 + 1], &c__1);
+                    i__3 = l1 - 1;
+                    sumr = ddot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 *
+                            b_dim1 + 1], &c__1);
+                    vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn *
+                            sumr));
+                    i__3 = k1 - 1;
+                    suml = ddot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 *
+                            c_dim1 + 1], &c__1);
+                    i__3 = l1 - 1;
+                    sumr = ddot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l2 *
+                            b_dim1 + 1], &c__1);
+                    vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn *
+                            sumr));
+                    d__1 = -sgn * a[k1 + k1 * a_dim1];
+                    dlaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1 *
+                             b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &d__1,
+                            &c_b30, x, &c__2, &scaloc, &xnorm, &ierr);
+                    if (ierr != 0) {
+                        *info = 1;
+                    }
+                    if (scaloc != 1.) {
+                        i__3 = *n;
+                        for (j = 1; j <= i__3; ++j) {
+                            dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L90: */
+                        }
+                        *scale *= scaloc;
+                    }
+                    c__[k1 + l1 * c_dim1] = x[0];
+                    c__[k1 + l2 * c_dim1] = x[1];
+                } else if (l1 != l2 && k1 != k2) {
+                    i__3 = k1 - 1;
+                    suml = ddot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 *
+                            c_dim1 + 1], &c__1);
+                    i__3 = l1 - 1;
+                    sumr = ddot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 *
+                            b_dim1 + 1], &c__1);
+                    vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr);
+                    i__3 = k1 - 1;
+                    suml = ddot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 *
+                            c_dim1 + 1], &c__1);
+                    i__3 = l1 - 1;
+                    sumr = ddot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l2 *
+                            b_dim1 + 1], &c__1);
+                    vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr);
+                    i__3 = k1 - 1;
+                    suml = ddot_(&i__3, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 *
+                            c_dim1 + 1], &c__1);
+                    i__3 = l1 - 1;
+                    sumr = ddot_(&i__3, &c__[k2 + c_dim1], ldc, &b[l1 *
+                            b_dim1 + 1], &c__1);
+                    vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr);
+                    i__3 = k1 - 1;
+                    suml = ddot_(&i__3, &a[k2 * a_dim1 + 1], &c__1, &c__[l2 *
+                            c_dim1 + 1], &c__1);
+                    i__3 = l1 - 1;
+                    sumr = ddot_(&i__3, &c__[k2 + c_dim1], ldc, &b[l2 *
+                            b_dim1 + 1], &c__1);
+                    vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr);
+                    dlasy2_(&c_true, &c_false, isgn, &c__2, &c__2, &a[k1 + k1
+                            * a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec, &
+                            c__2, &scaloc, x, &c__2, &xnorm, &ierr);
+                    if (ierr != 0) {
+                        *info = 1;
+                    }
+                    if (scaloc != 1.) {
+                        i__3 = *n;
+                        for (j = 1; j <= i__3; ++j) {
+                            dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L100: */
+                        }
+                        *scale *= scaloc;
+                    }
+                    c__[k1 + l1 * c_dim1] = x[0];
+                    c__[k1 + l2 * c_dim1] = x[2];
+                    c__[k2 + l1 * c_dim1] = x[1];
+                    c__[k2 + l2 * c_dim1] = x[3];
+                }
+L110:
+                ;
+            }
+L120:
+            ;
+        }
+    } else if (! notrna && ! notrnb) { /* Case 3: A'*X + ISGN*X*B' = scale*C */
+        lnext = *n;
+        for (l = *n; l >= 1; --l) {
+            if (l > lnext) {
+                goto L180;
+            }
+            if (l == 1) {
+                l1 = l;
+                l2 = l;
+            } else {
+                if (b[l + (l - 1) * b_dim1] != 0.) {
+                    l1 = l - 1;
+                    l2 = l;
+                    lnext = l - 2;
+                } else {
+                    l1 = l;
+                    l2 = l;
+                    lnext = l - 1;
+                }
+            }
+            knext = 1;
+            i__1 = *m;
+            for (k = 1; k <= i__1; ++k) {
+                if (k < knext) {
+                    goto L170;
+                }
+                if (k == *m) {
+                    k1 = k;
+                    k2 = k;
+                } else {
+                    if (a[k + 1 + k * a_dim1] != 0.) {
+                        k1 = k;
+                        k2 = k + 1;
+                        knext = k + 2;
+                    } else {
+                        k1 = k;
+                        k2 = k;
+                        knext = k + 1;
+                    }
+                }
+                if (l1 == l2 && k1 == k2) {
+                    i__2 = k1 - 1;
+                    suml = ddot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 *
+                            c_dim1 + 1], &c__1);
+                    i__2 = *n - l1;
+/* Computing MIN */
+                    i__3 = l1 + 1;
+/* Computing MIN */
+                    i__4 = l1 + 1;
+                    sumr = ddot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc,
+                             &b[l1 + min(i__4,*n) * b_dim1], ldb);
+                    vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr);
+                    scaloc = 1.;
+                    a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1];
+                    da11 = abs(a11);
+                    if (da11 <= smin) {
+                        a11 = smin;
+                        da11 = smin;
+                        *info = 1;
+                    }
+                    db = abs(vec[0]);
+                    if (da11 < 1. && db > 1.) {
+                        if (db > bignum * da11) {
+                            scaloc = 1. / db;
+                        }
+                    }
+                    x[0] = vec[0] * scaloc / a11;
+                    if (scaloc != 1.) {
+                        i__2 = *n;
+                        for (j = 1; j <= i__2; ++j) {
+                            dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L130: */
+                        }
+                        *scale *= scaloc;
+                    }
+                    c__[k1 + l1 * c_dim1] = x[0];
+                } else if (l1 == l2 && k1 != k2) {
+                    i__2 = k1 - 1;
+                    suml = ddot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 *
+                            c_dim1 + 1], &c__1);
+                    i__2 = *n - l2;
+/* Computing MIN */
+                    i__3 = l2 + 1;
+/* Computing MIN */
+                    i__4 = l2 + 1;
+                    sumr = ddot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc,
+                             &b[l1 + min(i__4,*n) * b_dim1], ldb);
+                    vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr);
+                    i__2 = k1 - 1;
+                    suml = ddot_(&i__2, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 *
+                            c_dim1 + 1], &c__1);
+                    i__2 = *n - l2;
+/* Computing MIN */
+                    i__3 = l2 + 1;
+/* Computing MIN */
+                    i__4 = l2 + 1;
+                    sumr = ddot_(&i__2, &c__[k2 + min(i__3,*n) * c_dim1], ldc,
+                             &b[l1 + min(i__4,*n) * b_dim1], ldb);
+                    vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr);
+                    d__1 = -sgn * b[l1 + l1 * b_dim1];
+                    dlaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1 *
+                             a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &d__1,
+                            &c_b30, x, &c__2, &scaloc, &xnorm, &ierr);
+                    if (ierr != 0) {
+                        *info = 1;
+                    }
+                    if (scaloc != 1.) {
+                        i__2 = *n;
+                        for (j = 1; j <= i__2; ++j) {
+                            dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L140: */
+                        }
+                        *scale *= scaloc;
+                    }
+                    c__[k1 + l1 * c_dim1] = x[0];
+                    c__[k2 + l1 * c_dim1] = x[1];
+                } else if (l1 != l2 && k1 == k2) {
+                    i__2 = k1 - 1;
+                    suml = ddot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 *
+                            c_dim1 + 1], &c__1);
+                    i__2 = *n - l2;
+/* Computing MIN */
+                    i__3 = l2 + 1;
+/* Computing MIN */
+                    i__4 = l2 + 1;
+                    sumr = ddot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc,
+                             &b[l1 + min(i__4,*n) * b_dim1], ldb);
+                    vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn *
+                            sumr));
+                    i__2 = k1 - 1;
+                    suml = ddot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 *
+                            c_dim1 + 1], &c__1);
+                    i__2 = *n - l2;
+/* Computing MIN */
+                    i__3 = l2 + 1;
+/* Computing MIN */
+                    i__4 = l2 + 1;
+                    sumr = ddot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc,
+                             &b[l2 + min(i__4,*n) * b_dim1], ldb);
+                    vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn *
+                            sumr));
+                    d__1 = -sgn * a[k1 + k1 * a_dim1];
+                    dlaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1
+                            * b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &d__1,
+                            &c_b30, x, &c__2, &scaloc, &xnorm, &ierr);
+                    if (ierr != 0) {
+                        *info = 1;
+                    }
+                    if (scaloc != 1.) {
+                        i__2 = *n;
+                        for (j = 1; j <= i__2; ++j) {
+                            dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L150: */
+                        }
+                        *scale *= scaloc;
+                    }
+                    c__[k1 + l1 * c_dim1] = x[0];
+                    c__[k1 + l2 * c_dim1] = x[1];
+                } else if (l1 != l2 && k1 != k2) {
+                    i__2 = k1 - 1;
+                    suml = ddot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 *
+                            c_dim1 + 1], &c__1);
+                    i__2 = *n - l2;
+/* Computing MIN */
+                    i__3 = l2 + 1;
+/* Computing MIN */
+                    i__4 = l2 + 1;
+                    sumr = ddot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc,
+                             &b[l1 + min(i__4,*n) * b_dim1], ldb);
+                    vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr);
+                    i__2 = k1 - 1;
+                    suml = ddot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 *
+                            c_dim1 + 1], &c__1);
+                    i__2 = *n - l2;
+/* Computing MIN */
+                    i__3 = l2 + 1;
+/* Computing MIN */
+                    i__4 = l2 + 1;
+                    sumr = ddot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc,
+                             &b[l2 + min(i__4,*n) * b_dim1], ldb);
+                    vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr);
+                    i__2 = k1 - 1;
+                    suml = ddot_(&i__2, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 *
+                            c_dim1 + 1], &c__1);
+                    i__2 = *n - l2;
+/* Computing MIN */
+                    i__3 = l2 + 1;
+/* Computing MIN */
+                    i__4 = l2 + 1;
+                    sumr = ddot_(&i__2, &c__[k2 + min(i__3,*n) * c_dim1], ldc,
+                             &b[l1 + min(i__4,*n) * b_dim1], ldb);
+                    vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr);
+                    i__2 = k1 - 1;
+                    suml = ddot_(&i__2, &a[k2 * a_dim1 + 1], &c__1, &c__[l2 *
+                            c_dim1 + 1], &c__1);
+                    i__2 = *n - l2;
+/* Computing MIN */
+                    i__3 = l2 + 1;
+/* Computing MIN */
+                    i__4 = l2 + 1;
+                    sumr = ddot_(&i__2, &c__[k2 + min(i__3,*n) * c_dim1], ldc,
+                             &b[l2 + min(i__4,*n) * b_dim1], ldb);
+                    vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr);
+                    dlasy2_(&c_true, &c_true, isgn, &c__2, &c__2, &a[k1 + k1 *
+                             a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec, &
+                            c__2, &scaloc, x, &c__2, &xnorm, &ierr);
+                    if (ierr != 0) {
+                        *info = 1;
+                    }
+                    if (scaloc != 1.) {
+                        i__2 = *n;
+                        for (j = 1; j <= i__2; ++j) {
+                            dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L160: */
+                        }
+                        *scale *= scaloc;
+                    }
+                    c__[k1 + l1 * c_dim1] = x[0];
+                    c__[k1 + l2 * c_dim1] = x[2];
+                    c__[k2 + l1 * c_dim1] = x[1];
+                    c__[k2 + l2 * c_dim1] = x[3];
+                }
+L170:
+                ;
+            }
+L180:
+            ;
+        }
+    } else if (notrna && ! notrnb) { /* Case 4: A*X + ISGN*X*B' = scale*C */
+        lnext = *n;
+        for (l = *n; l >= 1; --l) {
+            if (l > lnext) {
+                goto L240;
+            }
+            if (l == 1) {
+                l1 = l;
+                l2 = l;
+            } else {
+                if (b[l + (l - 1) * b_dim1] != 0.) {
+                    l1 = l - 1;
+                    l2 = l;
+                    lnext = l - 2;
+                } else {
+                    l1 = l;
+                    l2 = l;
+                    lnext = l - 1;
+                }
+            }
+            knext = *m;
+            for (k = *m; k >= 1; --k) {
+                if (k > knext) {
+                    goto L230;
+                }
+                if (k == 1) {
+                    k1 = k;
+                    k2 = k;
+                } else {
+                    if (a[k + (k - 1) * a_dim1] != 0.) {
+                        k1 = k - 1;
+                        k2 = k;
+                        knext = k - 2;
+                    } else {
+                        k1 = k;
+                        k2 = k;
+                        knext = k - 1;
+                    }
+                }
+                if (l1 == l2 && k1 == k2) {
+                    i__1 = *m - k1;
+/* Computing MIN */
+                    i__2 = k1 + 1;
+/* Computing MIN */
+                    i__3 = k1 + 1;
+                    suml = ddot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, &
+                            c__[min(i__3,*m) + l1 * c_dim1], &c__1);
+                    i__1 = *n - l1;
+/* Computing MIN */
+                    i__2 = l1 + 1;
+/* Computing MIN */
+                    i__3 = l1 + 1;
+                    sumr = ddot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc,
+                             &b[l1 + min(i__3,*n) * b_dim1], ldb);
+                    vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr);
+                    scaloc = 1.;
+                    a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1];
+                    da11 = abs(a11);
+                    if (da11 <= smin) {
+                        a11 = smin;
+                        da11 = smin;
+                        *info = 1;
+                    }
+                    db = abs(vec[0]);
+                    if (da11 < 1. && db > 1.) {
+                        if (db > bignum * da11) {
+                            scaloc = 1. / db;
+                        }
+                    }
+                    x[0] = vec[0] * scaloc / a11;
+                    if (scaloc != 1.) {
+                        i__1 = *n;
+                        for (j = 1; j <= i__1; ++j) {
+                            dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L190: */
+                        }
+                        *scale *= scaloc;
+                    }
+                    c__[k1 + l1 * c_dim1] = x[0];
+                } else if (l1 == l2 && k1 != k2) {
+                    i__1 = *m - k2;
+/* Computing MIN */
+                    i__2 = k2 + 1;
+/* Computing MIN */
+                    i__3 = k2 + 1;
+                    suml = ddot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, &
+                            c__[min(i__3,*m) + l1 * c_dim1], &c__1);
+                    i__1 = *n - l2;
+/* Computing MIN */
+                    i__2 = l2 + 1;
+/* Computing MIN */
+                    i__3 = l2 + 1;
+                    sumr = ddot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc,
+                             &b[l1 + min(i__3,*n) * b_dim1], ldb);
+                    vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr);
+                    i__1 = *m - k2;
+/* Computing MIN */
+                    i__2 = k2 + 1;
+/* Computing MIN */
+                    i__3 = k2 + 1;
+                    suml = ddot_(&i__1, &a[k2 + min(i__2,*m) * a_dim1], lda, &
+                            c__[min(i__3,*m) + l1 * c_dim1], &c__1);
+                    i__1 = *n - l2;
+/* Computing MIN */
+                    i__2 = l2 + 1;
+/* Computing MIN */
+                    i__3 = l2 + 1;
+                    sumr = ddot_(&i__1, &c__[k2 + min(i__2,*n) * c_dim1], ldc,
+                             &b[l1 + min(i__3,*n) * b_dim1], ldb);
+                    vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr);
+                    d__1 = -sgn * b[l1 + l1 * b_dim1];
+                    dlaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1
+                            * a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &d__1,
+                             &c_b30, x, &c__2, &scaloc, &xnorm, &ierr);
+                    if (ierr != 0) {
+                        *info = 1;
+                    }
+                    if (scaloc != 1.) {
+                        i__1 = *n;
+                        for (j = 1; j <= i__1; ++j) {
+                            dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L200: */
+                        }
+                        *scale *= scaloc;
+                    }
+                    c__[k1 + l1 * c_dim1] = x[0];
+                    c__[k2 + l1 * c_dim1] = x[1];
+                } else if (l1 != l2 && k1 == k2) {
+                    i__1 = *m - k1;
+/* Computing MIN */
+                    i__2 = k1 + 1;
+/* Computing MIN */
+                    i__3 = k1 + 1;
+                    suml = ddot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, &
+                            c__[min(i__3,*m) + l1 * c_dim1], &c__1);
+                    i__1 = *n - l2;
+/* Computing MIN */
+                    i__2 = l2 + 1;
+/* Computing MIN */
+                    i__3 = l2 + 1;
+                    sumr = ddot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc,
+                             &b[l1 + min(i__3,*n) * b_dim1], ldb);
+                    vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn *
+                            sumr));
+                    i__1 = *m - k1;
+/* Computing MIN */
+                    i__2 = k1 + 1;
+/* Computing MIN */
+                    i__3 = k1 + 1;
+                    suml = ddot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, &
+                            c__[min(i__3,*m) + l2 * c_dim1], &c__1);
+                    i__1 = *n - l2;
+/* Computing MIN */
+                    i__2 = l2 + 1;
+/* Computing MIN */
+                    i__3 = l2 + 1;
+                    sumr = ddot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc,
+                             &b[l2 + min(i__3,*n) * b_dim1], ldb);
+                    vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn *
+                            sumr));
+                    d__1 = -sgn * a[k1 + k1 * a_dim1];
+                    dlaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1
+                            * b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &d__1,
+                            &c_b30, x, &c__2, &scaloc, &xnorm, &ierr);
+                    if (ierr != 0) {
+                        *info = 1;
+                    }
+                    if (scaloc != 1.) {
+                        i__1 = *n;
+                        for (j = 1; j <= i__1; ++j) {
+                            dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L210: */
+                        }
+                        *scale *= scaloc;
+                    }
+                    c__[k1 + l1 * c_dim1] = x[0];
+                    c__[k1 + l2 * c_dim1] = x[1];
+                } else if (l1 != l2 && k1 != k2) {
+                    i__1 = *m - k2;
+/* Computing MIN */
+                    i__2 = k2 + 1;
+/* Computing MIN */
+                    i__3 = k2 + 1;
+                    suml = ddot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, &
+                            c__[min(i__3,*m) + l1 * c_dim1], &c__1);
+                    i__1 = *n - l2;
+/* Computing MIN */
+                    i__2 = l2 + 1;
+/* Computing MIN */
+                    i__3 = l2 + 1;
+                    sumr = ddot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc,
+                             &b[l1 + min(i__3,*n) * b_dim1], ldb);
+                    vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr);
+                    i__1 = *m - k2;
+/* Computing MIN */
+                    i__2 = k2 + 1;
+/* Computing MIN */
+                    i__3 = k2 + 1;
+                    suml = ddot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, &
+                            c__[min(i__3,*m) + l2 * c_dim1], &c__1);
+                    i__1 = *n - l2;
+/* Computing MIN */
+                    i__2 = l2 + 1;
+/* Computing MIN */
+                    i__3 = l2 + 1;
+                    sumr = ddot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc,
+                             &b[l2 + min(i__3,*n) * b_dim1], ldb);
+                    vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr);
+                    i__1 = *m - k2;
+/* Computing MIN */
+                    i__2 = k2 + 1;
+/* Computing MIN */
+                    i__3 = k2 + 1;
+                    suml = ddot_(&i__1, &a[k2 + min(i__2,*m) * a_dim1], lda, &
+                            c__[min(i__3,*m) + l1 * c_dim1], &c__1);
+                    i__1 = *n - l2;
+/* Computing MIN */
+                    i__2 = l2 + 1;
+/* Computing MIN */
+                    i__3 = l2 + 1;
+                    sumr = ddot_(&i__1, &c__[k2 + min(i__2,*n) * c_dim1], ldc,
+                             &b[l1 + min(i__3,*n) * b_dim1], ldb);
+                    vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr);
+                    i__1 = *m - k2;
+/* Computing MIN */
+                    i__2 = k2 + 1;
+/* Computing MIN */
+                    i__3 = k2 + 1;
+                    suml = ddot_(&i__1, &a[k2 + min(i__2,*m) * a_dim1], lda, &
+                            c__[min(i__3,*m) + l2 * c_dim1], &c__1);
+                    i__1 = *n - l2;
+/* Computing MIN */
+                    i__2 = l2 + 1;
+/* Computing MIN */
+                    i__3 = l2 + 1;
+                    sumr = ddot_(&i__1, &c__[k2 + min(i__2,*n) * c_dim1], ldc,
+                             &b[l2 + min(i__3,*n) * b_dim1], ldb);
+                    vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr);
+                    dlasy2_(&c_false, &c_true, isgn, &c__2, &c__2, &a[k1 + k1
+                            * a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec, &
+                            c__2, &scaloc, x, &c__2, &xnorm, &ierr);
+                    if (ierr != 0) {
+                        *info = 1;
+                    }
+                    if (scaloc != 1.) {
+                        i__1 = *n;
+                        for (j = 1; j <= i__1; ++j) {
+                            dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L220: */
+                        }
+                        *scale *= scaloc;
+                    }
+                    c__[k1 + l1 * c_dim1] = x[0];
+                    c__[k1 + l2 * c_dim1] = x[2];
+                    c__[k2 + l1 * c_dim1] = x[1];
+                    c__[k2 + l2 * c_dim1] = x[3];
+                }
+L230:
+                ;
+            }
+L240:
+            ;
+        }
+    }
+    return 0;
+}
diff --git a/relapack/src/dtrtri.c b/relapack/src/dtrtri.c
new file mode 100644
index 000000000..0462609e9
--- /dev/null
+++ b/relapack/src/dtrtri.c
@@ -0,0 +1,107 @@
+#include "relapack.h"
+
+static void RELAPACK_dtrtri_rec(const char *, const char *, const int *,
+ double *, const int *, int *);
+
+
+/** DTRTRI computes the inverse of a real upper or lower triangular matrix A.
+ *
+ * This routine is functionally equivalent to LAPACK's dtrtri.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d5/dba/dtrtri_8f.html
+ * */
+void RELAPACK_dtrtri(
+    const char *uplo, const char *diag, const int *n,
+    double *A, const int *ldA,
+    int *info
+) {
+
+    // Check arguments
+    const int lower = LAPACK(lsame)(uplo, "L");
+    const int upper = LAPACK(lsame)(uplo, "U");
+    const int nounit = LAPACK(lsame)(diag, "N");
+    const int unit = LAPACK(lsame)(diag, "U");
+    *info = 0;
+    if (!lower && !upper)
+        *info = -1;
+    else if (!nounit && !unit)
+        *info = -2;
+    else if (*n < 0)
+        *info = -3;
+    else if (*ldA < MAX(1, *n))
+        *info = -5;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("DTRTRI", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleanuplo = lower ? 'L' : 'U';
+    const char cleandiag = nounit ? 'N' : 'U';
+
+    // Check for singularity (LAPACK: INFO = 1-based index of zero diagonal)
+    if (nounit) {
+        int i;
+        for (i = 0; i < *n; i++)
+            if (A[i + *ldA * i] == 0) {
+                *info = i + 1; // was "i": 0-based, reported success for i == 0
+                return;
+            }
+    }
+
+    // Recursive kernel
+    RELAPACK_dtrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info);
+}
+
+
+/** dtrtri's recursive compute kernel: invert A_TL, update the off-diagonal block, then invert A_BR. */
+static void RELAPACK_dtrtri_rec(
+    const char *uplo, const char *diag, const int *n,
+    double *A, const int *ldA,
+    int *info
+){
+
+    if (*n <= MAX(CROSSOVER_DTRTRI, 1)) { /* base case: LAPACK's unblocked dtrti2 */
+        // Unblocked
+        LAPACK(dtrti2)(uplo, diag, n, A, ldA, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[]  = { 1. }; /* double 1.0 for BLAS alpha */
+    const double MONE[] = { -1. }; /* double -1.0 for BLAS alpha */
+
+    // Splitting
+    const int n1 = DREC_SPLIT(*n); /* size of top-left block */
+    const int n2 = *n - n1; /* size of bottom-right block */
+
+    //  A_TL A_TR
+    //  A_BL A_BR
+    double *const A_TL = A;
+    double *const A_TR = A + *ldA * n1;
+    double *const A_BL = A + n1;
+    double *const A_BR = A + *ldA * n1 + n1;
+
+    // recursion(A_TL)
+    RELAPACK_dtrtri_rec(uplo, diag, &n1, A_TL, ldA, info);
+    if (*info) /* index already correct: A_TL starts at offset 0 */
+        return;
+
+    if (*uplo == 'L') {
+        // A_BL = - A_BL * A_TL
+        BLAS(dtrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA);
+        // A_BL = A_BR \ A_BL
+        BLAS(dtrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
+    } else {
+        // A_TR = - A_TL * A_TR
+        BLAS(dtrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA);
+        // A_TR = A_TR / A_BR
+        BLAS(dtrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
+    }
+
+    // recursion(A_BR)
+    RELAPACK_dtrtri_rec(uplo, diag, &n2, A_BR, ldA, info);
+    if (*info) /* shift A_BR-relative index into the full matrix */
+        *info += n1;
+}
diff --git a/relapack/src/f2c.c b/relapack/src/f2c.c
new file mode 100644
index 000000000..5a3452419
--- /dev/null
+++ b/relapack/src/f2c.c
@@ -0,0 +1,109 @@
+#include "stdlib.h"
+#include "stdio.h"
+#include "signal.h"
+#include "f2c.h"
+
+#ifndef SIGIOT
+#ifdef SIGABRT
+#define SIGIOT SIGABRT
+#endif
+#endif
+
+void sig_die(const char *s, int kill) { /* print fatal message; abort (core) if kill, else exit(1) */
+    /* print error message, then clear buffers */
+    fprintf(stderr, "%s\n", s);
+
+    if(kill) {
+        fflush(stderr);
+        /* now get a core */
+        signal(SIGIOT, SIG_DFL); /* restore default handler so abort() can dump core */
+        abort();
+    } else
+        exit(1);
+}
+
+void c_div(complex *c, complex *a, complex *b) { /* c = a / b, single-precision complex */
+    double ratio, den;
+    double abr, abi, cr; /* cr: temp so the real part is written last (c may alias a) */
+
+    if( (abr = b->r) < 0.) /* abr = |Re(b)| */
+        abr = - abr;
+    if( (abi = b->i) < 0.) /* abi = |Im(b)| */
+        abi = - abi;
+    if( abr <= abi ) { /* scale by the larger component's ratio to limit overflow/underflow */
+        if(abi == 0) { /* divisor is exactly zero */
+#ifdef IEEE_COMPLEX_DIVIDE
+            float af, bf;
+            af = bf = abr;
+            if (a->i != 0 || a->r != 0)
+                af = 1.;
+            c->i = c->r = af / bf; /* IEEE: 0/0 -> NaN, nonzero/0 -> Inf */
+            return;
+#else
+            sig_die("complex division by zero", 1);
+#endif
+        }
+        ratio = (double)b->r / b->i ;
+        den = b->i * (1 + ratio*ratio);
+        cr = (a->r*ratio + a->i) / den;
+        c->i = (a->i*ratio - a->r) / den;
+    } else {
+        ratio = (double)b->i / b->r ;
+        den = b->r * (1 + ratio*ratio);
+        cr = (a->r + a->i*ratio) / den;
+        c->i = (a->i - a->r*ratio) / den;
+    }
+    c->r = cr;
+}
+
+void z_div(doublecomplex *c, doublecomplex *a, doublecomplex *b) { /* c = a / b, double-precision complex */
+    double ratio, den;
+    double abr, abi, cr; /* cr: temp so the real part is written last (c may alias a) */
+
+    if( (abr = b->r) < 0.) /* abr = |Re(b)| */
+        abr = - abr;
+    if( (abi = b->i) < 0.) /* abi = |Im(b)| */
+        abi = - abi;
+    if( abr <= abi ) { /* scale by the larger component's ratio to limit overflow/underflow */
+        if(abi == 0) { /* divisor is exactly zero (abr <= abi == 0) */
+#ifdef IEEE_COMPLEX_DIVIDE
+            if (a->i != 0 || a->r != 0)
+                abi = 1.;
+            c->i = c->r = abi / abr; /* IEEE: 0/0 -> NaN, 1/0 -> Inf */
+            return;
+#else
+            sig_die("complex division by zero", 1);
+#endif
+        }
+        ratio = b->r / b->i ;
+        den = b->i * (1 + ratio*ratio);
+        cr = (a->r*ratio + a->i) / den;
+        c->i = (a->i*ratio - a->r) / den;
+    } else {
+        ratio = b->i / b->r ;
+        den = b->r * (1 + ratio*ratio);
+        cr = (a->r + a->i*ratio) / den;
+        c->i = (a->i - a->r*ratio) / den;
+    }
+    c->r = cr;
+}
+
+float r_imag(complex *z) { /* imaginary part of a single-precision complex */
+    return z->i;
+}
+
+void r_cnjg(complex *r, complex *z) { /* r = conj(z); zi saved first so r may alias z */
+    float zi = z->i;
+    r->r = z->r;
+    r->i = -zi;
+}
+
+double d_imag(doublecomplex *z) { /* imaginary part of a double-precision complex */
+    return z->i;
+}
+
+void d_cnjg(doublecomplex *r, doublecomplex *z) { /* r = conj(z); zi saved first so r may alias z */
+    double zi = z->i;
+    r->r = z->r;
+    r->i = -zi;
+}
diff --git a/relapack/src/f2c.h b/relapack/src/f2c.h
new file mode 100644
index 000000000..b94ee7c8e
--- /dev/null
+++ b/relapack/src/f2c.h
@@ -0,0 +1,223 @@
+/* f2c.h -- Standard Fortran to C header file */
+
+/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed."
+
+ - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */
+
+#ifndef F2C_INCLUDE
+#define F2C_INCLUDE
+
+typedef long int integer;
+typedef unsigned long int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+typedef long int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+#ifdef INTEGER_STAR_8 /* Adjust for integer*8. */
+typedef long long longint; /* system-dependent */
+typedef unsigned long long ulongint; /* system-dependent */
+#define qbit_clear(a,b) ((a) & ~((ulongint)1 << (b)))
+#define qbit_set(a,b) ((a) | ((ulongint)1 << (b)))
+#endif
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+#ifdef f2c_i2
+/* for -i2 */
+typedef short flag;
+typedef short ftnlen;
+typedef short ftnint;
+#else
+typedef long int flag;
+typedef long int ftnlen;
+typedef long int ftnint;
+#endif
+
+/*external read, write*/
+typedef struct
+{ flag cierr;
+ ftnint ciunit;
+ flag ciend;
+ char *cifmt;
+ ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{ flag icierr;
+ char *iciunit;
+ flag iciend;
+ char *icifmt;
+ ftnint icirlen;
+ ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{ flag oerr;
+ ftnint ounit;
+ char *ofnm;
+ ftnlen ofnmlen;
+ char *osta;
+ char *oacc;
+ char *ofm;
+ ftnint orl;
+ char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{ flag cerr;
+ ftnint cunit;
+ char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{ flag aerr;
+ ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{ flag inerr;
+ ftnint inunit;
+ char *infile;
+ ftnlen infilen;
+ ftnint *inex; /*parameters in standard's order*/
+ ftnint *inopen;
+ ftnint *innum;
+ ftnint *innamed;
+ char *inname;
+ ftnlen innamlen;
+ char *inacc;
+ ftnlen inacclen;
+ char *inseq;
+ ftnlen inseqlen;
+ char *indir;
+ ftnlen indirlen;
+ char *infmt;
+ ftnlen infmtlen;
+ char *inform;
+ ftnint informlen;
+ char *inunf;
+ ftnlen inunflen;
+ ftnint *inrecl;
+ ftnint *innrec;
+ char *inblank;
+ ftnlen inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype { /* for multiple entry points */
+ integer1 g;
+ shortint h;
+ integer i;
+ /* longint j; */
+ real r;
+ doublereal d;
+ complex c;
+ doublecomplex z;
+ };
+
+typedef union Multitype Multitype;
+
+/*typedef long int Long;*/ /* No longer used; formerly in Namelist */
+
+struct Vardesc { /* for Namelist */
+ char *name;
+ char *addr;
+ ftnlen *dims;
+ int type;
+ };
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+ char *name;
+ Vardesc **vars;
+ int nvars;
+ };
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (doublereal)abs(x)
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (doublereal)min(a,b)
+#define dmax(a,b) (doublereal)max(a,b)
+#define bit_test(a,b) ((a) >> (b) & 1)
+#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b) ((a) | ((uinteger)1 << (b)))
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef int /* Unknown procedure type */ (*U_fp)(...);
+typedef shortint (*J_fp)(...);
+typedef integer (*I_fp)(...);
+typedef real (*R_fp)(...);
+typedef doublereal (*D_fp)(...), (*E_fp)(...);
+typedef /* Complex */ VOID (*C_fp)(...);
+typedef /* Double Complex */ VOID (*Z_fp)(...);
+typedef logical (*L_fp)(...);
+typedef shortlogical (*K_fp)(...);
+typedef /* Character */ VOID (*H_fp)(...);
+typedef /* Subroutine */ int (*S_fp)(...);
+#else
+typedef int /* Unknown procedure type */ (*U_fp)();
+typedef shortint (*J_fp)();
+typedef integer (*I_fp)();
+typedef real (*R_fp)();
+typedef doublereal (*D_fp)(), (*E_fp)();
+typedef /* Complex */ VOID (*C_fp)();
+typedef /* Double Complex */ VOID (*Z_fp)();
+typedef logical (*L_fp)();
+typedef shortlogical (*K_fp)();
+typedef /* Character */ VOID (*H_fp)();
+typedef /* Subroutine */ int (*S_fp)();
+#endif
+/* E_fp is for real functions when -R is not specified */
+typedef VOID C_f; /* complex function */
+typedef VOID H_f; /* character function */
+typedef VOID Z_f; /* double complex function */
+typedef doublereal E_f; /* real function with -R not specified */
+
+/* undef any lower-case symbols that your C compiler predefines, e.g.: */
+
+#ifndef Skip_f2c_Undefs
+#undef cray
+#undef gcos
+#undef mc68010
+#undef mc68020
+#undef mips
+#undef pdp11
+#undef sgi
+#undef sparc
+#undef sun
+#undef sun2
+#undef sun3
+#undef sun4
+#undef u370
+#undef u3b
+#undef u3b2
+#undef u3b5
+#undef unix
+#undef vax
+#endif
+#endif
diff --git a/relapack/src/lapack.h b/relapack/src/lapack.h
new file mode 100644
index 000000000..064276b7e
--- /dev/null
+++ b/relapack/src/lapack.h
@@ -0,0 +1,80 @@
+#ifndef LAPACK_H
+#define LAPACK_H
+
+extern int LAPACK(lsame)(const char *, const char *);
+extern int LAPACK(xerbla)(const char *, const int *);
+
+extern void LAPACK(slaswp)(const int *, float *, const int *, const int *, const int *, const int *, const int *);
+extern void LAPACK(dlaswp)(const int *, double *, const int *, const int *, const int *, const int *, const int *);
+extern void LAPACK(claswp)(const int *, float *, const int *, const int *, const int *, const int *, const int *);
+extern void LAPACK(zlaswp)(const int *, double *, const int *, const int *, const int *, const int *, const int *);
+
+extern void LAPACK(slaset)(const char *, const int *, const int *, const float *, const float *, float *, const int *);
+extern void LAPACK(dlaset)(const char *, const int *, const int *, const double *, const double *, double *, const int *);
+extern void LAPACK(claset)(const char *, const int *, const int *, const float *, const float *, float *, const int *);
+extern void LAPACK(zlaset)(const char *, const int *, const int *, const double *, const double *, double *, const int *);
+
+extern void LAPACK(slacpy)(const char *, const int *, const int *, const float *, const int *, float *, const int *);
+extern void LAPACK(dlacpy)(const char *, const int *, const int *, const double *, const int *, double *, const int *);
+extern void LAPACK(clacpy)(const char *, const int *, const int *, const float *, const int *, float *, const int *);
+extern void LAPACK(zlacpy)(const char *, const int *, const int *, const double *, const int *, double *, const int *);
+
+extern void LAPACK(slascl)(const char *, const int *, const int *, const float *, const float *, const int *, const int *, float *, const int *, int *);
+extern void LAPACK(dlascl)(const char *, const int *, const int *, const double *, const double *, const int *, const int *, double *, const int *, int *);
+extern void LAPACK(clascl)(const char *, const int *, const int *, const float *, const float *, const int *, const int *, float *, const int *, int *);
+extern void LAPACK(zlascl)(const char *, const int *, const int *, const double *, const double *, const int *, const int *, double *, const int *, int *);
+
+extern void LAPACK(slauu2)(const char *, const int *, float *, const int *, int *);
+extern void LAPACK(dlauu2)(const char *, const int *, double *, const int *, int *);
+extern void LAPACK(clauu2)(const char *, const int *, float *, const int *, int *);
+extern void LAPACK(zlauu2)(const char *, const int *, double *, const int *, int *);
+
+extern void LAPACK(ssygs2)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *);
+extern void LAPACK(dsygs2)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *);
+extern void LAPACK(chegs2)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *);
+extern void LAPACK(zhegs2)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *);
+
+extern void LAPACK(strti2)(const char *, const char *, const int *, float *, const int *, int *);
+extern void LAPACK(dtrti2)(const char *, const char *, const int *, double *, const int *, int *);
+extern void LAPACK(ctrti2)(const char *, const char *, const int *, float *, const int *, int *);
+extern void LAPACK(ztrti2)(const char *, const char *, const int *, double *, const int *, int *);
+
+extern void LAPACK(spotf2)(const char *, const int *, float *, const int *, int *);
+extern void LAPACK(dpotf2)(const char *, const int *, double *, const int *, int *);
+extern void LAPACK(cpotf2)(const char *, const int *, float *, const int *, int *);
+extern void LAPACK(zpotf2)(const char *, const int *, double *, const int *, int *);
+
+extern void LAPACK(spbtf2)(const char *, const int *, const int *, float *, const int *, int *);
+extern void LAPACK(dpbtf2)(const char *, const int *, const int *, double *, const int *, int *);
+extern void LAPACK(cpbtf2)(const char *, const int *, const int *, float *, const int *, int *);
+extern void LAPACK(zpbtf2)(const char *, const int *, const int *, double *, const int *, int *);
+
+extern void LAPACK(ssytf2)(const char *, const int *, float *, const int *, int *, int *);
+extern void LAPACK(dsytf2)(const char *, const int *, double *, const int *, int *, int *);
+extern void LAPACK(csytf2)(const char *, const int *, float *, const int *, int *, int *);
+extern void LAPACK(chetf2)(const char *, const int *, float *, const int *, int *, int *);
+extern void LAPACK(zsytf2)(const char *, const int *, double *, const int *, int *, int *);
+extern void LAPACK(zhetf2)(const char *, const int *, double *, const int *, int *, int *);
+extern void LAPACK(ssytf2_rook)(const char *, const int *, float *, const int *, int *, int *);
+extern void LAPACK(dsytf2_rook)(const char *, const int *, double *, const int *, int *, int *);
+extern void LAPACK(csytf2_rook)(const char *, const int *, float *, const int *, int *, int *);
+extern void LAPACK(chetf2_rook)(const char *, const int *, float *, const int *, int *, int *);
+extern void LAPACK(zsytf2_rook)(const char *, const int *, double *, const int *, int *, int *);
+extern void LAPACK(zhetf2_rook)(const char *, const int *, double *, const int *, int *, int *);
+
+extern void LAPACK(sgetf2)(const int *, const int *, float *, const int *, int *, int *);
+extern void LAPACK(dgetf2)(const int *, const int *, double *, const int *, int *, int *);
+extern void LAPACK(cgetf2)(const int *, const int *, float *, const int *, int *, int *);
+extern void LAPACK(zgetf2)(const int *, const int *, double *, const int *, int *, int *);
+
+extern void LAPACK(sgbtf2)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *);
+extern void LAPACK(dgbtf2)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *);
+extern void LAPACK(cgbtf2)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *);
+extern void LAPACK(zgbtf2)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *);
+
+extern void LAPACK(stgsy2)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, int *, int *, int *);
+extern void LAPACK(dtgsy2)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, int *, int *, int *);
+extern void LAPACK(ctgsy2)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, int *);
+extern void LAPACK(ztgsy2)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, int *);
+
+#endif /* LAPACK_H */
diff --git a/relapack/src/lapack_wrappers.c b/relapack/src/lapack_wrappers.c
new file mode 100644
index 000000000..488547260
--- /dev/null
+++ b/relapack/src/lapack_wrappers.c
@@ -0,0 +1,607 @@
+#include "relapack.h"
+
+////////////
+// XLAUUM //
+////////////
+
+#if INCLUDE_SLAUUM
+void LAPACK(slauum)(
+ const char *uplo, const int *n,
+ float *A, const int *ldA,
+ int *info
+) {
+ RELAPACK_slauum(uplo, n, A, ldA, info);
+}
+#endif
+
+#if INCLUDE_DLAUUM
+void LAPACK(dlauum)(
+ const char *uplo, const int *n,
+ double *A, const int *ldA,
+ int *info
+) {
+ RELAPACK_dlauum(uplo, n, A, ldA, info);
+}
+#endif
+
+#if INCLUDE_CLAUUM
+void LAPACK(clauum)(
+ const char *uplo, const int *n,
+ float *A, const int *ldA,
+ int *info
+) {
+ RELAPACK_clauum(uplo, n, A, ldA, info);
+}
+#endif
+
+#if INCLUDE_ZLAUUM
+void LAPACK(zlauum)(
+ const char *uplo, const int *n,
+ double *A, const int *ldA,
+ int *info
+) {
+ RELAPACK_zlauum(uplo, n, A, ldA, info);
+}
+#endif
+
+
+////////////
+// XSYGST //
+////////////
+
+#if INCLUDE_SSYGST
+void LAPACK(ssygst)(
+ const int *itype, const char *uplo, const int *n,
+ float *A, const int *ldA, const float *B, const int *ldB,
+ int *info
+) {
+ RELAPACK_ssygst(itype, uplo, n, A, ldA, B, ldB, info);
+}
+#endif
+
+#if INCLUDE_DSYGST
+void LAPACK(dsygst)(
+ const int *itype, const char *uplo, const int *n,
+ double *A, const int *ldA, const double *B, const int *ldB,
+ int *info
+) {
+ RELAPACK_dsygst(itype, uplo, n, A, ldA, B, ldB, info);
+}
+#endif
+
+#if INCLUDE_CHEGST
+void LAPACK(chegst)(
+ const int *itype, const char *uplo, const int *n,
+ float *A, const int *ldA, const float *B, const int *ldB,
+ int *info
+) {
+ RELAPACK_chegst(itype, uplo, n, A, ldA, B, ldB, info);
+}
+#endif
+
+#if INCLUDE_ZHEGST
+void LAPACK(zhegst)(
+ const int *itype, const char *uplo, const int *n,
+ double *A, const int *ldA, const double *B, const int *ldB,
+ int *info
+) {
+ RELAPACK_zhegst(itype, uplo, n, A, ldA, B, ldB, info);
+}
+#endif
+
+
+////////////
+// XTRTRI //
+////////////
+
+#if INCLUDE_STRTRI
+void LAPACK(strtri)(
+ const char *uplo, const char *diag, const int *n,
+ float *A, const int *ldA,
+ int *info
+) {
+ RELAPACK_strtri(uplo, diag, n, A, ldA, info);
+}
+#endif
+
+#if INCLUDE_DTRTRI
+void LAPACK(dtrtri)(
+ const char *uplo, const char *diag, const int *n,
+ double *A, const int *ldA,
+ int *info
+) {
+ RELAPACK_dtrtri(uplo, diag, n, A, ldA, info);
+}
+#endif
+
+#if INCLUDE_CTRTRI
+void LAPACK(ctrtri)(
+ const char *uplo, const char *diag, const int *n,
+ float *A, const int *ldA,
+ int *info
+) {
+ RELAPACK_ctrtri(uplo, diag, n, A, ldA, info);
+}
+#endif
+
+#if INCLUDE_ZTRTRI
+void LAPACK(ztrtri)(
+ const char *uplo, const char *diag, const int *n,
+ double *A, const int *ldA,
+ int *info
+) {
+ RELAPACK_ztrtri(uplo, diag, n, A, ldA, info);
+}
+#endif
+
+
+////////////
+// XPOTRF //
+////////////
+
+#if INCLUDE_SPOTRF
+void LAPACK(spotrf)(
+ const char *uplo, const int *n,
+ float *A, const int *ldA,
+ int *info
+) {
+ RELAPACK_spotrf(uplo, n, A, ldA, info);
+}
+#endif
+
+#if INCLUDE_DPOTRF
+void LAPACK(dpotrf)(
+ const char *uplo, const int *n,
+ double *A, const int *ldA,
+ int *info
+) {
+ RELAPACK_dpotrf(uplo, n, A, ldA, info);
+}
+#endif
+
+#if INCLUDE_CPOTRF
+void LAPACK(cpotrf)(
+ const char *uplo, const int *n,
+ float *A, const int *ldA,
+ int *info
+) {
+ RELAPACK_cpotrf(uplo, n, A, ldA, info);
+}
+#endif
+
+#if INCLUDE_ZPOTRF
+void LAPACK(zpotrf)(
+ const char *uplo, const int *n,
+ double *A, const int *ldA,
+ int *info
+) {
+ RELAPACK_zpotrf(uplo, n, A, ldA, info);
+}
+#endif
+
+
+////////////
+// XPBTRF //
+////////////
+
+#if INCLUDE_SPBTRF
+void LAPACK(spbtrf)(
+ const char *uplo, const int *n, const int *kd,
+ float *Ab, const int *ldAb,
+ int *info
+) {
+ RELAPACK_spbtrf(uplo, n, kd, Ab, ldAb, info);
+}
+#endif
+
+#if INCLUDE_DPBTRF
+void LAPACK(dpbtrf)(
+ const char *uplo, const int *n, const int *kd,
+ double *Ab, const int *ldAb,
+ int *info
+) {
+ RELAPACK_dpbtrf(uplo, n, kd, Ab, ldAb, info);
+}
+#endif
+
+#if INCLUDE_CPBTRF
+void LAPACK(cpbtrf)(
+ const char *uplo, const int *n, const int *kd,
+ float *Ab, const int *ldAb,
+ int *info
+) {
+ RELAPACK_cpbtrf(uplo, n, kd, Ab, ldAb, info);
+}
+#endif
+
+#if INCLUDE_ZPBTRF
+void LAPACK(zpbtrf)(
+ const char *uplo, const int *n, const int *kd,
+ double *Ab, const int *ldAb,
+ int *info
+) {
+ RELAPACK_zpbtrf(uplo, n, kd, Ab, ldAb, info);
+}
+#endif
+
+
+////////////
+// XSYTRF //
+////////////
+
+#if INCLUDE_SSYTRF
+void LAPACK(ssytrf)(
+ const char *uplo, const int *n,
+ float *A, const int *ldA, int *ipiv,
+ float *Work, const int *lWork, int *info
+) {
+ RELAPACK_ssytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
+}
+#endif
+
+#if INCLUDE_DSYTRF
+void LAPACK(dsytrf)(
+ const char *uplo, const int *n,
+ double *A, const int *ldA, int *ipiv,
+ double *Work, const int *lWork, int *info
+) {
+ RELAPACK_dsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
+}
+#endif
+
+#if INCLUDE_CSYTRF
+void LAPACK(csytrf)(
+ const char *uplo, const int *n,
+ float *A, const int *ldA, int *ipiv,
+ float *Work, const int *lWork, int *info
+) {
+ RELAPACK_csytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
+}
+#endif
+
+#if INCLUDE_ZSYTRF
+void LAPACK(zsytrf)(
+ const char *uplo, const int *n,
+ double *A, const int *ldA, int *ipiv,
+ double *Work, const int *lWork, int *info
+) {
+ RELAPACK_zsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
+}
+#endif
+
+#if INCLUDE_CHETRF
+void LAPACK(chetrf)(
+ const char *uplo, const int *n,
+ float *A, const int *ldA, int *ipiv,
+ float *Work, const int *lWork, int *info
+) {
+ RELAPACK_chetrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
+}
+#endif
+
+#if INCLUDE_ZHETRF
+void LAPACK(zhetrf)(
+ const char *uplo, const int *n,
+ double *A, const int *ldA, int *ipiv,
+ double *Work, const int *lWork, int *info
+) {
+ RELAPACK_zhetrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
+}
+#endif
+
+#if INCLUDE_SSYTRF_ROOK
+void LAPACK(ssytrf_rook)(
+ const char *uplo, const int *n,
+ float *A, const int *ldA, int *ipiv,
+ float *Work, const int *lWork, int *info
+) {
+ RELAPACK_ssytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
+}
+#endif
+
+#if INCLUDE_DSYTRF_ROOK
+void LAPACK(dsytrf_rook)(
+ const char *uplo, const int *n,
+ double *A, const int *ldA, int *ipiv,
+ double *Work, const int *lWork, int *info
+) {
+ RELAPACK_dsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
+}
+#endif
+
+#if INCLUDE_CSYTRF_ROOK
+void LAPACK(csytrf_rook)(
+ const char *uplo, const int *n,
+ float *A, const int *ldA, int *ipiv,
+ float *Work, const int *lWork, int *info
+) {
+ RELAPACK_csytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
+}
+#endif
+
+#if INCLUDE_ZSYTRF_ROOK
+void LAPACK(zsytrf_rook)(
+ const char *uplo, const int *n,
+ double *A, const int *ldA, int *ipiv,
+ double *Work, const int *lWork, int *info
+) {
+ RELAPACK_zsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
+}
+#endif
+
+#if INCLUDE_CHETRF_ROOK
+void LAPACK(chetrf_rook)(
+ const char *uplo, const int *n,
+ float *A, const int *ldA, int *ipiv,
+ float *Work, const int *lWork, int *info
+) {
+ RELAPACK_chetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
+}
+#endif
+
+#if INCLUDE_ZHETRF_ROOK
+void LAPACK(zhetrf_rook)(
+ const char *uplo, const int *n,
+ double *A, const int *ldA, int *ipiv,
+ double *Work, const int *lWork, int *info
+) {
+ RELAPACK_zhetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
+}
+#endif
+
+
+////////////
+// XGETRF //
+////////////
+
+#if INCLUDE_SGETRF
+void LAPACK(sgetrf)(
+ const int *m, const int *n,
+ float *A, const int *ldA, int *ipiv,
+ int *info
+) {
+ RELAPACK_sgetrf(m, n, A, ldA, ipiv, info);
+}
+#endif
+
+#if INCLUDE_DGETRF
+void LAPACK(dgetrf)(
+ const int *m, const int *n,
+ double *A, const int *ldA, int *ipiv,
+ int *info
+) {
+ RELAPACK_dgetrf(m, n, A, ldA, ipiv, info);
+}
+#endif
+
+#if INCLUDE_CGETRF
+void LAPACK(cgetrf)(
+ const int *m, const int *n,
+ float *A, const int *ldA, int *ipiv,
+ int *info
+) {
+ RELAPACK_cgetrf(m, n, A, ldA, ipiv, info);
+}
+#endif
+
+#if INCLUDE_ZGETRF
+void LAPACK(zgetrf)(
+ const int *m, const int *n,
+ double *A, const int *ldA, int *ipiv,
+ int *info
+) {
+ RELAPACK_zgetrf(m, n, A, ldA, ipiv, info);
+}
+#endif
+
+
+////////////
+// XGBTRF //
+////////////
+
+#if INCLUDE_SGBTRF
+void LAPACK(sgbtrf)(
+ const int *m, const int *n, const int *kl, const int *ku,
+ float *Ab, const int *ldAb, int *ipiv,
+ int *info
+) {
+ RELAPACK_sgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
+}
+#endif
+
+#if INCLUDE_DGBTRF
+void LAPACK(dgbtrf)(
+ const int *m, const int *n, const int *kl, const int *ku,
+ double *Ab, const int *ldAb, int *ipiv,
+ int *info
+) {
+ RELAPACK_dgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
+}
+#endif
+
+#if INCLUDE_CGBTRF
+void LAPACK(cgbtrf)(
+ const int *m, const int *n, const int *kl, const int *ku,
+ float *Ab, const int *ldAb, int *ipiv,
+ int *info
+) {
+ RELAPACK_cgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
+}
+#endif
+
+#if INCLUDE_ZGBTRF
+void LAPACK(zgbtrf)(
+ const int *m, const int *n, const int *kl, const int *ku,
+ double *Ab, const int *ldAb, int *ipiv,
+ int *info
+) {
+ RELAPACK_zgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
+}
+#endif
+
+
+////////////
+// XTRSYL //
+////////////
+
+#if INCLUDE_STRSYL
+void LAPACK(strsyl)(
+ const char *tranA, const char *tranB, const int *isgn,
+ const int *m, const int *n,
+ const float *A, const int *ldA, const float *B, const int *ldB,
+ float *C, const int *ldC, float *scale,
+ int *info
+) {
+ RELAPACK_strsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
+}
+#endif
+
+#if INCLUDE_DTRSYL
+void LAPACK(dtrsyl)(
+ const char *tranA, const char *tranB, const int *isgn,
+ const int *m, const int *n,
+ const double *A, const int *ldA, const double *B, const int *ldB,
+ double *C, const int *ldC, double *scale,
+ int *info
+) {
+ RELAPACK_dtrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
+}
+#endif
+
+#if INCLUDE_CTRSYL
+void LAPACK(ctrsyl)(
+ const char *tranA, const char *tranB, const int *isgn,
+ const int *m, const int *n,
+ const float *A, const int *ldA, const float *B, const int *ldB,
+ float *C, const int *ldC, float *scale,
+ int *info
+) {
+ RELAPACK_ctrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
+}
+#endif
+
+#if INCLUDE_ZTRSYL
+void LAPACK(ztrsyl)(
+ const char *tranA, const char *tranB, const int *isgn,
+ const int *m, const int *n,
+ const double *A, const int *ldA, const double *B, const int *ldB,
+ double *C, const int *ldC, double *scale,
+ int *info
+) {
+ RELAPACK_ztrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
+}
+#endif
+
+
+////////////
+// XTGSYL //
+////////////
+
+#if INCLUDE_STGSYL
+void LAPACK(stgsyl)(
+ const char *trans, const int *ijob, const int *m, const int *n,
+ const float *A, const int *ldA, const float *B, const int *ldB,
+ float *C, const int *ldC,
+ const float *D, const int *ldD, const float *E, const int *ldE,
+ float *F, const int *ldF,
+ float *scale, float *dif,
+ float *Work, const int *lWork, int *iWork, int *info
+) {
+ RELAPACK_stgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
+}
+#endif
+
+#if INCLUDE_DTGSYL
+void LAPACK(dtgsyl)(
+ const char *trans, const int *ijob, const int *m, const int *n,
+ const double *A, const int *ldA, const double *B, const int *ldB,
+ double *C, const int *ldC,
+ const double *D, const int *ldD, const double *E, const int *ldE,
+ double *F, const int *ldF,
+ double *scale, double *dif,
+ double *Work, const int *lWork, int *iWork, int *info
+) {
+ RELAPACK_dtgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
+}
+#endif
+
+#if INCLUDE_CTGSYL
+void LAPACK(ctgsyl)(
+ const char *trans, const int *ijob, const int *m, const int *n,
+ const float *A, const int *ldA, const float *B, const int *ldB,
+ float *C, const int *ldC,
+ const float *D, const int *ldD, const float *E, const int *ldE,
+ float *F, const int *ldF,
+ float *scale, float *dif,
+ float *Work, const int *lWork, int *iWork, int *info
+) {
+ RELAPACK_ctgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
+}
+#endif
+
+#if INCLUDE_ZTGSYL
+void LAPACK(ztgsyl)(
+ const char *trans, const int *ijob, const int *m, const int *n,
+ const double *A, const int *ldA, const double *B, const int *ldB,
+ double *C, const int *ldC,
+ const double *D, const int *ldD, const double *E, const int *ldE,
+ double *F, const int *ldF,
+ double *scale, double *dif,
+ double *Work, const int *lWork, int *iWork, int *info
+) {
+ RELAPACK_ztgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
+}
+#endif
+
+
+////////////
+// XGEMMT //
+////////////
+
+#if INCLUDE_SGEMMT
+void LAPACK(sgemmt)(
+    const char *uplo, const char *transA, const char *transB,
+    const int *n, const int *k,
+    const float *alpha, const float *A, const int *ldA,
+    const float *B, const int *ldB,
+    const float *beta, float *C, const int *ldC
+) {
+    RELAPACK_sgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
+}
+#endif
+
+#if INCLUDE_DGEMMT
+void LAPACK(dgemmt)(
+    const char *uplo, const char *transA, const char *transB,
+    const int *n, const int *k,
+    const double *alpha, const double *A, const int *ldA,
+    const double *B, const int *ldB,
+    const double *beta, double *C, const int *ldC
+) {
+    RELAPACK_dgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
+}
+#endif
+
+#if INCLUDE_CGEMMT
+void LAPACK(cgemmt)(
+    const char *uplo, const char *transA, const char *transB,
+    const int *n, const int *k,
+    const float *alpha, const float *A, const int *ldA,
+    const float *B, const int *ldB,
+    const float *beta, float *C, const int *ldC
+) {
+    RELAPACK_cgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
+}
+#endif
+
+#if INCLUDE_ZGEMMT
+void LAPACK(zgemmt)(
+    const char *uplo, const char *transA, const char *transB,
+    const int *n, const int *k,
+    const double *alpha, const double *A, const int *ldA,
+    const double *B, const int *ldB,
+    const double *beta, double *C, const int *ldC
+) {
+    RELAPACK_zgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
+}
+#endif
diff --git a/relapack/src/lapack_wrappers.c.orig b/relapack/src/lapack_wrappers.c.orig
new file mode 100644
index 000000000..d89d2fe2f
--- /dev/null
+++ b/relapack/src/lapack_wrappers.c.orig
@@ -0,0 +1,607 @@
+#include "relapack.h"
+
+////////////
+// XLAUUM //
+////////////
+
+#if INCLUDE_SLAUUM
+void LAPACK(slauum)(
+ const char *uplo, const int *n,
+ float *A, const int *ldA,
+ int *info
+) {
+ RELAPACK_slauum(uplo, n, A, ldA, info);
+}
+#endif
+
+#if INCLUDE_DLAUUM
+void LAPACK(dlauum)(
+ const char *uplo, const int *n,
+ double *A, const int *ldA,
+ int *info
+) {
+ RELAPACK_dlauum(uplo, n, A, ldA, info);
+}
+#endif
+
+#if INCLUDE_CLAUUM
+void LAPACK(clauum)(
+ const char *uplo, const int *n,
+ float *A, const int *ldA,
+ int *info
+) {
+ RELAPACK_clauum(uplo, n, A, ldA, info);
+}
+#endif
+
+#if INCLUDE_ZLAUUM
+void LAPACK(zlauum)(
+ const char *uplo, const int *n,
+ double *A, const int *ldA,
+ int *info
+) {
+ RELAPACK_zlauum(uplo, n, A, ldA, info);
+}
+#endif
+
+
+////////////
+// XSYGST //
+////////////
+
+#if INCLUDE_SSYGST
+void LAPACK(ssygst)(
+ const int *itype, const char *uplo, const int *n,
+ float *A, const int *ldA, const float *B, const int *ldB,
+ int *info
+) {
+ RELAPACK_ssygst(itype, uplo, n, A, ldA, B, ldB, info);
+}
+#endif
+
+#if INCLUDE_DSYGST
+void LAPACK(dsygst)(
+ const int *itype, const char *uplo, const int *n,
+ double *A, const int *ldA, const double *B, const int *ldB,
+ int *info
+) {
+ RELAPACK_dsygst(itype, uplo, n, A, ldA, B, ldB, info);
+}
+#endif
+
+#if INCLUDE_CSYGST
+void LAPACK(csygst)(
+ const int *itype, const char *uplo, const int *n,
+ float *A, const int *ldA, const float *B, const int *ldB,
+ int *info
+) {
+ RELAPACK_csygst(itype, uplo, n, A, ldA, B, ldB, info);
+}
+#endif
+
+#if INCLUDE_ZSYGST
+void LAPACK(zsygst)(
+ const int *itype, const char *uplo, const int *n,
+ double *A, const int *ldA, const double *B, const int *ldB,
+ int *info
+) {
+ RELAPACK_zsygst(itype, uplo, n, A, ldA, B, ldB, info);
+}
+#endif
+
+
+////////////
+// XTRTRI //
+////////////
+
+#if INCLUDE_STRTRI
+void LAPACK(strtri)(
+ const char *uplo, const char *diag, const int *n,
+ float *A, const int *ldA,
+ int *info
+) {
+ RELAPACK_strtri(uplo, diag, n, A, ldA, info);
+}
+#endif
+
+#if INCLUDE_DTRTRI
+void LAPACK(dtrtri)(
+ const char *uplo, const char *diag, const int *n,
+ double *A, const int *ldA,
+ int *info
+) {
+ RELAPACK_dtrtri(uplo, diag, n, A, ldA, info);
+}
+#endif
+
+#if INCLUDE_CTRTRI
+void LAPACK(ctrtri)(
+ const char *uplo, const char *diag, const int *n,
+ float *A, const int *ldA,
+ int *info
+) {
+ RELAPACK_ctrtri(uplo, diag, n, A, ldA, info);
+}
+#endif
+
+#if INCLUDE_ZTRTRI
+void LAPACK(ztrtri)(
+ const char *uplo, const char *diag, const int *n,
+ double *A, const int *ldA,
+ int *info
+) {
+ RELAPACK_ztrtri(uplo, diag, n, A, ldA, info);
+}
+#endif
+
+
+////////////
+// XPOTRF //
+////////////
+
+#if INCLUDE_SPOTRF
+void LAPACK(spotrf)(
+ const char *uplo, const int *n,
+ float *A, const int *ldA,
+ int *info
+) {
+ RELAPACK_spotrf(uplo, n, A, ldA, info);
+}
+#endif
+
+#if INCLUDE_DPOTRF
+void LAPACK(dpotrf)(
+ const char *uplo, const int *n,
+ double *A, const int *ldA,
+ int *info
+) {
+ RELAPACK_dpotrf(uplo, n, A, ldA, info);
+}
+#endif
+
+#if INCLUDE_CPOTRF
+void LAPACK(cpotrf)(
+ const char *uplo, const int *n,
+ float *A, const int *ldA,
+ int *info
+) {
+ RELAPACK_cpotrf(uplo, n, A, ldA, info);
+}
+#endif
+
+#if INCLUDE_ZPOTRF
+void LAPACK(zpotrf)(
+ const char *uplo, const int *n,
+ double *A, const int *ldA,
+ int *info
+) {
+ RELAPACK_zpotrf(uplo, n, A, ldA, info);
+}
+#endif
+
+
+////////////
+// XPBTRF //
+////////////
+
+#if INCLUDE_SPBTRF
+void LAPACK(spbtrf)(
+ const char *uplo, const int *n, const int *kd,
+ float *Ab, const int *ldAb,
+ int *info
+) {
+ RELAPACK_spbtrf(uplo, n, kd, Ab, ldAb, info);
+}
+#endif
+
+#if INCLUDE_DPBTRF
+void LAPACK(dpbtrf)(
+ const char *uplo, const int *n, const int *kd,
+ double *Ab, const int *ldAb,
+ int *info
+) {
+ RELAPACK_dpbtrf(uplo, n, kd, Ab, ldAb, info);
+}
+#endif
+
+#if INCLUDE_CPBTRF
+void LAPACK(cpbtrf)(
+ const char *uplo, const int *n, const int *kd,
+ float *Ab, const int *ldAb,
+ int *info
+) {
+ RELAPACK_cpbtrf(uplo, n, kd, Ab, ldAb, info);
+}
+#endif
+
+#if INCLUDE_ZPBTRF
+void LAPACK(zpbtrf)(
+ const char *uplo, const int *n, const int *kd,
+ double *Ab, const int *ldAb,
+ int *info
+) {
+ RELAPACK_zpbtrf(uplo, n, kd, Ab, ldAb, info);
+}
+#endif
+
+
+////////////
+// XSYTRF //
+////////////
+
+// LAPACK-compatible shims: forward the symmetric/Hermitian indefinite
+// factorizations (Bunch-Kaufman and rook pivoting) to ReLAPACK.
+
+#if INCLUDE_SSYTRF
+void LAPACK(ssytrf)(const char *uplo, const int *n, float *A, const int *ldA,
+                    int *ipiv, float *Work, const int *lWork, int *info) {
+    RELAPACK_ssytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
+}
+#endif
+
+#if INCLUDE_DSYTRF
+void LAPACK(dsytrf)(const char *uplo, const int *n, double *A, const int *ldA,
+                    int *ipiv, double *Work, const int *lWork, int *info) {
+    RELAPACK_dsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
+}
+#endif
+
+#if INCLUDE_CSYTRF
+void LAPACK(csytrf)(const char *uplo, const int *n, float *A, const int *ldA,
+                    int *ipiv, float *Work, const int *lWork, int *info) {
+    RELAPACK_csytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
+}
+#endif
+
+#if INCLUDE_ZSYTRF
+void LAPACK(zsytrf)(const char *uplo, const int *n, double *A, const int *ldA,
+                    int *ipiv, double *Work, const int *lWork, int *info) {
+    RELAPACK_zsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
+}
+#endif
+
+#if INCLUDE_CHETRF
+void LAPACK(chetrf)(const char *uplo, const int *n, float *A, const int *ldA,
+                    int *ipiv, float *Work, const int *lWork, int *info) {
+    RELAPACK_chetrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
+}
+#endif
+
+#if INCLUDE_ZHETRF
+void LAPACK(zhetrf)(const char *uplo, const int *n, double *A, const int *ldA,
+                    int *ipiv, double *Work, const int *lWork, int *info) {
+    RELAPACK_zhetrf(uplo, n, A, ldA, ipiv, Work, lWork, info);
+}
+#endif
+
+#if INCLUDE_SSYTRF_ROOK
+void LAPACK(ssytrf_rook)(const char *uplo, const int *n, float *A, const int *ldA,
+                         int *ipiv, float *Work, const int *lWork, int *info) {
+    RELAPACK_ssytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
+}
+#endif
+
+#if INCLUDE_DSYTRF_ROOK
+void LAPACK(dsytrf_rook)(const char *uplo, const int *n, double *A, const int *ldA,
+                         int *ipiv, double *Work, const int *lWork, int *info) {
+    RELAPACK_dsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
+}
+#endif
+
+#if INCLUDE_CSYTRF_ROOK
+void LAPACK(csytrf_rook)(const char *uplo, const int *n, float *A, const int *ldA,
+                         int *ipiv, float *Work, const int *lWork, int *info) {
+    RELAPACK_csytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
+}
+#endif
+
+#if INCLUDE_ZSYTRF_ROOK
+void LAPACK(zsytrf_rook)(const char *uplo, const int *n, double *A, const int *ldA,
+                         int *ipiv, double *Work, const int *lWork, int *info) {
+    RELAPACK_zsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
+}
+#endif
+
+#if INCLUDE_CHETRF_ROOK
+void LAPACK(chetrf_rook)(const char *uplo, const int *n, float *A, const int *ldA,
+                         int *ipiv, float *Work, const int *lWork, int *info) {
+    RELAPACK_chetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
+}
+#endif
+
+#if INCLUDE_ZHETRF_ROOK
+void LAPACK(zhetrf_rook)(const char *uplo, const int *n, double *A, const int *ldA,
+                         int *ipiv, double *Work, const int *lWork, int *info) {
+    RELAPACK_zhetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info);
+}
+#endif
+
+
+////////////
+// XGETRF //
+////////////
+
+// LAPACK-compatible shims: forward the LU factorizations with partial
+// pivoting to ReLAPACK's recursive implementations.
+
+#if INCLUDE_SGETRF
+void LAPACK(sgetrf)(const int *m, const int *n, float *A, const int *ldA,
+                    int *ipiv, int *info) {
+    RELAPACK_sgetrf(m, n, A, ldA, ipiv, info);
+}
+#endif
+
+#if INCLUDE_DGETRF
+void LAPACK(dgetrf)(const int *m, const int *n, double *A, const int *ldA,
+                    int *ipiv, int *info) {
+    RELAPACK_dgetrf(m, n, A, ldA, ipiv, info);
+}
+#endif
+
+#if INCLUDE_CGETRF
+void LAPACK(cgetrf)(const int *m, const int *n, float *A, const int *ldA,
+                    int *ipiv, int *info) {
+    RELAPACK_cgetrf(m, n, A, ldA, ipiv, info);
+}
+#endif
+
+#if INCLUDE_ZGETRF
+void LAPACK(zgetrf)(const int *m, const int *n, double *A, const int *ldA,
+                    int *ipiv, int *info) {
+    RELAPACK_zgetrf(m, n, A, ldA, ipiv, info);
+}
+#endif
+
+
+////////////
+// XGBTRF //
+////////////
+
+// LAPACK-compatible shims: forward the banded LU factorizations with partial
+// pivoting to ReLAPACK's recursive implementations.
+
+#if INCLUDE_SGBTRF
+void LAPACK(sgbtrf)(const int *m, const int *n, const int *kl, const int *ku,
+                    float *Ab, const int *ldAb, int *ipiv, int *info) {
+    RELAPACK_sgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
+}
+#endif
+
+#if INCLUDE_DGBTRF
+void LAPACK(dgbtrf)(const int *m, const int *n, const int *kl, const int *ku,
+                    double *Ab, const int *ldAb, int *ipiv, int *info) {
+    RELAPACK_dgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
+}
+#endif
+
+#if INCLUDE_CGBTRF
+void LAPACK(cgbtrf)(const int *m, const int *n, const int *kl, const int *ku,
+                    float *Ab, const int *ldAb, int *ipiv, int *info) {
+    RELAPACK_cgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
+}
+#endif
+
+#if INCLUDE_ZGBTRF
+void LAPACK(zgbtrf)(const int *m, const int *n, const int *kl, const int *ku,
+                    double *Ab, const int *ldAb, int *ipiv, int *info) {
+    RELAPACK_zgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info);
+}
+#endif
+
+
+////////////
+// XTRSYL //
+////////////
+
+// LAPACK-compatible shims: forward the triangular Sylvester equation solvers
+// to ReLAPACK's recursive implementations.
+
+#if INCLUDE_STRSYL
+void LAPACK(strsyl)(const char *tranA, const char *tranB, const int *isgn,
+                    const int *m, const int *n,
+                    const float *A, const int *ldA,
+                    const float *B, const int *ldB,
+                    float *C, const int *ldC, float *scale, int *info) {
+    RELAPACK_strsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
+}
+#endif
+
+#if INCLUDE_DTRSYL
+void LAPACK(dtrsyl)(const char *tranA, const char *tranB, const int *isgn,
+                    const int *m, const int *n,
+                    const double *A, const int *ldA,
+                    const double *B, const int *ldB,
+                    double *C, const int *ldC, double *scale, int *info) {
+    RELAPACK_dtrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
+}
+#endif
+
+#if INCLUDE_CTRSYL
+void LAPACK(ctrsyl)(const char *tranA, const char *tranB, const int *isgn,
+                    const int *m, const int *n,
+                    const float *A, const int *ldA,
+                    const float *B, const int *ldB,
+                    float *C, const int *ldC, float *scale, int *info) {
+    RELAPACK_ctrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
+}
+#endif
+
+#if INCLUDE_ZTRSYL
+void LAPACK(ztrsyl)(const char *tranA, const char *tranB, const int *isgn,
+                    const int *m, const int *n,
+                    const double *A, const int *ldA,
+                    const double *B, const int *ldB,
+                    double *C, const int *ldC, double *scale, int *info) {
+    RELAPACK_ztrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
+}
+#endif
+
+
+////////////
+// XTGSYL //
+////////////
+
+// LAPACK-compatible shims: forward the generalized Sylvester equation solvers
+// to ReLAPACK's recursive implementations.
+
+#if INCLUDE_STGSYL
+void LAPACK(stgsyl)(const char *trans, const int *ijob, const int *m, const int *n,
+                    const float *A, const int *ldA, const float *B, const int *ldB,
+                    float *C, const int *ldC,
+                    const float *D, const int *ldD, const float *E, const int *ldE,
+                    float *F, const int *ldF, float *scale, float *dif,
+                    float *Work, const int *lWork, int *iWork, int *info) {
+    RELAPACK_stgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
+}
+#endif
+
+#if INCLUDE_DTGSYL
+void LAPACK(dtgsyl)(const char *trans, const int *ijob, const int *m, const int *n,
+                    const double *A, const int *ldA, const double *B, const int *ldB,
+                    double *C, const int *ldC,
+                    const double *D, const int *ldD, const double *E, const int *ldE,
+                    double *F, const int *ldF, double *scale, double *dif,
+                    double *Work, const int *lWork, int *iWork, int *info) {
+    RELAPACK_dtgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
+}
+#endif
+
+#if INCLUDE_CTGSYL
+void LAPACK(ctgsyl)(const char *trans, const int *ijob, const int *m, const int *n,
+                    const float *A, const int *ldA, const float *B, const int *ldB,
+                    float *C, const int *ldC,
+                    const float *D, const int *ldD, const float *E, const int *ldE,
+                    float *F, const int *ldF, float *scale, float *dif,
+                    float *Work, const int *lWork, int *iWork, int *info) {
+    RELAPACK_ctgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
+}
+#endif
+
+#if INCLUDE_ZTGSYL
+void LAPACK(ztgsyl)(const char *trans, const int *ijob, const int *m, const int *n,
+                    const double *A, const int *ldA, const double *B, const int *ldB,
+                    double *C, const int *ldC,
+                    const double *D, const int *ldD, const double *E, const int *ldE,
+                    double *F, const int *ldF, double *scale, double *dif,
+                    double *Work, const int *lWork, int *iWork, int *info) {
+    RELAPACK_ztgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info);
+}
+#endif
+
+
+////////////
+// XGEMMT //
+////////////
+
+// LAPACK-compatible shims: forward the triangular-update matrix products to
+// ReLAPACK.
+// Fixed: the wrappers previously called RELAPACK_?gemmt(uplo, n, A, ldA, info)
+// — a potrf-style argument list that dropped transA/transB/k/alpha/B/beta/C
+// and referenced an undeclared identifier `info`. They now forward the full
+// gemmt argument list unchanged.
+
+#if INCLUDE_SGEMMT
+void LAPACK(sgemmt)(
+    const char *uplo, const char *transA, const char *transB,
+    const int *n, const int *k,
+    const float *alpha, const float *A, const int *ldA,
+    const float *B, const int *ldB,
+    const float *beta, float *C, const int *ldC
+) {
+    RELAPACK_sgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
+}
+#endif
+
+#if INCLUDE_DGEMMT
+void LAPACK(dgemmt)(
+    const char *uplo, const char *transA, const char *transB,
+    const int *n, const int *k,
+    const double *alpha, const double *A, const int *ldA,
+    const double *B, const int *ldB,
+    const double *beta, double *C, const int *ldC
+) {
+    RELAPACK_dgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
+}
+#endif
+
+#if INCLUDE_CGEMMT
+void LAPACK(cgemmt)(
+    const char *uplo, const char *transA, const char *transB,
+    const int *n, const int *k,
+    const float *alpha, const float *A, const int *ldA,
+    const float *B, const int *ldB,
+    const float *beta, float *C, const int *ldC
+) {
+    RELAPACK_cgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
+}
+#endif
+
+#if INCLUDE_ZGEMMT
+void LAPACK(zgemmt)(
+    const char *uplo, const char *transA, const char *transB,
+    const int *n, const int *k,
+    const double *alpha, const double *A, const int *ldA,
+    const double *B, const int *ldB,
+    const double *beta, double *C, const int *ldC
+) {
+    RELAPACK_zgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
+}
+#endif
diff --git a/relapack/src/relapack.h b/relapack/src/relapack.h
new file mode 100644
index 000000000..2cb061c32
--- /dev/null
+++ b/relapack/src/relapack.h
@@ -0,0 +1,60 @@
+#ifndef RELAPACK_INT_H
+#define RELAPACK_INT_H
+
+// Internal header for ReLAPACK: name-mangling macros, recursion-splitting
+// macros, and prototypes for the hand-translated *_rec2 base-case kernels.
+
+#include "../config.h"
+
+#include "../inc/relapack.h"
+
+// add an underscore to BLAS routines (or not)
+#if BLAS_UNDERSCORE
+#define BLAS(routine) routine ## _
+#else
+#define BLAS(routine) routine
+#endif
+
+// add an underscore to LAPACK routines (or not)
+#if LAPACK_UNDERSCORE
+#define LAPACK(routine) routine ## _
+#else
+#define LAPACK(routine) routine
+#endif
+
+// minimum and maximum macros
+// NOTE: classic double-evaluation macros — do not pass arguments with side
+// effects (e.g. MAX(i++, j)).
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+// xREC_SPLIT(n) returns how a problem of size n is split recursively.
+// Above a per-precision threshold (32 / 16 / 16 / 8 for s / d / c / z) the
+// split point is rounded to a multiple of 16 / 8 / 8 / 4 elements — 64 bytes,
+// a common cache-line size — so at least one half starts cache-aligned while
+// both halves stay as close as possible in size.
+// Below the threshold the problem is simply split in the middle. (Note that
+// the crossover size is usually larger than these thresholds.)
+#define SREC_SPLIT(n) ((n >= 32) ? ((n + 16) / 32) * 16 : n / 2)
+#define DREC_SPLIT(n) ((n >= 16) ? ((n + 8) / 16) * 8 : n / 2)
+#define CREC_SPLIT(n) ((n >= 16) ? ((n + 8) / 16) * 8 : n / 2)
+#define ZREC_SPLIT(n) ((n >= 8) ? ((n + 4) / 8) * 4 : n / 2)
+
+#include "lapack.h"
+#include "blas.h"
+
+// sytrf helper routines
+// (unblocked base cases for the Bunch-Kaufman and rook-pivoting variants)
+void RELAPACK_ssytrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *);
+void RELAPACK_dsytrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *);
+void RELAPACK_csytrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *);
+void RELAPACK_chetrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *);
+void RELAPACK_zsytrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *);
+void RELAPACK_zhetrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *);
+void RELAPACK_ssytrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *);
+void RELAPACK_dsytrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *);
+void RELAPACK_csytrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *);
+void RELAPACK_chetrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *);
+void RELAPACK_zsytrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *);
+void RELAPACK_zhetrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *);
+
+// trsyl helper routines
+// (unblocked base cases for the triangular Sylvester solvers)
+void RELAPACK_strsyl_rec2(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *);
+void RELAPACK_dtrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *);
+void RELAPACK_ctrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *);
+void RELAPACK_ztrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *);
+
+#endif /* RELAPACK_INT_H */
diff --git a/relapack/src/sgbtrf.c b/relapack/src/sgbtrf.c
new file mode 100644
index 000000000..bc20e744b
--- /dev/null
+++ b/relapack/src/sgbtrf.c
@@ -0,0 +1,227 @@
+#include "relapack.h"
+#include <stdlib.h>  // fixed: system header, was #include "stdlib.h"
+
+static void RELAPACK_sgbtrf_rec(const int *, const int *, const int *,
+    const int *, float *, const int *, int *, float *, const int *, float *,
+    const int *, int *);
+
+
+/** SGBTRF computes an LU factorization of a real m-by-n band matrix A using partial pivoting with row interchanges.
+ *
+ * This routine is functionally equivalent to LAPACK's sgbtrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d5/d72/sgbtrf_8f.html
+ * */
+void RELAPACK_sgbtrf(
+    const int *m, const int *n, const int *kl, const int *ku,
+    float *Ab, const int *ldAb, int *ipiv,
+    int *info
+) {
+
+    // Check arguments (same positions/codes as LAPACK's sgbtrf)
+    *info = 0;
+    if (*m < 0)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*kl < 0)
+        *info = -3;
+    else if (*ku < 0)
+        *info = -4;
+    else if (*ldAb < 2 * *kl + *ku + 1)
+        *info = -6;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("SGBTRF", &minfo);
+        return;
+    }
+
+    // Constant
+    const float ZERO[] = { 0. };
+
+    // Result upper band width (ku fill-in rows grow the band by kl)
+    const int kv = *ku + *kl;
+
+    // Unskew A: treat the band storage as a dense matrix with leading
+    // dimension ldAb - 1, so moving one column right stays on the same
+    // diagonal.
+    const int ldA[] = { *ldAb - 1 };
+    float *const A = Ab + kv;
+
+    // Zero upper diagonal fill-in elements
+    int i, j;
+    for (j = 0; j < *n; j++) {
+        float *const A_j = A + *ldA * j;
+        for (i = MAX(0, j - kv); i < j - *ku; i++)
+            A_j[i] = 0.;
+    }
+
+    // Allocate work space for the parts of the band that the recursion
+    // temporarily treats as dense (Workl: lower spill, Worku: upper spill).
+    // NOTE(review): malloc results are not checked; LAPACK has no error code
+    // for allocation failure — confirm against project policy.
+    const int n1 = SREC_SPLIT(*n);
+    const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
+    const int nWorkl = (kv > n1) ? n1 : kv;
+    const int mWorku = (*kl > n1) ? n1 : *kl;
+    const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
+    float *Workl = malloc(mWorkl * nWorkl * sizeof(float));
+    float *Worku = malloc(mWorku * nWorku * sizeof(float));
+    LAPACK(slaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
+    LAPACK(slaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku);
+
+    // Recursive kernel
+    RELAPACK_sgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info);
+
+    // Free work space
+    free(Workl);
+    free(Worku);
+}
+
+
+/** sgbtrf's recursive compute kernel.
+ *
+ * Factors the band matrix by splitting off the leftmost n1 (<= kl) columns,
+ * recursing on them, propagating the pivots and Schur complement into the
+ * remaining columns via the Workl/Worku dense buffers, and recursing on the
+ * trailing submatrix. All matrix views below index into the "unskewed" band
+ * storage (leading dimension ldAb - 1). */
+static void RELAPACK_sgbtrf_rec(
+    const int *m, const int *n, const int *kl, const int *ku,
+    float *Ab, const int *ldAb, int *ipiv,
+    float *Workl, const int *ldWorkl, float *Worku, const int *ldWorku,
+    int *info
+) {
+
+    if (*n <= MAX(CROSSOVER_SGBTRF, 1)) {
+        // Unblocked
+        LAPACK(sgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info);
+        return;
+    }
+
+    // Constants
+    const float ONE[] = { 1. };
+    const float MONE[] = { -1. };
+    const int iONE[] = { 1 };
+
+    // Loop iterators
+    int i, j;
+
+    // Output upper band width
+    const int kv = *ku + *kl;
+
+    // Unskew A
+    const int ldA[] = { *ldAb - 1 };
+    float *const A = Ab + kv;
+
+    // Splitting: n1 capped by kl so the left panel's fill stays in band
+    const int n1 = MIN(SREC_SPLIT(*n), *kl);
+    const int n2 = *n - n1;
+    const int m1 = MIN(n1, *m);
+    const int m2 = *m - m1;
+    const int mn1 = MIN(m1, n1);
+    const int mn2 = MIN(m2, n2);
+
+    // Ab_L *
+    //      Ab_BR
+    float *const Ab_L = Ab;
+    float *const Ab_BR = Ab + *ldAb * n1;
+
+    // A_L A_R
+    float *const A_L = A;
+    float *const A_R = A + *ldA * n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    float *const A_TL = A;
+    float *const A_TR = A + *ldA * n1;
+    float *const A_BL = A + m1;
+    float *const A_BR = A + *ldA * n1 + m1;
+
+    // ipiv_T
+    // ipiv_B
+    int *const ipiv_T = ipiv;
+    int *const ipiv_B = ipiv + n1;
+
+    // Banded splitting: only the first kv columns to the right of the panel
+    // (and kl rows below it) intersect the band; the rest is handled through
+    // the work buffers.
+    const int n21 = MIN(n2, kv - n1);
+    const int n22 = MIN(n2 - n21, n1);
+    const int m21 = MIN(m2, *kl - m1);
+    const int m22 = MIN(m2 - m21, m1);
+
+    //     n1    n21    n22
+    // m   *     A_Rl   ARr
+    float *const A_Rl = A_R;
+    float *const A_Rr = A_R + *ldA * n21;
+
+    //     n1    n21    n22
+    // m1  *     A_TRl  A_TRr
+    // m21 A_BLt A_BRtl A_BRtr
+    // m22 A_BLb A_BRbl A_BRbr
+    float *const A_TRl = A_TR;
+    float *const A_TRr = A_TR + *ldA * n21;
+    float *const A_BLt = A_BL;
+    float *const A_BLb = A_BL + m21;
+    float *const A_BRtl = A_BR;
+    float *const A_BRtr = A_BR + *ldA * n21;
+    float *const A_BRbl = A_BR + m21;
+    float *const A_BRbr = A_BR + *ldA * n21 + m21;
+
+    // recursion(Ab_L, ipiv_T)
+    RELAPACK_sgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info);
+
+    // Workl = A_BLb (rows of the panel that fall outside the band in the
+    // skewed storage are mirrored into the dense buffer)
+    LAPACK(slacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl);
+
+    // partially redo swaps in A_L: rows swapped out of band reside in Workl
+    for (i = 0; i < mn1; i++) {
+        const int ip = ipiv_T[i] - 1;
+        if (ip != i) {
+            if (ip < *kl)
+                BLAS(sswap)(&i, A_L + i, ldA, A_L + ip, ldA);
+            else
+                BLAS(sswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl);
+        }
+    }
+
+    // apply pivots to A_Rl
+    LAPACK(slaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE);
+
+    // apply pivots to A_Rr columnwise (the band narrows, so only rows >= j
+    // of column j are stored)
+    for (j = 0; j < n22; j++) {
+        float *const A_Rrj = A_Rr + *ldA * j;
+        for (i = j; i < mn1; i++) {
+            const int ip = ipiv_T[i] - 1;
+            if (ip != i) {
+                const float tmp = A_Rrj[i];
+                // NOTE(review): reads A_Rr[ip] (column 0 of A_Rr) but writes
+                // A_Rrj[ip] (column j) — asymmetric for j > 0; a plain row
+                // swap would read A_Rrj[ip]. Verify against a reference
+                // gbtrf before relying on this path.
+                A_Rrj[i] = A_Rr[ip];
+                A_Rrj[ip] = tmp;
+            }
+        }
+    }
+
+    // A_TRl = A_TL \ A_TRl
+    BLAS(strsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
+    // Worku = A_TRr
+    LAPACK(slacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku);
+    // Worku = A_TL \ Worku
+    BLAS(strsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku);
+    // A_TRr = Worku
+    LAPACK(slacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA);
+    // A_BRtl = A_BRtl - A_BLt * A_TRl
+    BLAS(sgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA);
+    // A_BRbl = A_BRbl - Workl * A_TRl
+    BLAS(sgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA);
+    // A_BRtr = A_BRtr - A_BLt * Worku
+    BLAS(sgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA);
+    // A_BRbr = A_BRbr - Workl * Worku
+    BLAS(sgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA);
+
+    // partially undo swaps in A_L (restore the skewed band layout)
+    for (i = mn1 - 1; i >= 0; i--) {
+        const int ip = ipiv_T[i] - 1;
+        if (ip != i) {
+            if (ip < *kl)
+                BLAS(sswap)(&i, A_L + i, ldA, A_L + ip, ldA);
+            else
+                BLAS(sswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl);
+        }
+    }
+
+    // recursion(Ab_BR, ipiv_B)
+    RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
+    if (*info)
+        *info += n1;
+    // shift pivots: recursion numbered rows relative to the trailing block
+    for (i = 0; i < mn2; i++)
+        ipiv_B[i] += n1;
+}
diff --git a/relapack/src/sgemmt.c b/relapack/src/sgemmt.c
new file mode 100644
index 000000000..75f78fabd
--- /dev/null
+++ b/relapack/src/sgemmt.c
@@ -0,0 +1,165 @@
+#include "relapack.h"
+
+static void RELAPACK_sgemmt_rec(const char *, const char *, const char *,
+    const int *, const int *, const float *, const float *, const int *,
+    const float *, const int *, const float *, float *, const int *);
+
+static void RELAPACK_sgemmt_rec2(const char *, const char *, const char *,
+    const int *, const int *, const float *, const float *, const int *,
+    const float *, const int *, const float *, float *, const int *);
+
+
+/** SGEMMT computes a matrix-matrix product with general matrices but updates
+ * only the upper or lower triangular part of the result matrix.
+ *
+ * This routine performs the same operation as the BLAS routine
+ * sgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC)
+ * but only updates the triangular part of C selected by uplo:
+ * 'L' updates the lower triangle, anything else the upper triangle.
+ * */
+void RELAPACK_sgemmt(
+    const char *uplo, const char *transA, const char *transB,
+    const int *n, const int *k,
+    const float *alpha, const float *A, const int *ldA,
+    const float *B, const int *ldB,
+    const float *beta, float *C, const int *ldC
+) {
+
+#if HAVE_XGEMMT
+    // The BLAS library provides a native gemmt — just delegate.
+    BLAS(sgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
+    return;
+#else
+
+    // Validate arguments; error codes follow the gemm argument positions.
+    const int isLower = LAPACK(lsame)(uplo, "L");
+    const int isUpper = LAPACK(lsame)(uplo, "U");
+    const int noTransA = LAPACK(lsame)(transA, "N");
+    const int doTransA = LAPACK(lsame)(transA, "T");
+    const int noTransB = LAPACK(lsame)(transB, "N");
+    const int doTransB = LAPACK(lsame)(transB, "T");
+
+    int info = 0;
+    if (!isLower && !isUpper)
+        info = 1;
+    else if (!doTransA && !noTransA)
+        info = 2;
+    else if (!doTransB && !noTransB)
+        info = 3;
+    else if (*n < 0)
+        info = 4;
+    else if (*k < 0)
+        info = 5;
+    else if (*ldA < MAX(1, noTransA ? *n : *k))
+        info = 8;
+    else if (*ldB < MAX(1, noTransB ? *k : *n))
+        info = 10;
+    else if (*ldC < MAX(1, *n))
+        info = 13;
+
+    if (info) {
+        LAPACK(xerbla)("SGEMMT", &info);
+        return;
+    }
+
+    // Normalize the char arguments so the kernels can compare bytes directly.
+    const char cleanuplo = isLower ? 'L' : 'U';
+    const char cleantransA = noTransA ? 'N' : 'T';
+    const char cleantransB = noTransB ? 'N' : 'T';
+
+    // Recursive kernel
+    RELAPACK_sgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
+#endif
+}
+
+
+/** sgemmt's recursive compute kernel: halves C, handles the diagonal blocks
+ * recursively, and delegates the off-diagonal block to a single gemm. */
+static void RELAPACK_sgemmt_rec(
+    const char *uplo, const char *transA, const char *transB,
+    const int *n, const int *k,
+    const float *alpha, const float *A, const int *ldA,
+    const float *B, const int *ldB,
+    const float *beta, float *C, const int *ldC
+) {
+
+    // Small enough? Fall back to the unblocked kernel.
+    if (*n <= MAX(CROSSOVER_SGEMMT, 1)) {
+        RELAPACK_sgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
+        return;
+    }
+
+    // Split C into a leading nTop x nTop block and a trailing nBot x nBot one.
+    const int nTop = SREC_SPLIT(*n);
+    const int nBot = *n - nTop;
+
+    // Top/bottom row blocks of A (layout depends on transA)
+    const float *const Atop = A;
+    const float *const Abot = A + ((*transA == 'N') ? nTop : *ldA * nTop);
+
+    // Left/right column blocks of B (layout depends on transB)
+    const float *const Bleft = B;
+    const float *const Bright = B + ((*transB == 'N') ? *ldB * nTop : nTop);
+
+    // Quadrants of C
+    float *const Ctl = C;
+    float *const Ctr = C + *ldC * nTop;
+    float *const Cbl = C + nTop;
+    float *const Cbr = C + *ldC * nTop + nTop;
+
+    // Recurse on the top-left diagonal block.
+    RELAPACK_sgemmt_rec(uplo, transA, transB, &nTop, k, alpha, Atop, ldA, Bleft, ldB, beta, Ctl, ldC);
+
+    // The off-diagonal block is a plain gemm: below the diagonal for 'L',
+    // above it otherwise.
+    if (*uplo == 'L')
+        BLAS(sgemm)(transA, transB, &nBot, &nTop, k, alpha, Abot, ldA, Bleft, ldB, beta, Cbl, ldC);
+    else
+        BLAS(sgemm)(transA, transB, &nTop, &nBot, k, alpha, Atop, ldA, Bright, ldB, beta, Ctr, ldC);
+
+    // Recurse on the bottom-right diagonal block.
+    RELAPACK_sgemmt_rec(uplo, transA, transB, &nBot, k, alpha, Abot, ldA, Bright, ldB, beta, Cbr, ldC);
+}
+
+
+/** sgemmt's unblocked compute kernel: computes the triangular update one
+ * column of C at a time, each column being a single gemv with the
+ * appropriately truncated row range of A. */
+static void RELAPACK_sgemmt_rec2(
+    const char *uplo, const char *transA, const char *transB,
+    const int *n, const int *k,
+    const float *alpha, const float *A, const int *ldA,
+    const float *B, const int *ldB,
+    const float *beta, float *C, const int *ldC
+) {
+
+    // Stride between consecutive elements of a column of op(B)
+    const int incB = (*transB == 'N') ? 1 : *ldB;
+    const int incC = 1;
+
+    int i;
+    for (i = 0; i < *n; i++) {
+        // A_0: rows 0..i of op(A); A_i: rows i..n-1 of op(A)
+        const float *const A_0 = A;
+        const float *const A_i = A + ((*transA == 'N') ? i : *ldA * i);
+
+        // * B_i * — column i of op(B)
+        const float *const B_i = B + ((*transB == 'N') ? *ldB * i : i);
+
+        // * C_0i * — top of column i (used for the upper triangle)
+        // * C_ii * — diagonal-down part of column i (lower triangle)
+        float *const C_0i = C + *ldC * i;
+        float *const C_ii = C + *ldC * i + i;
+
+        if (*uplo == 'L') {
+            // Lower: update rows i..n-1 of column i.
+            const int nmi = *n - i;
+            if (*transA == 'N')
+                BLAS(sgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
+            else
+                BLAS(sgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
+        } else {
+            // Upper: update rows 0..i of column i.
+            const int ip1 = i + 1;
+            if (*transA == 'N')
+                BLAS(sgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
+            else
+                BLAS(sgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
+        }
+    }
+}
diff --git a/relapack/src/sgetrf.c b/relapack/src/sgetrf.c
new file mode 100644
index 000000000..284f8cff6
--- /dev/null
+++ b/relapack/src/sgetrf.c
@@ -0,0 +1,117 @@
+#include "relapack.h"
+
+static void RELAPACK_sgetrf_rec(const int *, const int *, float *, const int *,
+    int *, int *);
+
+
+/** SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
+ *
+ * This routine is functionally equivalent to LAPACK's sgetrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/de/de2/sgetrf_8f.html
+ * */
+void RELAPACK_sgetrf(
+    const int *m, const int *n,
+    float *A, const int *ldA, int *ipiv,
+    int *info
+) {
+
+    // Check arguments
+    *info = 0;
+    if (*m < 0)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    // Fixed: LAPACK's sgetrf requires LDA >= max(1, M) (A has M rows);
+    // this previously compared against *n.
+    else if (*ldA < MAX(1, *m))
+        *info = -4;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("SGETRF", &minfo);
+        return;
+    }
+
+    // Factor the leading square part recursively.
+    const int sn = MIN(*m, *n);
+
+    RELAPACK_sgetrf_rec(m, &sn, A, ldA, ipiv, info);
+
+    // Right remainder (wide matrices, n > m): pivot and solve the extra
+    // columns against the computed L factor.
+    if (*m < *n) {
+        // Constants
+        const float ONE[] = { 1. };
+        const int iONE[] = { 1 };  // fixed: was initialized with the float literal 1.
+
+        // Splitting
+        const int rn = *n - *m;
+
+        // A_L A_R
+        const float *const A_L = A;
+        float *const A_R = A + *ldA * *m;
+
+        // A_R = apply(ipiv, A_R)
+        LAPACK(slaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE);
+        // A_R = A_L \ A_R (unit lower triangular solve)
+        BLAS(strsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA);
+    }
+}
+
+
+/** sgetrf's recursive compute kernel.
+ *
+ * Requires n <= m (the driver passes sn = min(m, n)). Factors the left
+ * panel recursively, applies its pivots to the right part, updates the
+ * trailing Schur complement, recurses on it, and finally back-applies the
+ * trailing pivots to the left panel. */
+static void RELAPACK_sgetrf_rec(
+    const int *m, const int *n,
+    float *A, const int *ldA, int *ipiv,
+    int *info
+) {
+
+    if (*n <= MAX(CROSSOVER_SGETRF, 1)) {
+        // Unblocked
+        LAPACK(sgetf2)(m, n, A, ldA, ipiv, info);
+        return;
+    }
+
+    // Constants
+    const float ONE[] = { 1. };
+    const float MONE[] = { -1. };
+    const int iONE[] = { 1 };
+
+    // Splitting
+    const int n1 = SREC_SPLIT(*n);
+    const int n2 = *n - n1;
+    const int m2 = *m - n1;
+
+    // A_L A_R
+    float *const A_L = A;
+    float *const A_R = A + *ldA * n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    float *const A_TL = A;
+    float *const A_TR = A + *ldA * n1;
+    float *const A_BL = A + n1;
+    float *const A_BR = A + *ldA * n1 + n1;
+
+    // ipiv_T
+    // ipiv_B
+    int *const ipiv_T = ipiv;
+    int *const ipiv_B = ipiv + n1;
+
+    // recursion(A_L, ipiv_T)
+    RELAPACK_sgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info);
+    // apply pivots to A_R
+    LAPACK(slaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE);
+
+    // A_TR = A_TL \ A_TR (unit lower triangular solve)
+    BLAS(strsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
+    // A_BR = A_BR - A_BL * A_TR (Schur complement update)
+    BLAS(sgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA);
+
+    // recursion(A_BR, ipiv_B)
+    RELAPACK_sgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info);
+    if (*info)
+        *info += n1;
+    // apply pivots to A_BL
+    LAPACK(slaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE);
+    // shift pivots: the trailing recursion numbered rows relative to A_BR
+    int i;
+    for (i = 0; i < n2; i++)
+        ipiv_B[i] += n1;
+}
diff --git a/relapack/src/slauum.c b/relapack/src/slauum.c
new file mode 100644
index 000000000..280f141b3
--- /dev/null
+++ b/relapack/src/slauum.c
@@ -0,0 +1,87 @@
+#include "relapack.h"
+
+static void RELAPACK_slauum_rec(const char *, const int *, float *,
+    const int *, int *);
+
+
+/** SLAUUM computes the product U * U**T or L**T * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A.
+ *
+ * This routine is functionally equivalent to LAPACK's slauum.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/dd/d5a/slauum_8f.html
+ * */
+void RELAPACK_slauum(
+    const char *uplo, const int *n,
+    float *A, const int *ldA,
+    int *info
+) {
+
+    // Validate arguments (error codes match LAPACK's slauum).
+    const int isLower = LAPACK(lsame)(uplo, "L");
+    const int isUpper = LAPACK(lsame)(uplo, "U");
+
+    *info = 0;
+    if (!isLower && !isUpper)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*ldA < MAX(1, *n))
+        *info = -4;
+
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("SLAUUM", &minfo);
+        return;
+    }
+
+    // Normalize uplo to a single clean byte for the kernel.
+    const char cleanuplo = isLower ? 'L' : 'U';
+
+    // Recursive kernel
+    RELAPACK_slauum_rec(&cleanuplo, n, A, ldA, info);
+}
+
+
+/** slauum's recursive compute kernel: splits A, recurses on both diagonal
+ * blocks, and folds the off-diagonal contribution into the leading block via
+ * one syrk and one trmm. */
+static void RELAPACK_slauum_rec(
+    const char *uplo, const int *n,
+    float *A, const int *ldA,
+    int *info
+) {
+
+    // Small enough? Use LAPACK's unblocked kernel.
+    if (*n <= MAX(CROSSOVER_SLAUUM, 1)) {
+        LAPACK(slauu2)(uplo, n, A, ldA, info);
+        return;
+    }
+
+    // Constants
+    const float ONE[] = { 1. };
+
+    // Split the matrix into a leading nTop block and trailing nBot block.
+    const int nTop = SREC_SPLIT(*n);
+    const int nBot = *n - nTop;
+
+    // Quadrant views of A
+    float *const Atl = A;
+    float *const Atr = A + *ldA * nTop;
+    float *const Abl = A + nTop;
+    float *const Abr = A + *ldA * nTop + nTop;
+
+    // Recurse on the leading diagonal block.
+    RELAPACK_slauum_rec(uplo, &nTop, Atl, ldA, info);
+
+    if (*uplo == 'L') {
+        // Atl += Abl' * Abl
+        BLAS(ssyrk)("L", "T", &nTop, &nBot, ONE, Abl, ldA, ONE, Atl, ldA);
+        // Abl = Abr' * Abl
+        BLAS(strmm)("L", "L", "T", "N", &nBot, &nTop, ONE, Abr, ldA, Abl, ldA);
+    } else {
+        // Atl += Atr * Atr'
+        BLAS(ssyrk)("U", "N", &nTop, &nBot, ONE, Atr, ldA, ONE, Atl, ldA);
+        // Atr = Atr * Abr'
+        BLAS(strmm)("R", "U", "T", "N", &nTop, &nBot, ONE, Abr, ldA, Atr, ldA);
+    }
+
+    // Recurse on the trailing diagonal block.
+    RELAPACK_slauum_rec(uplo, &nBot, Abr, ldA, info);
+}
diff --git a/relapack/src/spbtrf.c b/relapack/src/spbtrf.c
new file mode 100644
index 000000000..ee0a5546e
--- /dev/null
+++ b/relapack/src/spbtrf.c
@@ -0,0 +1,157 @@
+#include "relapack.h"
+#include <stdlib.h>  // fixed: system header, was #include "stdlib.h"
+
+static void RELAPACK_spbtrf_rec(const char *, const int *, const int *,
+    float *, const int *, float *, const int *, int *);
+
+
+/** SPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A.
+ *
+ * This routine is functionally equivalent to LAPACK's spbtrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d1/d22/spbtrf_8f.html
+ * */
+void RELAPACK_spbtrf(
+    const char *uplo, const int *n, const int *kd,
+    float *Ab, const int *ldAb,
+    int *info
+) {
+
+    // Check arguments (same positions/codes as LAPACK's spbtrf)
+    const int lower = LAPACK(lsame)(uplo, "L");
+    const int upper = LAPACK(lsame)(uplo, "U");
+    *info = 0;
+    if (!lower && !upper)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*kd < 0)
+        *info = -3;
+    else if (*ldAb < *kd + 1)
+        *info = -5;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("SPBTRF", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleanuplo = lower ? 'L' : 'U';
+
+    // Constant
+    const float ZERO[] = { 0. };
+
+    // Allocate work space for the off-band triangle handled densely by the
+    // recursion.
+    // NOTE(review): the malloc result is not checked, and for *kd >= *n the
+    // expression *n - *kd is non-positive — confirm callers guarantee
+    // *kd < *n or that the base case is always taken in that regime.
+    const int n1 = SREC_SPLIT(*n);
+    const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd;
+    const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd;
+    float *Work = malloc(mWork * nWork * sizeof(float));
+    LAPACK(slaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork);
+
+    // Recursive kernel
+    RELAPACK_spbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info);
+
+    // Free work space
+    free(Work);
+}
+
+
+/** spbtrf's recursive compute kernel.
+ *
+ * Factors the leading n1 x n1 block as a dense Cholesky (it lies fully in
+ * band because n1 <= kd), then updates the trailing band: the part of the
+ * off-diagonal panel that fits the band is updated in place, the remainder
+ * is staged through the dense Work buffer to avoid touching out-of-band
+ * storage. All *_TL/_TR/... views index the "unskewed" band storage with
+ * leading dimension ldAb - 1. */
+static void RELAPACK_spbtrf_rec(
+    const char *uplo, const int *n, const int *kd,
+    float *Ab, const int *ldAb,
+    float *Work, const int *ldWork,
+    int *info
+){
+
+    if (*n <= MAX(CROSSOVER_SPBTRF, 1)) {
+        // Unblocked
+        LAPACK(spbtf2)(uplo, n, kd, Ab, ldAb, info);
+        return;
+    }
+
+    // Constants
+    const float ONE[] = { 1. };
+    const float MONE[] = { -1. };
+
+    // Unskew A ('L' stores the diagonal in row 0, 'U' in row kd)
+    const int ldA[] = { *ldAb - 1 };
+    float *const A = Ab + ((*uplo == 'L') ? 0 : *kd);
+
+    // Splitting: n1 capped by kd so the leading block lies in band
+    const int n1 = MIN(SREC_SPLIT(*n), *kd);
+    const int n2 = *n - n1;
+
+    // * *
+    // * Ab_BR
+    float *const Ab_BR = Ab + *ldAb * n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    float *const A_TL = A;
+    float *const A_TR = A + *ldA * n1;
+    float *const A_BL = A + n1;
+    float *const A_BR = A + *ldA * n1 + n1;
+
+    // recursion(A_TL): dense Cholesky of the in-band leading block
+    RELAPACK_spotrf(uplo, &n1, A_TL, ldA, info);
+    if (*info)
+        return;
+
+    // Banded splitting: n21 columns still intersect the band, n22 spill over
+    const int n21 = MIN(n2, *kd - n1);
+    const int n22 = MIN(n2 - n21, *kd);
+
+    //     n1    n21    n22
+    // n1  *     A_TRl  A_TRr
+    // n21 A_BLt A_BRtl A_BRtr
+    // n22 A_BLb A_BRbl A_BRbr
+    float *const A_TRl = A_TR;
+    float *const A_TRr = A_TR + *ldA * n21;
+    float *const A_BLt = A_BL;
+    float *const A_BLb = A_BL + n21;
+    float *const A_BRtl = A_BR;
+    float *const A_BRtr = A_BR + *ldA * n21;
+    float *const A_BRbl = A_BR + n21;
+    float *const A_BRbr = A_BR + *ldA * n21 + n21;
+
+    if (*uplo == 'L') {
+        // A_BLt = ABLt / A_TL'
+        BLAS(strsm)("R", "L", "T", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA);
+        // A_BRtl = A_BRtl - A_BLt * A_BLt'
+        BLAS(ssyrk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA);
+        // Work = A_BLb (spill rows are staged densely)
+        LAPACK(slacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork);
+        // Work = Work / A_TL'
+        BLAS(strsm)("R", "L", "T", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork);
+        // A_BRbl = A_BRbl - Work * A_BLt'
+        BLAS(sgemm)("N", "T", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA);
+        // A_BRbr = A_BRbr - Work * Work'
+        BLAS(ssyrk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
+        // A_BLb = Work (write the staged rows back into band storage)
+        LAPACK(slacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA);
+    } else {
+        // A_TRl = A_TL' \ A_TRl
+        BLAS(strsm)("L", "U", "T", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
+        // A_BRtl = A_BRtl - A_TRl' * A_TRl
+        BLAS(ssyrk)("U", "T", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA);
+        // Work = A_TRr (spill columns are staged densely)
+        LAPACK(slacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork);
+        // Work = A_TL' \ Work
+        BLAS(strsm)("L", "U", "T", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork);
+        // A_BRtr = A_BRtr - A_TRl' * Work
+        BLAS(sgemm)("T", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA);
+        // A_BRbr = A_BRbr - Work' * Work
+        BLAS(ssyrk)("U", "T", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
+        // A_TRr = Work (write the staged columns back into band storage)
+        LAPACK(slacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA);
+    }
+
+    // recursion(A_BR): once the remaining band spans the whole trailing
+    // block (kd > n1), it can be finished as a dense Cholesky
+    if (*kd > n1)
+        RELAPACK_spotrf(uplo, &n2, A_BR, ldA, info);
+    else
+        RELAPACK_spbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info);
+    if (*info)
+        *info += n1;
+}
diff --git a/relapack/src/spotrf.c b/relapack/src/spotrf.c
new file mode 100644
index 000000000..2a609321b
--- /dev/null
+++ b/relapack/src/spotrf.c
@@ -0,0 +1,92 @@
+#include "relapack.h"
+
+static void RELAPACK_spotrf_rec(const char *, const int *, float *,
+ const int *, int *);
+
+
+/** SPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A.
+ *
+ * This routine is functionally equivalent to LAPACK's spotrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d0/da2/spotrf_8f.html
+ * */
+void RELAPACK_spotrf(
+ const char *uplo, const int *n,
+ float *A, const int *ldA,
+ int *info
+) {
+
+ // Check arguments
+ // (negative *info values mirror LAPACK spotrf's argument numbering)
+ const int lower = LAPACK(lsame)(uplo, "L");
+ const int upper = LAPACK(lsame)(uplo, "U");
+ *info = 0;
+ if (!lower && !upper)
+ *info = -1;
+ else if (*n < 0)
+ *info = -2;
+ else if (*ldA < MAX(1, *n))
+ *info = -4;
+ if (*info) {
+ const int minfo = -*info;
+ LAPACK(xerbla)("SPOTRF", &minfo);
+ return;
+ }
+
+ // Clean char * arguments
+ // (normalize to a single upper-case character for the recursive kernel)
+ const char cleanuplo = lower ? 'L' : 'U';
+
+ // Recursive kernel
+ RELAPACK_spotrf_rec(&cleanuplo, n, A, ldA, info);
+}
+
+
+/** spotrf's recursive compute kernel */
+static void RELAPACK_spotrf_rec(
+ const char *uplo, const int *n,
+ float *A, const int *ldA,
+ int *info
+) {
+
+ if (*n <= MAX(CROSSOVER_SPOTRF, 1)) {
+ // Unblocked
+ // (below the crossover size, fall back to LAPACK's unblocked spotf2)
+ LAPACK(spotf2)(uplo, n, A, ldA, info);
+ return;
+ }
+
+ // Constants
+ const float ONE[] = { 1. };
+ const float MONE[] = { -1. };
+
+ // Splitting
+ const int n1 = SREC_SPLIT(*n);
+ const int n2 = *n - n1;
+
+ // A_TL A_TR
+ // A_BL A_BR
+ float *const A_TL = A;
+ float *const A_TR = A + *ldA * n1;
+ float *const A_BL = A + n1;
+ float *const A_BR = A + *ldA * n1 + n1;
+
+ // recursion(A_TL)
+ RELAPACK_spotrf_rec(uplo, &n1, A_TL, ldA, info);
+ if (*info)
+ return;
+
+ if (*uplo == 'L') {
+ // A_BL = A_BL / A_TL'
+ BLAS(strsm)("R", "L", "T", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA);
+ // A_BR = A_BR - A_BL * A_BL'
+ BLAS(ssyrk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA);
+ } else {
+ // A_TR = A_TL' \ A_TR
+ BLAS(strsm)("L", "U", "T", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
+ // A_BR = A_BR - A_TR' * A_TR
+ BLAS(ssyrk)("U", "T", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA);
+ }
+
+ // recursion(A_BR)
+ RELAPACK_spotrf_rec(uplo, &n2, A_BR, ldA, info);
+ if (*info)
+ // re-express a failure index inside A_BR relative to the full matrix
+ *info += n1;
+}
diff --git a/relapack/src/ssygst.c b/relapack/src/ssygst.c
new file mode 100644
index 000000000..7f145cdec
--- /dev/null
+++ b/relapack/src/ssygst.c
@@ -0,0 +1,212 @@
+#include "relapack.h"
+#if XSYGST_ALLOW_MALLOC
+#include "stdlib.h"
+#endif
+
+static void RELAPACK_ssygst_rec(const int *, const char *, const int *,
+ float *, const int *, const float *, const int *,
+ float *, const int *, int *);
+
+
+/** SSYGST reduces a real symmetric-definite generalized eigenproblem to standard form.
+ *
+ * This routine is functionally equivalent to LAPACK's ssygst.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d8/d78/ssygst_8f.html
+ * */
+void RELAPACK_ssygst(
+ const int *itype, const char *uplo, const int *n,
+ float *A, const int *ldA, const float *B, const int *ldB,
+ int *info
+) {
+
+ // Check arguments
+ // (negative *info values mirror LAPACK ssygst's argument numbering)
+ const int lower = LAPACK(lsame)(uplo, "L");
+ const int upper = LAPACK(lsame)(uplo, "U");
+ *info = 0;
+ if (*itype < 1 || *itype > 3)
+ *info = -1;
+ else if (!lower && !upper)
+ *info = -2;
+ else if (*n < 0)
+ *info = -3;
+ else if (*ldA < MAX(1, *n))
+ *info = -5;
+ else if (*ldB < MAX(1, *n))
+ *info = -7;
+ if (*info) {
+ const int minfo = -*info;
+ LAPACK(xerbla)("SSYGST", &minfo);
+ return;
+ }
+
+ // Clean char * arguments
+ const char cleanuplo = lower ? 'L' : 'U';
+
+ // Allocate work space
+ // The buffer is optional: on malloc failure lWork stays 0 and the
+ // recursive kernel silently falls back to its buffer-free code path.
+ float *Work = NULL;
+ int lWork = 0;
+#if XSYGST_ALLOW_MALLOC
+ const int n1 = SREC_SPLIT(*n);
+ lWork = n1 * (*n - n1);
+ Work = malloc(lWork * sizeof(float));
+ if (!Work)
+ lWork = 0;
+#endif
+
+ // Recursive kernel
+ RELAPACK_ssygst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info);
+
+ // Free work space
+#if XSYGST_ALLOW_MALLOC
+ if (Work)
+ free(Work);
+#endif
+}
+
+
+/** ssygst's recursive compute kernel */
+static void RELAPACK_ssygst_rec(
+ const int *itype, const char *uplo, const int *n,
+ float *A, const int *ldA, const float *B, const int *ldB,
+ float *Work, const int *lWork, int *info
+) {
+
+ if (*n <= MAX(CROSSOVER_SSYGST, 1)) {
+ // Unblocked
+ LAPACK(ssygs2)(itype, uplo, n, A, ldA, B, ldB, info);
+ return;
+ }
+
+ // Constants
+ const float ZERO[] = { 0. };
+ const float ONE[] = { 1. };
+ const float MONE[] = { -1. };
+ const float HALF[] = { .5 };
+ const float MHALF[] = { -.5 };
+ const int iONE[] = { 1 };
+
+ // Loop iterator
+ int i;
+
+ // Splitting
+ const int n1 = SREC_SPLIT(*n);
+ const int n2 = *n - n1;
+
+ // A_TL A_TR
+ // A_BL A_BR
+ float *const A_TL = A;
+ float *const A_TR = A + *ldA * n1;
+ float *const A_BL = A + n1;
+ float *const A_BR = A + *ldA * n1 + n1;
+
+ // B_TL B_TR
+ // B_BL B_BR
+ const float *const B_TL = B;
+ const float *const B_TR = B + *ldB * n1;
+ const float *const B_BL = B + n1;
+ const float *const B_BR = B + *ldB * n1 + n1;
+
+ // recursion(A_TL, B_TL)
+ RELAPACK_ssygst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info);
+
+ // In each branch below: if the workspace is large enough, the +-1/2
+ // correction term is computed once into Work (T) and applied twice;
+ // otherwise it is recomputed via a second ssymm call.
+ if (*itype == 1)
+ if (*uplo == 'L') {
+ // A_BL = A_BL / B_TL'
+ BLAS(strsm)("R", "L", "T", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
+ if (*lWork > n2 * n1) {
+ // T = -1/2 * B_BL * A_TL
+ BLAS(ssymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2);
+ // A_BL = A_BL + T
+ for (i = 0; i < n1; i++)
+ BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
+ } else
+ // A_BL = A_BL - 1/2 B_BL * A_TL
+ BLAS(ssymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
+ // A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL'
+ BLAS(ssyr2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA);
+ if (*lWork > n2 * n1)
+ // A_BL = A_BL + T
+ for (i = 0; i < n1; i++)
+ BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
+ else
+ // A_BL = A_BL - 1/2 B_BL * A_TL
+ BLAS(ssymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
+ // A_BL = B_BR \ A_BL
+ BLAS(strsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
+ } else {
+ // A_TR = B_TL' \ A_TR
+ BLAS(strsm)("L", "U", "T", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
+ if (*lWork > n2 * n1) {
+ // T = -1/2 * A_TL * B_TR
+ BLAS(ssymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1);
+ // A_TR = A_TR + T
+ for (i = 0; i < n2; i++)
+ BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
+ } else
+ // A_TR = A_TR - 1/2 A_TL * B_TR
+ BLAS(ssymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
+ // A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR
+ BLAS(ssyr2k)("U", "T", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA);
+ if (*lWork > n2 * n1)
+ // A_TR = A_TR + T
+ for (i = 0; i < n2; i++)
+ BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
+ else
+ // A_TR = A_TR - 1/2 A_TL * B_TR
+ BLAS(ssymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
+ // A_TR = A_TR / B_BR
+ BLAS(strsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
+ }
+ else
+ if (*uplo == 'L') {
+ // A_BL = A_BL * B_TL
+ BLAS(strmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
+ if (*lWork > n2 * n1) {
+ // T = 1/2 * A_BR * B_BL
+ BLAS(ssymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2);
+ // A_BL = A_BL + T
+ for (i = 0; i < n1; i++)
+ BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
+ } else
+ // A_BL = A_BL + 1/2 A_BR * B_BL
+ BLAS(ssymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
+ // A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL
+ BLAS(ssyr2k)("L", "T", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA);
+ if (*lWork > n2 * n1)
+ // A_BL = A_BL + T
+ for (i = 0; i < n1; i++)
+ BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE);
+ else
+ // A_BL = A_BL + 1/2 A_BR * B_BL
+ BLAS(ssymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
+ // A_BL = B_BR * A_BL
+ BLAS(strmm)("L", "L", "T", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
+ } else {
+ // A_TR = B_TL * A_TR
+ BLAS(strmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
+ if (*lWork > n2 * n1) {
+ // T = 1/2 * B_TR * A_BR
+ BLAS(ssymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1);
+ // A_TR = A_TR + T
+ for (i = 0; i < n2; i++)
+ BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
+ } else
+ // A_TR = A_TR + 1/2 B_TR A_BR
+ BLAS(ssymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
+ // A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR'
+ BLAS(ssyr2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA);
+ if (*lWork > n2 * n1)
+ // A_TR = A_TR + T
+ for (i = 0; i < n2; i++)
+ BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE);
+ else
+ // A_TR = A_TR + 1/2 B_TR * A_BR
+ BLAS(ssymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
+ // A_TR = A_TR * B_BR
+ BLAS(strmm)("R", "U", "T", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
+ }
+
+ // recursion(A_BR, B_BR)
+ RELAPACK_ssygst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info);
+}
diff --git a/relapack/src/ssytrf.c b/relapack/src/ssytrf.c
new file mode 100644
index 000000000..8a4fad9f2
--- /dev/null
+++ b/relapack/src/ssytrf.c
@@ -0,0 +1,238 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include <stdlib.h>
+#endif
+
+static void RELAPACK_ssytrf_rec(const char *, const int *, const int *, int *,
+ float *, const int *, int *, float *, const int *, int *);
+
+
+/** SSYTRF computes the factorization of a real symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's ssytrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/da/de9/ssytrf_8f.html
+ * */
+void RELAPACK_ssytrf(
+ const char *uplo, const int *n,
+ float *A, const int *ldA, int *ipiv,
+ float *Work, const int *lWork, int *info
+) {
+
+ // Required work size
+ const int cleanlWork = *n * (*n / 2);
+ int minlWork = cleanlWork;
+#if XSYTRF_ALLOW_MALLOC
+ // With the malloc fallback enabled any user-supplied size is accepted.
+ minlWork = 1;
+#endif
+
+ // Check arguments
+ const int lower = LAPACK(lsame)(uplo, "L");
+ const int upper = LAPACK(lsame)(uplo, "U");
+ *info = 0;
+ if (!lower && !upper)
+ *info = -1;
+ else if (*n < 0)
+ *info = -2;
+ else if (*ldA < MAX(1, *n))
+ *info = -4;
+ else if (*lWork < minlWork && *lWork != -1)
+ *info = -7;
+ else if (*lWork == -1) {
+ // Work size query
+ // (LAPACK convention: return the required size in Work[0] as a float)
+ *Work = cleanlWork;
+ return;
+ }
+
+ // Ensure Work size
+ float *cleanWork = Work;
+#if XSYTRF_ALLOW_MALLOC
+ if (!*info && *lWork < cleanlWork) {
+ cleanWork = malloc(cleanlWork * sizeof(float));
+ if (!cleanWork)
+ *info = -7;
+ }
+#endif
+
+ if (*info) {
+ const int minfo = -*info;
+ LAPACK(xerbla)("SSYTRF", &minfo);
+ return;
+ }
+
+ // Clean char * arguments
+ const char cleanuplo = lower ? 'L' : 'U';
+
+ // Dummy arguments
+ // (nout receives the number of factored columns; unused at top level)
+ int nout;
+
+ // Recursive kernel
+ // (workspace leading dimension is n at the top recursion level)
+ RELAPACK_ssytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
+
+#if XSYTRF_ALLOW_MALLOC
+ if (cleanWork != Work)
+ free(cleanWork);
+#endif
+}
+
+
+/** ssytrf's recursive compute kernel */
+static void RELAPACK_ssytrf_rec(
+ const char *uplo, const int *n_full, const int *n, int *n_out,
+ float *A, const int *ldA, int *ipiv,
+ float *Work, const int *ldWork, int *info
+) {
+
+ // top recursion level?
+ const int top = *n_full == *n;
+
+ if (*n <= MAX(CROSSOVER_SSYTRF, 3)) {
+ // Unblocked
+ if (top) {
+ LAPACK(ssytf2)(uplo, n, A, ldA, ipiv, info);
+ *n_out = *n;
+ } else
+ RELAPACK_ssytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
+ return;
+ }
+
+ int info1, info2;
+
+ // Constants
+ const float ONE[] = { 1. };
+ const float MONE[] = { -1. };
+ const int iONE[] = { 1 };
+
+ // Loop iterator
+ int i;
+
+ // columns of the full panel that lie outside the current sub-problem
+ const int n_rest = *n_full - *n;
+
+ if (*uplo == 'L') {
+ // Splitting (setup)
+ int n1 = SREC_SPLIT(*n);
+ int n2 = *n - n1;
+
+ // Work_L *
+ float *const Work_L = Work;
+
+ // recursion(A_L)
+ int n1_out;
+ RELAPACK_ssytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
+ // n1_out may be one less than n1 if a 2x2 pivot straddles the split
+ n1 = n1_out;
+
+ // Splitting (continued)
+ n2 = *n - n1;
+ const int n_full2 = *n_full - n1;
+
+ // * *
+ // A_BL A_BR
+ // A_BL_B A_BR_B
+ float *const A_BL = A + n1;
+ float *const A_BR = A + *ldA * n1 + n1;
+ float *const A_BL_B = A + *n;
+ float *const A_BR_B = A + *ldA * n1 + *n;
+
+ // * *
+ // Work_BL Work_BR
+ // * *
+ // (top recursion level: use Work as Work_BR)
+ float *const Work_BL = Work + n1;
+ float *const Work_BR = top ? Work : Work + *ldWork * n1 + n1;
+ const int ldWork_BR = top ? n2 : *ldWork;
+
+ // ipiv_T
+ // ipiv_B
+ int *const ipiv_B = ipiv + n1;
+
+ // A_BR = A_BR - A_BL Work_BL'
+ RELAPACK_sgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
+ BLAS(sgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
+
+ // recursion(A_BR)
+ int n2_out;
+ RELAPACK_ssytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
+
+ if (n2_out != n2) {
+ // undo 1 column of updates
+ const int n_restp1 = n_rest + 1;
+
+ // last column of A_BR
+ float *const A_BR_r = A_BR + *ldA * n2_out + n2_out;
+
+ // last row of A_BL
+ float *const A_BL_b = A_BL + n2_out;
+
+ // last row of Work_BL
+ float *const Work_BL_b = Work_BL + n2_out;
+
+ // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
+ BLAS(sgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
+ }
+ n2 = n2_out;
+
+ // shift pivots
+ // (pivot indices from the trailing block are relative to A_BR)
+ for (i = 0; i < n2; i++)
+ if (ipiv_B[i] > 0)
+ ipiv_B[i] += n1;
+ else
+ ipiv_B[i] -= n1;
+
+ *info = info1 || info2;
+ *n_out = n1 + n2;
+ } else {
+ // Splitting (setup)
+ int n2 = SREC_SPLIT(*n);
+ int n1 = *n - n2;
+
+ // * Work_R
+ // (top recursion level: use Work as Work_R)
+ float *const Work_R = top ? Work : Work + *ldWork * n1;
+
+ // recursion(A_R)
+ int n2_out;
+ RELAPACK_ssytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
+ const int n2_diff = n2 - n2_out;
+ n2 = n2_out;
+
+ // Splitting (continued)
+ n1 = *n - n2;
+ const int n_full1 = *n_full - n2;
+
+ // * A_TL_T A_TR_T
+ // * A_TL A_TR
+ // * * *
+ float *const A_TL_T = A + *ldA * n_rest;
+ float *const A_TR_T = A + *ldA * (n_rest + n1);
+ float *const A_TL = A + *ldA * n_rest + n_rest;
+ float *const A_TR = A + *ldA * (n_rest + n1) + n_rest;
+
+ // Work_L *
+ // * Work_TR
+ // * *
+ // (top recursion level: Work_R was Work)
+ float *const Work_L = Work;
+ float *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest;
+ const int ldWork_L = top ? n1 : *ldWork;
+
+ // A_TL = A_TL - A_TR Work_TR'
+ RELAPACK_sgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
+ BLAS(sgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
+
+ // recursion(A_TL)
+ int n1_out;
+ RELAPACK_ssytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
+
+ if (n1_out != n1) {
+ // undo 1 column of updates
+ const int n_restp1 = n_rest + 1;
+
+ // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
+ BLAS(sgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
+ }
+ n1 = n1_out;
+
+ *info = info2 || info1;
+ *n_out = n1 + n2;
+ }
+}
diff --git a/relapack/src/ssytrf_rec2.c b/relapack/src/ssytrf_rec2.c
new file mode 100644
index 000000000..edc9269ec
--- /dev/null
+++ b/relapack/src/ssytrf_rec2.c
@@ -0,0 +1,351 @@
+/* -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static int c__1 = 1;
+static float c_b8 = -1.f;
+static float c_b9 = 1.f;
+
+/** SSYTRF_REC2 computes a partial factorization of a real symmetric matrix using the Bunch-Kaufman diagonal pivoting method.
+ *
+ * This routine is a minor modification of LAPACK's slasyf.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
+ * */
+/* Subroutine */ void RELAPACK_ssytrf_rec2(char *uplo, int *n, int *
+ nb, int *kb, float *a, int *lda, int *ipiv, float *w,
+ int *ldw, int *info, ftnlen uplo_len)
+{
+ /* System generated locals */
+ int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2;
+ float r__1, r__2, r__3;
+
+ /* Builtin functions */
+ double sqrt(double);
+
+ /* Local variables */
+/* NOTE(review): f2c emitted these locals as `static`, so this kernel is
+ not reentrant and not thread-safe -- confirm callers serialize access. */
+ static int j, k;
+ static float t, r1, d11, d21, d22;
+ static int jj, kk, jp, kp, kw, kkw, imax, jmax;
+ static float alpha;
+ extern logical lsame_(char *, char *, ftnlen, ftnlen);
+ extern /* Subroutine */ int sscal_(int *, float *, float *, int *),
+ sgemv_(char *, int *, int *, float *, float *, int *,
+ float *, int *, float *, float *, int *, ftnlen);
+ static int kstep;
+ extern /* Subroutine */ int scopy_(int *, float *, int *, float *,
+ int *), sswap_(int *, float *, int *, float *, int *
+ );
+ static float absakk;
+ extern int isamax_(int *, float *, int *);
+ static float colmax, rowmax;
+
+ /* Parameter adjustments */
+ /* (shift to 1-based Fortran indexing, as in all f2c translations) */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+ w_dim1 = *ldw;
+ w_offset = 1 + w_dim1;
+ w -= w_offset;
+
+ /* Function Body */
+ *info = 0;
+ /* alpha is the Bunch-Kaufman pivot threshold (sqrt(17)+1)/8 */
+ alpha = (sqrt(17.f) + 1.f) / 8.f;
+ if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+/* Upper case: factor columns from k = n downward. */
+ k = *n;
+L10:
+ kw = *nb + k - *n;
+ if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
+ goto L30;
+ }
+ scopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
+ if (k < *n) {
+ i__1 = *n - k;
+ sgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) * a_dim1 + 1],
+ lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b9, &w[kw *
+ w_dim1 + 1], &c__1, (ftnlen)12);
+ }
+ kstep = 1;
+ absakk = (r__1 = w[k + kw * w_dim1], dabs(r__1));
+ if (k > 1) {
+ i__1 = k - 1;
+ imax = isamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+ colmax = (r__1 = w[imax + kw * w_dim1], dabs(r__1));
+ } else {
+ colmax = 0.f;
+ }
+ if (dmax(absakk,colmax) == 0.f) {
+ if (*info == 0) {
+ *info = k;
+ }
+ kp = k;
+ } else {
+ if (absakk >= alpha * colmax) {
+ kp = k;
+ } else {
+ scopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
+ w_dim1 + 1], &c__1);
+ i__1 = k - imax;
+ scopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
+ 1 + (kw - 1) * w_dim1], &c__1);
+ if (k < *n) {
+ i__1 = *n - k;
+ sgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) *
+ a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
+ ldw, &c_b9, &w[(kw - 1) * w_dim1 + 1], &c__1, (
+ ftnlen)12);
+ }
+ i__1 = k - imax;
+ jmax = imax + isamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
+ &c__1);
+ rowmax = (r__1 = w[jmax + (kw - 1) * w_dim1], dabs(r__1));
+ if (imax > 1) {
+ i__1 = imax - 1;
+ jmax = isamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
+/* Computing MAX */
+ r__2 = rowmax, r__3 = (r__1 = w[jmax + (kw - 1) * w_dim1],
+ dabs(r__1));
+ rowmax = dmax(r__2,r__3);
+ }
+ if (absakk >= alpha * colmax * (colmax / rowmax)) {
+ kp = k;
+ } else if ((r__1 = w[imax + (kw - 1) * w_dim1], dabs(r__1)) >=
+ alpha * rowmax) {
+ kp = imax;
+ scopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
+ w_dim1 + 1], &c__1);
+ } else {
+ kp = imax;
+ kstep = 2;
+ }
+ }
+ kk = k - kstep + 1;
+ kkw = *nb + kk - *n;
+ if (kp != kk) {
+ a[kp + kp * a_dim1] = a[kk + kk * a_dim1];
+ i__1 = kk - 1 - kp;
+ scopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
+ 1) * a_dim1], lda);
+ if (kp > 1) {
+ i__1 = kp - 1;
+ scopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
+ + 1], &c__1);
+ }
+ if (k < *n) {
+ i__1 = *n - k;
+ sswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
+ + 1) * a_dim1], lda);
+ }
+ i__1 = *n - kk + 1;
+ sswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
+ w_dim1], ldw);
+ }
+ if (kstep == 1) {
+ scopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
+ c__1);
+ r1 = 1.f / a[k + k * a_dim1];
+ i__1 = k - 1;
+ sscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
+ } else {
+ if (k > 2) {
+ d21 = w[k - 1 + kw * w_dim1];
+ d11 = w[k + kw * w_dim1] / d21;
+ d22 = w[k - 1 + (kw - 1) * w_dim1] / d21;
+ t = 1.f / (d11 * d22 - 1.f);
+ d21 = t / d21;
+ i__1 = k - 2;
+ for (j = 1; j <= i__1; ++j) {
+ a[j + (k - 1) * a_dim1] = d21 * (d11 * w[j + (kw - 1)
+ * w_dim1] - w[j + kw * w_dim1]);
+ a[j + k * a_dim1] = d21 * (d22 * w[j + kw * w_dim1] -
+ w[j + (kw - 1) * w_dim1]);
+/* L20: */
+ }
+ }
+ a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1];
+ a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1];
+ a[k + k * a_dim1] = w[k + kw * w_dim1];
+ }
+ }
+ if (kstep == 1) {
+ ipiv[k] = kp;
+ } else {
+ ipiv[k] = -kp;
+ ipiv[k - 1] = -kp;
+ }
+ k -= kstep;
+ goto L10;
+L30:
+ j = k + 1;
+L60:
+ jj = j;
+ jp = ipiv[j];
+ if (jp < 0) {
+ jp = -jp;
+ ++j;
+ }
+ ++j;
+ if (jp != jj && j <= *n) {
+ i__1 = *n - j + 1;
+ sswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
+ }
+ if (j < *n) {
+ goto L60;
+ }
+ *kb = *n - k;
+ } else {
+/* Lower case: factor columns from k = 1 upward. */
+ k = 1;
+L70:
+ if ((k >= *nb && *nb < *n) || k > *n) {
+ goto L90;
+ }
+ i__1 = *n - k + 1;
+ scopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
+ i__1 = *n - k + 1;
+ i__2 = k - 1;
+ sgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1], lda, &w[k
+ + w_dim1], ldw, &c_b9, &w[k + k * w_dim1], &c__1, (ftnlen)12);
+ kstep = 1;
+ absakk = (r__1 = w[k + k * w_dim1], dabs(r__1));
+ if (k < *n) {
+ i__1 = *n - k;
+ imax = k + isamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
+ colmax = (r__1 = w[imax + k * w_dim1], dabs(r__1));
+ } else {
+ colmax = 0.f;
+ }
+ if (dmax(absakk,colmax) == 0.f) {
+ if (*info == 0) {
+ *info = k;
+ }
+ kp = k;
+ } else {
+ if (absakk >= alpha * colmax) {
+ kp = k;
+ } else {
+ i__1 = imax - k;
+ scopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
+ w_dim1], &c__1);
+ i__1 = *n - imax + 1;
+ scopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
+ 1) * w_dim1], &c__1);
+ i__1 = *n - k + 1;
+ i__2 = k - 1;
+ sgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1],
+ lda, &w[imax + w_dim1], ldw, &c_b9, &w[k + (k + 1) *
+ w_dim1], &c__1, (ftnlen)12);
+ i__1 = imax - k;
+ jmax = k - 1 + isamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
+ ;
+ rowmax = (r__1 = w[jmax + (k + 1) * w_dim1], dabs(r__1));
+ if (imax < *n) {
+ i__1 = *n - imax;
+ jmax = imax + isamax_(&i__1, &w[imax + 1 + (k + 1) *
+ w_dim1], &c__1);
+/* Computing MAX */
+ r__2 = rowmax, r__3 = (r__1 = w[jmax + (k + 1) * w_dim1],
+ dabs(r__1));
+ rowmax = dmax(r__2,r__3);
+ }
+ if (absakk >= alpha * colmax * (colmax / rowmax)) {
+ kp = k;
+ } else if ((r__1 = w[imax + (k + 1) * w_dim1], dabs(r__1)) >=
+ alpha * rowmax) {
+ kp = imax;
+ i__1 = *n - k + 1;
+ scopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
+ w_dim1], &c__1);
+ } else {
+ kp = imax;
+ kstep = 2;
+ }
+ }
+ kk = k + kstep - 1;
+ if (kp != kk) {
+ a[kp + kp * a_dim1] = a[kk + kk * a_dim1];
+ i__1 = kp - kk - 1;
+ scopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
+ 1) * a_dim1], lda);
+ if (kp < *n) {
+ i__1 = *n - kp;
+ scopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
+ + kp * a_dim1], &c__1);
+ }
+ if (k > 1) {
+ i__1 = k - 1;
+ sswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
+ }
+ sswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
+ }
+ if (kstep == 1) {
+ i__1 = *n - k + 1;
+ scopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
+ c__1);
+ if (k < *n) {
+ r1 = 1.f / a[k + k * a_dim1];
+ i__1 = *n - k;
+ sscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
+ }
+ } else {
+ if (k < *n - 1) {
+ d21 = w[k + 1 + k * w_dim1];
+ d11 = w[k + 1 + (k + 1) * w_dim1] / d21;
+ d22 = w[k + k * w_dim1] / d21;
+ t = 1.f / (d11 * d22 - 1.f);
+ d21 = t / d21;
+ i__1 = *n;
+ for (j = k + 2; j <= i__1; ++j) {
+ a[j + k * a_dim1] = d21 * (d11 * w[j + k * w_dim1] -
+ w[j + (k + 1) * w_dim1]);
+ a[j + (k + 1) * a_dim1] = d21 * (d22 * w[j + (k + 1) *
+ w_dim1] - w[j + k * w_dim1]);
+/* L80: */
+ }
+ }
+ a[k + k * a_dim1] = w[k + k * w_dim1];
+ a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1];
+ a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1];
+ }
+ }
+ if (kstep == 1) {
+ ipiv[k] = kp;
+ } else {
+ ipiv[k] = -kp;
+ ipiv[k + 1] = -kp;
+ }
+ k += kstep;
+ goto L70;
+L90:
+ j = k - 1;
+L120:
+ jj = j;
+ jp = ipiv[j];
+ if (jp < 0) {
+ jp = -jp;
+ --j;
+ }
+ --j;
+ if (jp != jj && j >= 1) {
+ sswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
+ }
+ if (j > 1) {
+ goto L120;
+ }
+ *kb = k - 1;
+ }
+ return;
+}
diff --git a/relapack/src/ssytrf_rook.c b/relapack/src/ssytrf_rook.c
new file mode 100644
index 000000000..040df2484
--- /dev/null
+++ b/relapack/src/ssytrf_rook.c
@@ -0,0 +1,236 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include <stdlib.h>
+#endif
+
+static void RELAPACK_ssytrf_rook_rec(const char *, const int *, const int *, int *,
+ float *, const int *, int *, float *, const int *, int *);
+
+
+/** SSYTRF_ROOK computes the factorization of a real symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's ssytrf_rook.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/de/da4/ssytrf__rook_8f.html
+ * */
+void RELAPACK_ssytrf_rook(
+ const char *uplo, const int *n,
+ float *A, const int *ldA, int *ipiv,
+ float *Work, const int *lWork, int *info
+) {
+
+ // Required work size
+ const int cleanlWork = *n * (*n / 2);
+ int minlWork = cleanlWork;
+#if XSYTRF_ALLOW_MALLOC
+ // With the malloc fallback enabled any user-supplied size is accepted.
+ minlWork = 1;
+#endif
+
+ // Check arguments
+ const int lower = LAPACK(lsame)(uplo, "L");
+ const int upper = LAPACK(lsame)(uplo, "U");
+ *info = 0;
+ if (!lower && !upper)
+ *info = -1;
+ else if (*n < 0)
+ *info = -2;
+ else if (*ldA < MAX(1, *n))
+ *info = -4;
+ else if (*lWork < minlWork && *lWork != -1)
+ *info = -7;
+ else if (*lWork == -1) {
+ // Work size query
+ // (LAPACK convention: return the required size in Work[0] as a float)
+ *Work = cleanlWork;
+ return;
+ }
+
+ // Ensure Work size
+ float *cleanWork = Work;
+#if XSYTRF_ALLOW_MALLOC
+ if (!*info && *lWork < cleanlWork) {
+ cleanWork = malloc(cleanlWork * sizeof(float));
+ if (!cleanWork)
+ *info = -7;
+ }
+#endif
+
+ if (*info) {
+ const int minfo = -*info;
+ // Report the actual routine name (was "SSYTRF", which misattributed
+ // errors to the non-rook variant; LAPACK uses 'SSYTRF_ROOK' here).
+ LAPACK(xerbla)("SSYTRF_ROOK", &minfo);
+ return;
+ }
+
+ // Clean char * arguments
+ const char cleanuplo = lower ? 'L' : 'U';
+
+ // Dummy argument
+ // (nout receives the number of factored columns; unused at top level)
+ int nout;
+
+ // Recursive kernel
+ // (workspace leading dimension is n at the top recursion level)
+ RELAPACK_ssytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
+
+#if XSYTRF_ALLOW_MALLOC
+ if (cleanWork != Work)
+ free(cleanWork);
+#endif
+}
+
+
+/** ssytrf_rook's recursive compute kernel */
+static void RELAPACK_ssytrf_rook_rec(
+ const char *uplo, const int *n_full, const int *n, int *n_out,
+ float *A, const int *ldA, int *ipiv,
+ float *Work, const int *ldWork, int *info
+) {
+
+ // top recursion level?
+ const int top = *n_full == *n;
+
+ if (*n <= MAX(CROSSOVER_SSYTRF_ROOK, 3)) {
+ // Unblocked
+ if (top) {
+ LAPACK(ssytf2)(uplo, n, A, ldA, ipiv, info);
+ *n_out = *n;
+ } else
+ RELAPACK_ssytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
+ return;
+ }
+
+ int info1, info2;
+
+ // Constants
+ const float ONE[] = { 1. };
+ const float MONE[] = { -1. };
+ const int iONE[] = { 1 };
+
+ // columns of the full panel that lie outside the current sub-problem
+ const int n_rest = *n_full - *n;
+
+ if (*uplo == 'L') {
+ // Splitting (setup)
+ int n1 = SREC_SPLIT(*n);
+ int n2 = *n - n1;
+
+ // Work_L *
+ float *const Work_L = Work;
+
+ // recursion(A_L)
+ int n1_out;
+ RELAPACK_ssytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
+ // n1_out may be one less than n1 if a 2x2 pivot straddles the split
+ n1 = n1_out;
+
+ // Splitting (continued)
+ n2 = *n - n1;
+ const int n_full2 = *n_full - n1;
+
+ // * *
+ // A_BL A_BR
+ // A_BL_B A_BR_B
+ float *const A_BL = A + n1;
+ float *const A_BR = A + *ldA * n1 + n1;
+ float *const A_BL_B = A + *n;
+ float *const A_BR_B = A + *ldA * n1 + *n;
+
+ // * *
+ // Work_BL Work_BR
+ // * *
+ // (top recursion level: use Work as Work_BR)
+ float *const Work_BL = Work + n1;
+ float *const Work_BR = top ? Work : Work + *ldWork * n1 + n1;
+ const int ldWork_BR = top ? n2 : *ldWork;
+
+ // ipiv_T
+ // ipiv_B
+ int *const ipiv_B = ipiv + n1;
+
+ // A_BR = A_BR - A_BL Work_BL'
+ RELAPACK_sgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
+ BLAS(sgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
+
+ // recursion(A_BR)
+ int n2_out;
+ RELAPACK_ssytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
+
+ if (n2_out != n2) {
+ // undo 1 column of updates
+ const int n_restp1 = n_rest + 1;
+
+ // last column of A_BR
+ float *const A_BR_r = A_BR + *ldA * n2_out + n2_out;
+
+ // last row of A_BL
+ float *const A_BL_b = A_BL + n2_out;
+
+ // last row of Work_BL
+ float *const Work_BL_b = Work_BL + n2_out;
+
+ // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
+ BLAS(sgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
+ }
+ n2 = n2_out;
+
+ // shift pivots
+ // (pivot indices from the trailing block are relative to A_BR)
+ int i;
+ for (i = 0; i < n2; i++)
+ if (ipiv_B[i] > 0)
+ ipiv_B[i] += n1;
+ else
+ ipiv_B[i] -= n1;
+
+ *info = info1 || info2;
+ *n_out = n1 + n2;
+ } else {
+ // Splitting (setup)
+ int n2 = SREC_SPLIT(*n);
+ int n1 = *n - n2;
+
+ // * Work_R
+ // (top recursion level: use Work as Work_R)
+ float *const Work_R = top ? Work : Work + *ldWork * n1;
+
+ // recursion(A_R)
+ int n2_out;
+ RELAPACK_ssytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
+ const int n2_diff = n2 - n2_out;
+ n2 = n2_out;
+
+ // Splitting (continued)
+ n1 = *n - n2;
+ const int n_full1 = *n_full - n2;
+
+ // * A_TL_T A_TR_T
+ // * A_TL A_TR
+ // * * *
+ float *const A_TL_T = A + *ldA * n_rest;
+ float *const A_TR_T = A + *ldA * (n_rest + n1);
+ float *const A_TL = A + *ldA * n_rest + n_rest;
+ float *const A_TR = A + *ldA * (n_rest + n1) + n_rest;
+
+ // Work_L *
+ // * Work_TR
+ // * *
+ // (top recursion level: Work_R was Work)
+ float *const Work_L = Work;
+ float *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest;
+ const int ldWork_L = top ? n1 : *ldWork;
+
+ // A_TL = A_TL - A_TR Work_TR'
+ RELAPACK_sgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
+ BLAS(sgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
+
+ // recursion(A_TL)
+ int n1_out;
+ RELAPACK_ssytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
+
+ if (n1_out != n1) {
+ // undo 1 column of updates
+ const int n_restp1 = n_rest + 1;
+
+ // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
+ BLAS(sgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
+ }
+ n1 = n1_out;
+
+ *info = info2 || info1;
+ *n_out = n1 + n2;
+ }
+}
diff --git a/relapack/src/ssytrf_rook_rec2.c b/relapack/src/ssytrf_rook_rec2.c
new file mode 100644
index 000000000..3308826d7
--- /dev/null
+++ b/relapack/src/ssytrf_rook_rec2.c
@@ -0,0 +1,451 @@
+/* -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static int c__1 = 1;
+static float c_b9 = -1.f;
+static float c_b10 = 1.f;
+
+/** SSYTRF_ROOK_REC2 computes a partial factorization of a real symmetric matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+ *
+ * This routine is a minor modification of LAPACK's slasyf_rook.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
+ * */
+/* Subroutine */ void RELAPACK_ssytrf_rook_rec2(char *uplo, int *n,
+ int *nb, int *kb, float *a, int *lda, int *ipiv, float *
+ w, int *ldw, int *info, ftnlen uplo_len)
+{
+ /* System generated locals */
+ int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2;
+ float r__1;
+
+ /* Builtin functions */
+ double sqrt(double);
+
+ /* Local variables */
+ static int j, k, p;
+ static float t, r1, d11, d12, d21, d22;
+ static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
+ static logical done;
+ static int imax, jmax;
+ static float alpha;
+ extern logical lsame_(char *, char *, ftnlen, ftnlen);
+ extern /* Subroutine */ int sscal_(int *, float *, float *, int *);
+ static float sfmin;
+ static int itemp;
+ extern /* Subroutine */ int sgemv_(char *, int *, int *, float *,
+ float *, int *, float *, int *, float *, float *, int *,
+ ftnlen);
+ static int kstep;
+ static float stemp;
+ extern /* Subroutine */ int scopy_(int *, float *, int *, float *,
+ int *), sswap_(int *, float *, int *, float *, int *
+ );
+ static float absakk;
+ extern double slamch_(char *, ftnlen);
+ extern int isamax_(int *, float *, int *);
+ static float colmax, rowmax;
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+ w_dim1 = *ldw;
+ w_offset = 1 + w_dim1;
+ w -= w_offset;
+
+ /* Function Body */
+ *info = 0;
+ alpha = (sqrt(17.f) + 1.f) / 8.f;
+ sfmin = slamch_("S", (ftnlen)1);
+ if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+ k = *n;
+L10:
+ kw = *nb + k - *n;
+ if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
+ goto L30;
+ }
+ kstep = 1;
+ p = k;
+ scopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
+ if (k < *n) {
+ i__1 = *n - k;
+ sgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) * a_dim1 + 1],
+ lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b10, &w[kw *
+ w_dim1 + 1], &c__1, (ftnlen)12);
+ }
+ absakk = (r__1 = w[k + kw * w_dim1], dabs(r__1));
+ if (k > 1) {
+ i__1 = k - 1;
+ imax = isamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+ colmax = (r__1 = w[imax + kw * w_dim1], dabs(r__1));
+ } else {
+ colmax = 0.f;
+ }
+ if (dmax(absakk,colmax) == 0.f) {
+ if (*info == 0) {
+ *info = k;
+ }
+ kp = k;
+ scopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
+ } else {
+ if (! (absakk < alpha * colmax)) {
+ kp = k;
+ } else {
+ done = FALSE_;
+L12:
+ scopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
+ w_dim1 + 1], &c__1);
+ i__1 = k - imax;
+ scopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
+ 1 + (kw - 1) * w_dim1], &c__1);
+ if (k < *n) {
+ i__1 = *n - k;
+ sgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) *
+ a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
+ ldw, &c_b10, &w[(kw - 1) * w_dim1 + 1], &c__1, (
+ ftnlen)12);
+ }
+ if (imax != k) {
+ i__1 = k - imax;
+ jmax = imax + isamax_(&i__1, &w[imax + 1 + (kw - 1) *
+ w_dim1], &c__1);
+ rowmax = (r__1 = w[jmax + (kw - 1) * w_dim1], dabs(r__1));
+ } else {
+ rowmax = 0.f;
+ }
+ if (imax > 1) {
+ i__1 = imax - 1;
+ itemp = isamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
+ stemp = (r__1 = w[itemp + (kw - 1) * w_dim1], dabs(r__1));
+ if (stemp > rowmax) {
+ rowmax = stemp;
+ jmax = itemp;
+ }
+ }
+ if (! ((r__1 = w[imax + (kw - 1) * w_dim1], dabs(r__1)) <
+ alpha * rowmax)) {
+ kp = imax;
+ scopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
+ w_dim1 + 1], &c__1);
+ done = TRUE_;
+ } else if (p == jmax || rowmax <= colmax) {
+ kp = imax;
+ kstep = 2;
+ done = TRUE_;
+ } else {
+ p = imax;
+ colmax = rowmax;
+ imax = jmax;
+ scopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
+ w_dim1 + 1], &c__1);
+ }
+ if (! done) {
+ goto L12;
+ }
+ }
+ kk = k - kstep + 1;
+ kkw = *nb + kk - *n;
+ if (kstep == 2 && p != k) {
+ i__1 = k - p;
+ scopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) *
+ a_dim1], lda);
+ scopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], &
+ c__1);
+ i__1 = *n - k + 1;
+ sswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1],
+ lda);
+ i__1 = *n - kk + 1;
+ sswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1],
+ ldw);
+ }
+ if (kp != kk) {
+ a[kp + k * a_dim1] = a[kk + k * a_dim1];
+ i__1 = k - 1 - kp;
+ scopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
+ 1) * a_dim1], lda);
+ scopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], &
+ c__1);
+ i__1 = *n - kk + 1;
+ sswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1],
+ lda);
+ i__1 = *n - kk + 1;
+ sswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
+ w_dim1], ldw);
+ }
+ if (kstep == 1) {
+ scopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
+ c__1);
+ if (k > 1) {
+ if ((r__1 = a[k + k * a_dim1], dabs(r__1)) >= sfmin) {
+ r1 = 1.f / a[k + k * a_dim1];
+ i__1 = k - 1;
+ sscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
+ } else if (a[k + k * a_dim1] != 0.f) {
+ i__1 = k - 1;
+ for (ii = 1; ii <= i__1; ++ii) {
+ a[ii + k * a_dim1] /= a[k + k * a_dim1];
+/* L14: */
+ }
+ }
+ }
+ } else {
+ if (k > 2) {
+ d12 = w[k - 1 + kw * w_dim1];
+ d11 = w[k + kw * w_dim1] / d12;
+ d22 = w[k - 1 + (kw - 1) * w_dim1] / d12;
+ t = 1.f / (d11 * d22 - 1.f);
+ i__1 = k - 2;
+ for (j = 1; j <= i__1; ++j) {
+ a[j + (k - 1) * a_dim1] = t * ((d11 * w[j + (kw - 1) *
+ w_dim1] - w[j + kw * w_dim1]) / d12);
+ a[j + k * a_dim1] = t * ((d22 * w[j + kw * w_dim1] -
+ w[j + (kw - 1) * w_dim1]) / d12);
+/* L20: */
+ }
+ }
+ a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1];
+ a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1];
+ a[k + k * a_dim1] = w[k + kw * w_dim1];
+ }
+ }
+ if (kstep == 1) {
+ ipiv[k] = kp;
+ } else {
+ ipiv[k] = -p;
+ ipiv[k - 1] = -kp;
+ }
+ k -= kstep;
+ goto L10;
+L30:
+ j = k + 1;
+L60:
+ kstep = 1;
+ jp1 = 1;
+ jj = j;
+ jp2 = ipiv[j];
+ if (jp2 < 0) {
+ jp2 = -jp2;
+ ++j;
+ jp1 = -ipiv[j];
+ kstep = 2;
+ }
+ ++j;
+ if (jp2 != jj && j <= *n) {
+ i__1 = *n - j + 1;
+ sswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
+ ;
+ }
+ jj = j - 1;
+ if (jp1 != jj && kstep == 2) {
+ i__1 = *n - j + 1;
+ sswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
+ ;
+ }
+ if (j <= *n) {
+ goto L60;
+ }
+ *kb = *n - k;
+ } else {
+ k = 1;
+L70:
+ if ((k >= *nb && *nb < *n) || k > *n) {
+ goto L90;
+ }
+ kstep = 1;
+ p = k;
+ i__1 = *n - k + 1;
+ scopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
+ if (k > 1) {
+ i__1 = *n - k + 1;
+ i__2 = k - 1;
+ sgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1], lda, &
+ w[k + w_dim1], ldw, &c_b10, &w[k + k * w_dim1], &c__1, (
+ ftnlen)12);
+ }
+ absakk = (r__1 = w[k + k * w_dim1], dabs(r__1));
+ if (k < *n) {
+ i__1 = *n - k;
+ imax = k + isamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
+ colmax = (r__1 = w[imax + k * w_dim1], dabs(r__1));
+ } else {
+ colmax = 0.f;
+ }
+ if (dmax(absakk,colmax) == 0.f) {
+ if (*info == 0) {
+ *info = k;
+ }
+ kp = k;
+ i__1 = *n - k + 1;
+ scopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
+ c__1);
+ } else {
+ if (! (absakk < alpha * colmax)) {
+ kp = k;
+ } else {
+ done = FALSE_;
+L72:
+ i__1 = imax - k;
+ scopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
+ w_dim1], &c__1);
+ i__1 = *n - imax + 1;
+ scopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
+ 1) * w_dim1], &c__1);
+ if (k > 1) {
+ i__1 = *n - k + 1;
+ i__2 = k - 1;
+ sgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1]
+ , lda, &w[imax + w_dim1], ldw, &c_b10, &w[k + (k
+ + 1) * w_dim1], &c__1, (ftnlen)12);
+ }
+ if (imax != k) {
+ i__1 = imax - k;
+ jmax = k - 1 + isamax_(&i__1, &w[k + (k + 1) * w_dim1], &
+ c__1);
+ rowmax = (r__1 = w[jmax + (k + 1) * w_dim1], dabs(r__1));
+ } else {
+ rowmax = 0.f;
+ }
+ if (imax < *n) {
+ i__1 = *n - imax;
+ itemp = imax + isamax_(&i__1, &w[imax + 1 + (k + 1) *
+ w_dim1], &c__1);
+ stemp = (r__1 = w[itemp + (k + 1) * w_dim1], dabs(r__1));
+ if (stemp > rowmax) {
+ rowmax = stemp;
+ jmax = itemp;
+ }
+ }
+ if (! ((r__1 = w[imax + (k + 1) * w_dim1], dabs(r__1)) <
+ alpha * rowmax)) {
+ kp = imax;
+ i__1 = *n - k + 1;
+ scopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
+ w_dim1], &c__1);
+ done = TRUE_;
+ } else if (p == jmax || rowmax <= colmax) {
+ kp = imax;
+ kstep = 2;
+ done = TRUE_;
+ } else {
+ p = imax;
+ colmax = rowmax;
+ imax = jmax;
+ i__1 = *n - k + 1;
+ scopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
+ w_dim1], &c__1);
+ }
+ if (! done) {
+ goto L72;
+ }
+ }
+ kk = k + kstep - 1;
+ if (kstep == 2 && p != k) {
+ i__1 = p - k;
+ scopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1],
+ lda);
+ i__1 = *n - p + 1;
+ scopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], &
+ c__1);
+ sswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda);
+ sswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw);
+ }
+ if (kp != kk) {
+ a[kp + k * a_dim1] = a[kk + k * a_dim1];
+ i__1 = kp - k - 1;
+ scopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1)
+ * a_dim1], lda);
+ i__1 = *n - kp + 1;
+ scopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp *
+ a_dim1], &c__1);
+ sswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
+ sswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
+ }
+ if (kstep == 1) {
+ i__1 = *n - k + 1;
+ scopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
+ c__1);
+ if (k < *n) {
+ if ((r__1 = a[k + k * a_dim1], dabs(r__1)) >= sfmin) {
+ r1 = 1.f / a[k + k * a_dim1];
+ i__1 = *n - k;
+ sscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
+ } else if (a[k + k * a_dim1] != 0.f) {
+ i__1 = *n;
+ for (ii = k + 1; ii <= i__1; ++ii) {
+ a[ii + k * a_dim1] /= a[k + k * a_dim1];
+/* L74: */
+ }
+ }
+ }
+ } else {
+ if (k < *n - 1) {
+ d21 = w[k + 1 + k * w_dim1];
+ d11 = w[k + 1 + (k + 1) * w_dim1] / d21;
+ d22 = w[k + k * w_dim1] / d21;
+ t = 1.f / (d11 * d22 - 1.f);
+ i__1 = *n;
+ for (j = k + 2; j <= i__1; ++j) {
+ a[j + k * a_dim1] = t * ((d11 * w[j + k * w_dim1] - w[
+ j + (k + 1) * w_dim1]) / d21);
+ a[j + (k + 1) * a_dim1] = t * ((d22 * w[j + (k + 1) *
+ w_dim1] - w[j + k * w_dim1]) / d21);
+/* L80: */
+ }
+ }
+ a[k + k * a_dim1] = w[k + k * w_dim1];
+ a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1];
+ a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1];
+ }
+ }
+ if (kstep == 1) {
+ ipiv[k] = kp;
+ } else {
+ ipiv[k] = -p;
+ ipiv[k + 1] = -kp;
+ }
+ k += kstep;
+ goto L70;
+L90:
+ j = k - 1;
+L120:
+ kstep = 1;
+ jp1 = 1;
+ jj = j;
+ jp2 = ipiv[j];
+ if (jp2 < 0) {
+ jp2 = -jp2;
+ --j;
+ jp1 = -ipiv[j];
+ kstep = 2;
+ }
+ --j;
+ if (jp2 != jj && j >= 1) {
+ sswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
+ }
+ jj = j + 1;
+ if (jp1 != jj && kstep == 2) {
+ sswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
+ }
+ if (j >= 1) {
+ goto L120;
+ }
+ *kb = k - 1;
+ }
+ return;
+}
diff --git a/relapack/src/stgsyl.c b/relapack/src/stgsyl.c
new file mode 100644
index 000000000..1870fb928
--- /dev/null
+++ b/relapack/src/stgsyl.c
@@ -0,0 +1,274 @@
+#include "relapack.h"
+#include <math.h>
+
+static void RELAPACK_stgsyl_rec(const char *, const int *, const int *,
+ const int *, const float *, const int *, const float *, const int *,
+ float *, const int *, const float *, const int *, const float *,
+ const int *, float *, const int *, float *, float *, float *, int *, int *,
+ int *);
+
+
+/** STGSYL solves the generalized Sylvester equation.
+ *
+ * This routine is functionally equivalent to LAPACK's stgsyl.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/dc/d67/stgsyl_8f.html
+ * */
+void RELAPACK_stgsyl(
+ const char *trans, const int *ijob, const int *m, const int *n,
+ const float *A, const int *ldA, const float *B, const int *ldB,
+ float *C, const int *ldC,
+ const float *D, const int *ldD, const float *E, const int *ldE,
+ float *F, const int *ldF,
+ float *scale, float *dif,
+ float *Work, const int *lWork, int *iWork, int *info
+) {
+
+ // Parse arguments
+ const int notran = LAPACK(lsame)(trans, "N");
+ const int tran = LAPACK(lsame)(trans, "T");
+
+ // Compute work buffer size
+ int lwmin = 1;
+ if (notran && (*ijob == 1 || *ijob == 2))
+ lwmin = MAX(1, 2 * *m * *n);
+ *info = 0;
+
+ // Check arguments
+ if (!tran && !notran)
+ *info = -1;
+ else if (notran && (*ijob < 0 || *ijob > 4))
+ *info = -2;
+ else if (*m <= 0)
+ *info = -3;
+ else if (*n <= 0)
+ *info = -4;
+ else if (*ldA < MAX(1, *m))
+ *info = -6;
+ else if (*ldB < MAX(1, *n))
+ *info = -8;
+ else if (*ldC < MAX(1, *m))
+ *info = -10;
+ else if (*ldD < MAX(1, *m))
+ *info = -12;
+ else if (*ldE < MAX(1, *n))
+ *info = -14;
+ else if (*ldF < MAX(1, *m))
+ *info = -16;
+ else if (*lWork < lwmin && *lWork != -1)
+ *info = -20;
+ if (*info) {
+ const int minfo = -*info;
+ LAPACK(xerbla)("STGSYL", &minfo);
+ return;
+ }
+
+ if (*lWork == -1) {
+ // Work size query
+ *Work = lwmin;
+ return;
+ }
+
+ // Clean char * arguments
+ const char cleantrans = notran ? 'N' : 'T';
+
+ // Constant
+ const float ZERO[] = { 0. };
+
+ int isolve = 1;
+ int ifunc = 0;
+ if (notran) {
+ if (*ijob >= 3) {
+ ifunc = *ijob - 2;
+ LAPACK(slaset)("F", m, n, ZERO, ZERO, C, ldC);
+ LAPACK(slaset)("F", m, n, ZERO, ZERO, F, ldF);
+ } else if (*ijob >= 1)
+ isolve = 2;
+ }
+
+ float scale2;
+ int iround;
+ for (iround = 1; iround <= isolve; iround++) {
+ *scale = 1;
+ float dscale = 0;
+ float dsum = 1;
+ int pq;
+ RELAPACK_stgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, iWork, &pq, info);
+ if (dscale != 0) {
+ if (*ijob == 1 || *ijob == 3)
+ *dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum));
+ else
+ *dif = sqrt(pq) / (dscale * sqrt(dsum));
+ }
+ if (isolve == 2) {
+ if (iround == 1) {
+ if (notran)
+ ifunc = *ijob;
+ scale2 = *scale;
+ LAPACK(slacpy)("F", m, n, C, ldC, Work, m);
+ LAPACK(slacpy)("F", m, n, F, ldF, Work + *m * *n, m);
+ LAPACK(slaset)("F", m, n, ZERO, ZERO, C, ldC);
+ LAPACK(slaset)("F", m, n, ZERO, ZERO, F, ldF);
+ } else {
+ LAPACK(slacpy)("F", m, n, Work, m, C, ldC);
+ LAPACK(slacpy)("F", m, n, Work + *m * *n, m, F, ldF);
+ *scale = scale2;
+ }
+ }
+ }
+}
+
+
+/** stgsyl's recursive compute kernel */
+static void RELAPACK_stgsyl_rec(
+ const char *trans, const int *ifunc, const int *m, const int *n,
+ const float *A, const int *ldA, const float *B, const int *ldB,
+ float *C, const int *ldC,
+ const float *D, const int *ldD, const float *E, const int *ldE,
+ float *F, const int *ldF,
+ float *scale, float *dsum, float *dscale,
+ int *iWork, int *pq, int *info
+) {
+
+ if (*m <= MAX(CROSSOVER_STGSYL, 1) && *n <= MAX(CROSSOVER_STGSYL, 1)) {
+ // Unblocked
+ LAPACK(stgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, iWork, pq, info);
+ return;
+ }
+
+ // Constants
+ const float ONE[] = { 1. };
+ const float MONE[] = { -1. };
+ const int iONE[] = { 1 };
+
+ // Outputs
+ float scale1[] = { 1. };
+ float scale2[] = { 1. };
+ int info1[] = { 0 };
+ int info2[] = { 0 };
+
+ if (*m > *n) {
+ // Splitting
+ int m1 = SREC_SPLIT(*m);
+ if (A[m1 + *ldA * (m1 - 1)])
+ m1++;
+ const int m2 = *m - m1;
+
+ // A_TL A_TR
+ // 0 A_BR
+ const float *const A_TL = A;
+ const float *const A_TR = A + *ldA * m1;
+ const float *const A_BR = A + *ldA * m1 + m1;
+
+ // C_T
+ // C_B
+ float *const C_T = C;
+ float *const C_B = C + m1;
+
+ // D_TL D_TR
+ // 0 D_BR
+ const float *const D_TL = D;
+ const float *const D_TR = D + *ldD * m1;
+ const float *const D_BR = D + *ldD * m1 + m1;
+
+ // F_T
+ // F_B
+ float *const F_T = F;
+ float *const F_B = F + m1;
+
+ if (*trans == 'N') {
+ // recursion(A_BR, B, C_B, D_BR, E, F_B)
+ RELAPACK_stgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, iWork, pq, info1);
+ // C_T = C_T - A_TR * C_B
+ BLAS(sgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
+ // F_T = F_T - D_TR * C_B
+ BLAS(sgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF);
+ // recursion(A_TL, B, C_T, D_TL, E, F_T)
+ RELAPACK_stgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, iWork, pq, info2);
+ // apply scale
+ if (scale2[0] != 1) {
+ LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
+ LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info);
+ }
+ } else {
+ // recursion(A_TL, B, C_T, D_TL, E, F_T)
+ RELAPACK_stgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, iWork, pq, info1);
+ // apply scale
+ if (scale1[0] != 1)
+ LAPACK(slascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info);
+ // C_B = C_B - A_TR^H * C_T
+ BLAS(sgemm)("T", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
+ // C_B = C_B - D_TR^H * F_T
+ BLAS(sgemm)("T", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC);
+ // recursion(A_BR, B, C_B, D_BR, E, F_B)
+ RELAPACK_stgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, iWork, pq, info2);
+ // apply scale
+ if (scale2[0] != 1) {
+ LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
+ LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info);
+ }
+ }
+ } else {
+ // Splitting
+ int n1 = SREC_SPLIT(*n);
+ if (B[n1 + *ldB * (n1 - 1)])
+ n1++;
+ const int n2 = *n - n1;
+
+ // B_TL B_TR
+ // 0 B_BR
+ const float *const B_TL = B;
+ const float *const B_TR = B + *ldB * n1;
+ const float *const B_BR = B + *ldB * n1 + n1;
+
+ // C_L C_R
+ float *const C_L = C;
+ float *const C_R = C + *ldC * n1;
+
+ // E_TL E_TR
+ // 0 E_BR
+ const float *const E_TL = E;
+ const float *const E_TR = E + *ldE * n1;
+ const float *const E_BR = E + *ldE * n1 + n1;
+
+ // F_L F_R
+ float *const F_L = F;
+ float *const F_R = F + *ldF * n1;
+
+ if (*trans == 'N') {
+ // recursion(A, B_TL, C_L, D, E_TL, F_L)
+ RELAPACK_stgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, iWork, pq, info1);
+ // C_R = C_R + F_L * B_TR
+ BLAS(sgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC);
+ // F_R = F_R + F_L * E_TR
+ BLAS(sgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF);
+ // recursion(A, B_BR, C_R, D, E_BR, F_R)
+ RELAPACK_stgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, iWork, pq, info2);
+ // apply scale
+ if (scale2[0] != 1) {
+ LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
+ LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info);
+ }
+ } else {
+ // recursion(A, B_BR, C_R, D, E_BR, F_R)
+ RELAPACK_stgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, iWork, pq, info1);
+ // apply scale
+ if (scale1[0] != 1)
+ LAPACK(slascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info);
+ // F_L = F_L + C_R * B_TR
+ BLAS(sgemm)("N", "T", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF);
+ // F_L = F_L + F_R * E_TR
+ BLAS(sgemm)("N", "T", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF);
+ // recursion(A, B_TL, C_L, D, E_TL, F_L)
+ RELAPACK_stgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, iWork, pq, info2);
+ // apply scale
+ if (scale2[0] != 1) {
+ LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
+ LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info);
+ }
+ }
+ }
+
+ *scale = scale1[0] * scale2[0];
+ *info = info1[0] || info2[0];
+}
diff --git a/relapack/src/strsyl.c b/relapack/src/strsyl.c
new file mode 100644
index 000000000..83947ef1a
--- /dev/null
+++ b/relapack/src/strsyl.c
@@ -0,0 +1,169 @@
+#include "relapack.h"
+
+static void RELAPACK_strsyl_rec(const char *, const char *, const int *,
+ const int *, const int *, const float *, const int *, const float *,
+ const int *, float *, const int *, float *, int *);
+
+
+/** STRSYL solves the real Sylvester matrix equation.
+ *
+ * This routine is functionally equivalent to LAPACK's strsyl.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d4/d7d/strsyl_8f.html
+ * */
+void RELAPACK_strsyl(
+ const char *tranA, const char *tranB, const int *isgn,
+ const int *m, const int *n,
+ const float *A, const int *ldA, const float *B, const int *ldB,
+ float *C, const int *ldC, float *scale,
+ int *info
+) {
+
+ // Check arguments
+ const int notransA = LAPACK(lsame)(tranA, "N");
+ const int transA = LAPACK(lsame)(tranA, "T");
+ const int ctransA = LAPACK(lsame)(tranA, "C");
+ const int notransB = LAPACK(lsame)(tranB, "N");
+ const int transB = LAPACK(lsame)(tranB, "T");
+ const int ctransB = LAPACK(lsame)(tranB, "C");
+ *info = 0;
+ if (!transA && !ctransA && !notransA)
+ *info = -1;
+ else if (!transB && !ctransB && !notransB)
+ *info = -2;
+ else if (*isgn != 1 && *isgn != -1)
+ *info = -3;
+ else if (*m < 0)
+ *info = -4;
+ else if (*n < 0)
+ *info = -5;
+ else if (*ldA < MAX(1, *m))
+ *info = -7;
+ else if (*ldB < MAX(1, *n))
+ *info = -9;
+ else if (*ldC < MAX(1, *m))
+ *info = -11;
+ if (*info) {
+ const int minfo = -*info;
+ LAPACK(xerbla)("STRSYL", &minfo);
+ return;
+ }
+
+ // Clean char * arguments
+ const char cleantranA = notransA ? 'N' : (transA ? 'T' : 'C');
+ const char cleantranB = notransB ? 'N' : (transB ? 'T' : 'C');
+
+ // Recursive kernel
+ RELAPACK_strsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
+}
+
+
+/** strsyl's recursive compute kernel */
+static void RELAPACK_strsyl_rec(
+ const char *tranA, const char *tranB, const int *isgn,
+ const int *m, const int *n,
+ const float *A, const int *ldA, const float *B, const int *ldB,
+ float *C, const int *ldC, float *scale,
+ int *info
+) {
+
+ if (*m <= MAX(CROSSOVER_STRSYL, 1) && *n <= MAX(CROSSOVER_STRSYL, 1)) {
+ // Unblocked
+ RELAPACK_strsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
+ return;
+ }
+
+ // Constants
+ const float ONE[] = { 1. };
+ const float MONE[] = { -1. };
+ const float MSGN[] = { -*isgn };
+ const int iONE[] = { 1 };
+
+ // Outputs
+ float scale1[] = { 1. };
+ float scale2[] = { 1. };
+ int info1[] = { 0 };
+ int info2[] = { 0 };
+
+ if (*m > *n) {
+ // Splitting
+ int m1 = SREC_SPLIT(*m);
+ if (A[m1 + *ldA * (m1 - 1)])
+ m1++;
+ const int m2 = *m - m1;
+
+ // A_TL A_TR
+ // 0 A_BR
+ const float *const A_TL = A;
+ const float *const A_TR = A + *ldA * m1;
+ const float *const A_BR = A + *ldA * m1 + m1;
+
+ // C_T
+ // C_B
+ float *const C_T = C;
+ float *const C_B = C + m1;
+
+ if (*tranA == 'N') {
+            // recursion(A_BR, B, C_B)
+ RELAPACK_strsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1);
+ // C_T = C_T - A_TR * C_B
+ BLAS(sgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
+            // recursion(A_TL, B, C_T)
+ RELAPACK_strsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2);
+ // apply scale
+ if (scale2[0] != 1)
+ LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
+ } else {
+            // recursion(A_TL, B, C_T)
+ RELAPACK_strsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1);
+ // C_B = C_B - A_TR' * C_T
+ BLAS(sgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
+            // recursion(A_BR, B, C_B)
+ RELAPACK_strsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2);
+ // apply scale
+ if (scale2[0] != 1)
+ LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_B, ldC, info);
+ }
+ } else {
+ // Splitting
+ int n1 = SREC_SPLIT(*n);
+ if (B[n1 + *ldB * (n1 - 1)])
+ n1++;
+ const int n2 = *n - n1;
+
+ // B_TL B_TR
+ // 0 B_BR
+ const float *const B_TL = B;
+ const float *const B_TR = B + *ldB * n1;
+ const float *const B_BR = B + *ldB * n1 + n1;
+
+ // C_L C_R
+ float *const C_L = C;
+ float *const C_R = C + *ldC * n1;
+
+ if (*tranB == 'N') {
+            // recursion(A, B_TL, C_L)
+ RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1);
+ // C_R = C_R -/+ C_L * B_TR
+ BLAS(sgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC);
+            // recursion(A, B_BR, C_R)
+ RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2);
+ // apply scale
+ if (scale2[0] != 1)
+ LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
+ } else {
+            // recursion(A, B_BR, C_R)
+ RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1);
+ // C_L = C_L -/+ C_R * B_TR'
+ BLAS(sgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC);
+            // recursion(A, B_TL, C_L)
+ RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2);
+ // apply scale
+ if (scale2[0] != 1)
+ LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
+ }
+ }
+
+ *scale = scale1[0] * scale2[0];
+ *info = info1[0] || info2[0];
+}
diff --git a/relapack/src/strsyl_rec2.c b/relapack/src/strsyl_rec2.c
new file mode 100644
index 000000000..6d40a475d
--- /dev/null
+++ b/relapack/src/strsyl_rec2.c
@@ -0,0 +1,1029 @@
+/* -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static int c__1 = 1;
+static int c_false = FALSE_;
+static int c__2 = 2;
+static float c_b26 = 1.f;
+static float c_b30 = 0.f;
+static int c_true = TRUE_;
+
+/* Unblocked base-case kernel for RELAPACK_strsyl_rec: f2c translation of
+   LAPACK's STRSYL.  Solves op(A)*X + ISGN*X*op(B) = SCALE*C for X, where A
+   (m-by-m) and B (n-by-n) are in real Schur (quasi-triangular) form; C is
+   overwritten with X.  Machine-generated code -- edit the Fortran original,
+   not this file. */
+void RELAPACK_strsyl_rec2(char *trana, char *tranb, int *isgn, int
+	*m, int *n, float *a, int *lda, float *b, int *ldb, float *
+	c__, int *ldc, float *scale, int *info, ftnlen trana_len,
+	ftnlen tranb_len)
+{
+    /* System generated locals */
+    int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
+	    i__3, i__4;
+    float r__1, r__2;
+
+    /* Local variables */
+    static int j, k, l;
+    static float x[4]	/* was [2][2] */;
+    static int k1, k2, l1, l2;
+    static float a11, db, da11, vec[4]	/* was [2][2] */, dum[1], eps, sgn;
+    static int ierr;
+    static float smin;
+    extern float sdot_(int *, float *, int *, float *, int *);
+    static float suml, sumr;
+    extern int lsame_(char *, char *, ftnlen, ftnlen);
+    extern /* Subroutine */ int sscal_(int *, float *, float *, int *);
+    static int knext, lnext;
+    static float xnorm;
+    extern /* Subroutine */ int slaln2_(int *, int *, int *, float
+	    *, float *, float *, int *, float *, float *, float *, int *,
+	    float *, float *, float *, int *, float *, float *, int *),
+	    slasy2_(int *, int *, int *, int *, int *,
+	    float *, int *, float *, int *, float *, int *, float *,
+	    float *, int *, float *, int *), slabad_(float *, float *);
+    static float scaloc;
+    extern float slamch_(char *, ftnlen), slange_(char *, int *,
+	    int *, float *, int *, float *, ftnlen);
+    extern /* Subroutine */ int xerbla_(char *, int *, ftnlen);
+    static float bignum;
+    static int notrna, notrnb;
+    static float smlnum;
+
+    /* Parameter adjustments (shift to 1-based Fortran indexing) */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1;
+    b -= b_offset;
+    c_dim1 = *ldc;
+    c_offset = 1 + c_dim1;
+    c__ -= c_offset;
+
+    /* Function Body */
+    notrna = lsame_(trana, "N", (ftnlen)1, (ftnlen)1);
+    notrnb = lsame_(tranb, "N", (ftnlen)1, (ftnlen)1);
+    *info = 0;
+    if (! notrna && ! lsame_(trana, "T", (ftnlen)1, (ftnlen)1) && ! lsame_(
+	    trana, "C", (ftnlen)1, (ftnlen)1)) {
+	*info = -1;
+    } else if (! notrnb && ! lsame_(tranb, "T", (ftnlen)1, (ftnlen)1) && !
+	    lsame_(tranb, "C", (ftnlen)1, (ftnlen)1)) {
+	*info = -2;
+    } else if (*isgn != 1 && *isgn != -1) {
+	*info = -3;
+    } else if (*m < 0) {
+	*info = -4;
+    } else if (*n < 0) {
+	*info = -5;
+    } else if (*lda < max(1,*m)) {
+	*info = -7;
+    } else if (*ldb < max(1,*n)) {
+	*info = -9;
+    } else if (*ldc < max(1,*m)) {
+	*info = -11;
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("STRSYL", &i__1, (ftnlen)6);
+	return;
+    }
+    *scale = 1.f;
+    if (*m == 0 || *n == 0) {
+	return;
+    }
+/* smin: threshold below which a diagonal pivot is treated as singular
+   (scaled machine precision times the larger of the two matrix norms) */
+    eps = slamch_("P", (ftnlen)1);
+    smlnum = slamch_("S", (ftnlen)1);
+    bignum = 1.f / smlnum;
+    slabad_(&smlnum, &bignum);
+    smlnum = smlnum * (float) (*m * *n) / eps;
+    bignum = 1.f / smlnum;
+/* Computing MAX */
+    r__1 = smlnum, r__2 = eps * slange_("M", m, m, &a[a_offset], lda, dum, (
+	    ftnlen)1), r__1 = max(r__1,r__2), r__2 = eps * slange_("M", n, n,
+	    &b[b_offset], ldb, dum, (ftnlen)1);
+    smin = dmax(r__1,r__2);
+    sgn = (float) (*isgn);
+/* Case 1: op(A) = A, op(B) = B.  Columns of X left to right (l), rows
+   bottom-up (k); 1x1/2x2 diagonal blocks give the four sub-cases below. */
+    if (notrna && notrnb) {
+	lnext = 1;
+	i__1 = *n;
+	for (l = 1; l <= i__1; ++l) {
+	    if (l < lnext) {
+		goto L70;
+	    }
+	    if (l == *n) {
+		l1 = l;
+		l2 = l;
+	    } else {
+		if (b[l + 1 + l * b_dim1] != 0.f) {
+		    l1 = l;
+		    l2 = l + 1;
+		    lnext = l + 2;
+		} else {
+		    l1 = l;
+		    l2 = l;
+		    lnext = l + 1;
+		}
+	    }
+	    knext = *m;
+	    for (k = *m; k >= 1; --k) {
+		if (k > knext) {
+		    goto L60;
+		}
+		if (k == 1) {
+		    k1 = k;
+		    k2 = k;
+		} else {
+		    if (a[k + (k - 1) * a_dim1] != 0.f) {
+			k1 = k - 1;
+			k2 = k;
+			knext = k - 2;
+		    } else {
+			k1 = k;
+			k2 = k;
+			knext = k - 1;
+		    }
+		}
+		if (l1 == l2 && k1 == k2) {
+		    i__2 = *m - k1;
+/* Computing MIN */
+		    i__3 = k1 + 1;
+/* Computing MIN */
+		    i__4 = k1 + 1;
+		    suml = sdot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, &
+			    c__[min(i__4,*m) + l1 * c_dim1], &c__1);
+		    i__2 = l1 - 1;
+		    sumr = sdot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 *
+			    b_dim1 + 1], &c__1);
+		    vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr);
+		    scaloc = 1.f;
+		    a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1];
+		    da11 = dabs(a11);
+		    if (da11 <= smin) {
+			a11 = smin;
+			da11 = smin;
+			*info = 1;
+		    }
+		    db = dabs(vec[0]);
+		    if (da11 < 1.f && db > 1.f) {
+			if (db > bignum * da11) {
+			    scaloc = 1.f / db;
+			}
+		    }
+		    x[0] = vec[0] * scaloc / a11;
+		    if (scaloc != 1.f) {
+			i__2 = *n;
+			for (j = 1; j <= i__2; ++j) {
+			    sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L10: */
+			}
+			*scale *= scaloc;
+		    }
+		    c__[k1 + l1 * c_dim1] = x[0];
+		} else if (l1 == l2 && k1 != k2) {
+		    i__2 = *m - k2;
+/* Computing MIN */
+		    i__3 = k2 + 1;
+/* Computing MIN */
+		    i__4 = k2 + 1;
+		    suml = sdot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, &
+			    c__[min(i__4,*m) + l1 * c_dim1], &c__1);
+		    i__2 = l1 - 1;
+		    sumr = sdot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 *
+			    b_dim1 + 1], &c__1);
+		    vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr);
+		    i__2 = *m - k2;
+/* Computing MIN */
+		    i__3 = k2 + 1;
+/* Computing MIN */
+		    i__4 = k2 + 1;
+		    suml = sdot_(&i__2, &a[k2 + min(i__3,*m) * a_dim1], lda, &
+			    c__[min(i__4,*m) + l1 * c_dim1], &c__1);
+		    i__2 = l1 - 1;
+		    sumr = sdot_(&i__2, &c__[k2 + c_dim1], ldc, &b[l1 *
+			    b_dim1 + 1], &c__1);
+		    vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr);
+		    r__1 = -sgn * b[l1 + l1 * b_dim1];
+		    slaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1
+			    * a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &r__1,
+			     &c_b30, x, &c__2, &scaloc, &xnorm, &ierr);
+		    if (ierr != 0) {
+			*info = 1;
+		    }
+		    if (scaloc != 1.f) {
+			i__2 = *n;
+			for (j = 1; j <= i__2; ++j) {
+			    sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L20: */
+			}
+			*scale *= scaloc;
+		    }
+		    c__[k1 + l1 * c_dim1] = x[0];
+		    c__[k2 + l1 * c_dim1] = x[1];
+		} else if (l1 != l2 && k1 == k2) {
+		    i__2 = *m - k1;
+/* Computing MIN */
+		    i__3 = k1 + 1;
+/* Computing MIN */
+		    i__4 = k1 + 1;
+		    suml = sdot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, &
+			    c__[min(i__4,*m) + l1 * c_dim1], &c__1);
+		    i__2 = l1 - 1;
+		    sumr = sdot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 *
+			    b_dim1 + 1], &c__1);
+		    vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn *
+			    sumr));
+		    i__2 = *m - k1;
+/* Computing MIN */
+		    i__3 = k1 + 1;
+/* Computing MIN */
+		    i__4 = k1 + 1;
+		    suml = sdot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, &
+			    c__[min(i__4,*m) + l2 * c_dim1], &c__1);
+		    i__2 = l1 - 1;
+		    sumr = sdot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l2 *
+			    b_dim1 + 1], &c__1);
+		    vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn *
+			    sumr));
+		    r__1 = -sgn * a[k1 + k1 * a_dim1];
+		    slaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1 *
+			     b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &r__1,
+			    &c_b30, x, &c__2, &scaloc, &xnorm, &ierr);
+		    if (ierr != 0) {
+			*info = 1;
+		    }
+		    if (scaloc != 1.f) {
+			i__2 = *n;
+			for (j = 1; j <= i__2; ++j) {
+			    sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L40: */
+			}
+			*scale *= scaloc;
+		    }
+		    c__[k1 + l1 * c_dim1] = x[0];
+		    c__[k1 + l2 * c_dim1] = x[1];
+		} else if (l1 != l2 && k1 != k2) {
+		    i__2 = *m - k2;
+/* Computing MIN */
+		    i__3 = k2 + 1;
+/* Computing MIN */
+		    i__4 = k2 + 1;
+		    suml = sdot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, &
+			    c__[min(i__4,*m) + l1 * c_dim1], &c__1);
+		    i__2 = l1 - 1;
+		    sumr = sdot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 *
+			    b_dim1 + 1], &c__1);
+		    vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr);
+		    i__2 = *m - k2;
+/* Computing MIN */
+		    i__3 = k2 + 1;
+/* Computing MIN */
+		    i__4 = k2 + 1;
+		    suml = sdot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, &
+			    c__[min(i__4,*m) + l2 * c_dim1], &c__1);
+		    i__2 = l1 - 1;
+		    sumr = sdot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l2 *
+			    b_dim1 + 1], &c__1);
+		    vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr);
+		    i__2 = *m - k2;
+/* Computing MIN */
+		    i__3 = k2 + 1;
+/* Computing MIN */
+		    i__4 = k2 + 1;
+		    suml = sdot_(&i__2, &a[k2 + min(i__3,*m) * a_dim1], lda, &
+			    c__[min(i__4,*m) + l1 * c_dim1], &c__1);
+		    i__2 = l1 - 1;
+		    sumr = sdot_(&i__2, &c__[k2 + c_dim1], ldc, &b[l1 *
+			    b_dim1 + 1], &c__1);
+		    vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr);
+		    i__2 = *m - k2;
+/* Computing MIN */
+		    i__3 = k2 + 1;
+/* Computing MIN */
+		    i__4 = k2 + 1;
+		    suml = sdot_(&i__2, &a[k2 + min(i__3,*m) * a_dim1], lda, &
+			    c__[min(i__4,*m) + l2 * c_dim1], &c__1);
+		    i__2 = l1 - 1;
+		    sumr = sdot_(&i__2, &c__[k2 + c_dim1], ldc, &b[l2 *
+			    b_dim1 + 1], &c__1);
+		    vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr);
+		    slasy2_(&c_false, &c_false, isgn, &c__2, &c__2, &a[k1 +
+			    k1 * a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec,
+			     &c__2, &scaloc, x, &c__2, &xnorm, &ierr);
+		    if (ierr != 0) {
+			*info = 1;
+		    }
+		    if (scaloc != 1.f) {
+			i__2 = *n;
+			for (j = 1; j <= i__2; ++j) {
+			    sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L50: */
+			}
+			*scale *= scaloc;
+		    }
+		    c__[k1 + l1 * c_dim1] = x[0];
+		    c__[k1 + l2 * c_dim1] = x[2];
+		    c__[k2 + l1 * c_dim1] = x[1];
+		    c__[k2 + l2 * c_dim1] = x[3];
+		}
+L60:
+		;
+	    }
+L70:
+	    ;
+	}
+/* Case 2: op(A) = A', op(B) = B.  Columns left to right, rows top-down. */
+    } else if (! notrna && notrnb) {
+	lnext = 1;
+	i__1 = *n;
+	for (l = 1; l <= i__1; ++l) {
+	    if (l < lnext) {
+		goto L130;
+	    }
+	    if (l == *n) {
+		l1 = l;
+		l2 = l;
+	    } else {
+		if (b[l + 1 + l * b_dim1] != 0.f) {
+		    l1 = l;
+		    l2 = l + 1;
+		    lnext = l + 2;
+		} else {
+		    l1 = l;
+		    l2 = l;
+		    lnext = l + 1;
+		}
+	    }
+	    knext = 1;
+	    i__2 = *m;
+	    for (k = 1; k <= i__2; ++k) {
+		if (k < knext) {
+		    goto L120;
+		}
+		if (k == *m) {
+		    k1 = k;
+		    k2 = k;
+		} else {
+		    if (a[k + 1 + k * a_dim1] != 0.f) {
+			k1 = k;
+			k2 = k + 1;
+			knext = k + 2;
+		    } else {
+			k1 = k;
+			k2 = k;
+			knext = k + 1;
+		    }
+		}
+		if (l1 == l2 && k1 == k2) {
+		    i__3 = k1 - 1;
+		    suml = sdot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 *
+			    c_dim1 + 1], &c__1);
+		    i__3 = l1 - 1;
+		    sumr = sdot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 *
+			    b_dim1 + 1], &c__1);
+		    vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr);
+		    scaloc = 1.f;
+		    a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1];
+		    da11 = dabs(a11);
+		    if (da11 <= smin) {
+			a11 = smin;
+			da11 = smin;
+			*info = 1;
+		    }
+		    db = dabs(vec[0]);
+		    if (da11 < 1.f && db > 1.f) {
+			if (db > bignum * da11) {
+			    scaloc = 1.f / db;
+			}
+		    }
+		    x[0] = vec[0] * scaloc / a11;
+		    if (scaloc != 1.f) {
+			i__3 = *n;
+			for (j = 1; j <= i__3; ++j) {
+			    sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L80: */
+			}
+			*scale *= scaloc;
+		    }
+		    c__[k1 + l1 * c_dim1] = x[0];
+		} else if (l1 == l2 && k1 != k2) {
+		    i__3 = k1 - 1;
+		    suml = sdot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 *
+			    c_dim1 + 1], &c__1);
+		    i__3 = l1 - 1;
+		    sumr = sdot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 *
+			    b_dim1 + 1], &c__1);
+		    vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr);
+		    i__3 = k1 - 1;
+		    suml = sdot_(&i__3, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 *
+			    c_dim1 + 1], &c__1);
+		    i__3 = l1 - 1;
+		    sumr = sdot_(&i__3, &c__[k2 + c_dim1], ldc, &b[l1 *
+			    b_dim1 + 1], &c__1);
+		    vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr);
+		    r__1 = -sgn * b[l1 + l1 * b_dim1];
+		    slaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1 *
+			     a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &r__1,
+			    &c_b30, x, &c__2, &scaloc, &xnorm, &ierr);
+		    if (ierr != 0) {
+			*info = 1;
+		    }
+		    if (scaloc != 1.f) {
+			i__3 = *n;
+			for (j = 1; j <= i__3; ++j) {
+			    sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L90: */
+			}
+			*scale *= scaloc;
+		    }
+		    c__[k1 + l1 * c_dim1] = x[0];
+		    c__[k2 + l1 * c_dim1] = x[1];
+		} else if (l1 != l2 && k1 == k2) {
+		    i__3 = k1 - 1;
+		    suml = sdot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 *
+			    c_dim1 + 1], &c__1);
+		    i__3 = l1 - 1;
+		    sumr = sdot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 *
+			    b_dim1 + 1], &c__1);
+		    vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn *
+			    sumr));
+		    i__3 = k1 - 1;
+		    suml = sdot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 *
+			    c_dim1 + 1], &c__1);
+		    i__3 = l1 - 1;
+		    sumr = sdot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l2 *
+			    b_dim1 + 1], &c__1);
+		    vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn *
+			    sumr));
+		    r__1 = -sgn * a[k1 + k1 * a_dim1];
+		    slaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1 *
+			     b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &r__1,
+			    &c_b30, x, &c__2, &scaloc, &xnorm, &ierr);
+		    if (ierr != 0) {
+			*info = 1;
+		    }
+		    if (scaloc != 1.f) {
+			i__3 = *n;
+			for (j = 1; j <= i__3; ++j) {
+			    sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L100: */
+			}
+			*scale *= scaloc;
+		    }
+		    c__[k1 + l1 * c_dim1] = x[0];
+		    c__[k1 + l2 * c_dim1] = x[1];
+		} else if (l1 != l2 && k1 != k2) {
+		    i__3 = k1 - 1;
+		    suml = sdot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 *
+			    c_dim1 + 1], &c__1);
+		    i__3 = l1 - 1;
+		    sumr = sdot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 *
+			    b_dim1 + 1], &c__1);
+		    vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr);
+		    i__3 = k1 - 1;
+		    suml = sdot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 *
+			    c_dim1 + 1], &c__1);
+		    i__3 = l1 - 1;
+		    sumr = sdot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l2 *
+			    b_dim1 + 1], &c__1);
+		    vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr);
+		    i__3 = k1 - 1;
+		    suml = sdot_(&i__3, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 *
+			    c_dim1 + 1], &c__1);
+		    i__3 = l1 - 1;
+		    sumr = sdot_(&i__3, &c__[k2 + c_dim1], ldc, &b[l1 *
+			    b_dim1 + 1], &c__1);
+		    vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr);
+		    i__3 = k1 - 1;
+		    suml = sdot_(&i__3, &a[k2 * a_dim1 + 1], &c__1, &c__[l2 *
+			    c_dim1 + 1], &c__1);
+		    i__3 = l1 - 1;
+		    sumr = sdot_(&i__3, &c__[k2 + c_dim1], ldc, &b[l2 *
+			    b_dim1 + 1], &c__1);
+		    vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr);
+		    slasy2_(&c_true, &c_false, isgn, &c__2, &c__2, &a[k1 + k1
+			    * a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec, &
+			    c__2, &scaloc, x, &c__2, &xnorm, &ierr);
+		    if (ierr != 0) {
+			*info = 1;
+		    }
+		    if (scaloc != 1.f) {
+			i__3 = *n;
+			for (j = 1; j <= i__3; ++j) {
+			    sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L110: */
+			}
+			*scale *= scaloc;
+		    }
+		    c__[k1 + l1 * c_dim1] = x[0];
+		    c__[k1 + l2 * c_dim1] = x[2];
+		    c__[k2 + l1 * c_dim1] = x[1];
+		    c__[k2 + l2 * c_dim1] = x[3];
+		}
+L120:
+		;
+	    }
+L130:
+	    ;
+	}
+/* Case 3: op(A) = A', op(B) = B'.  Columns right to left, rows top-down. */
+    } else if (! notrna && ! notrnb) {
+	lnext = *n;
+	for (l = *n; l >= 1; --l) {
+	    if (l > lnext) {
+		goto L190;
+	    }
+	    if (l == 1) {
+		l1 = l;
+		l2 = l;
+	    } else {
+		if (b[l + (l - 1) * b_dim1] != 0.f) {
+		    l1 = l - 1;
+		    l2 = l;
+		    lnext = l - 2;
+		} else {
+		    l1 = l;
+		    l2 = l;
+		    lnext = l - 1;
+		}
+	    }
+	    knext = 1;
+	    i__1 = *m;
+	    for (k = 1; k <= i__1; ++k) {
+		if (k < knext) {
+		    goto L180;
+		}
+		if (k == *m) {
+		    k1 = k;
+		    k2 = k;
+		} else {
+		    if (a[k + 1 + k * a_dim1] != 0.f) {
+			k1 = k;
+			k2 = k + 1;
+			knext = k + 2;
+		    } else {
+			k1 = k;
+			k2 = k;
+			knext = k + 1;
+		    }
+		}
+		if (l1 == l2 && k1 == k2) {
+		    i__2 = k1 - 1;
+		    suml = sdot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 *
+			    c_dim1 + 1], &c__1);
+		    i__2 = *n - l1;
+/* Computing MIN */
+		    i__3 = l1 + 1;
+/* Computing MIN */
+		    i__4 = l1 + 1;
+		    sumr = sdot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc,
+			     &b[l1 + min(i__4,*n) * b_dim1], ldb);
+		    vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr);
+		    scaloc = 1.f;
+		    a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1];
+		    da11 = dabs(a11);
+		    if (da11 <= smin) {
+			a11 = smin;
+			da11 = smin;
+			*info = 1;
+		    }
+		    db = dabs(vec[0]);
+		    if (da11 < 1.f && db > 1.f) {
+			if (db > bignum * da11) {
+			    scaloc = 1.f / db;
+			}
+		    }
+		    x[0] = vec[0] * scaloc / a11;
+		    if (scaloc != 1.f) {
+			i__2 = *n;
+			for (j = 1; j <= i__2; ++j) {
+			    sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L140: */
+			}
+			*scale *= scaloc;
+		    }
+		    c__[k1 + l1 * c_dim1] = x[0];
+		} else if (l1 == l2 && k1 != k2) {
+		    i__2 = k1 - 1;
+		    suml = sdot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 *
+			    c_dim1 + 1], &c__1);
+		    i__2 = *n - l2;
+/* Computing MIN */
+		    i__3 = l2 + 1;
+/* Computing MIN */
+		    i__4 = l2 + 1;
+		    sumr = sdot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc,
+			     &b[l1 + min(i__4,*n) * b_dim1], ldb);
+		    vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr);
+		    i__2 = k1 - 1;
+		    suml = sdot_(&i__2, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 *
+			    c_dim1 + 1], &c__1);
+		    i__2 = *n - l2;
+/* Computing MIN */
+		    i__3 = l2 + 1;
+/* Computing MIN */
+		    i__4 = l2 + 1;
+		    sumr = sdot_(&i__2, &c__[k2 + min(i__3,*n) * c_dim1], ldc,
+			     &b[l1 + min(i__4,*n) * b_dim1], ldb);
+		    vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr);
+		    r__1 = -sgn * b[l1 + l1 * b_dim1];
+		    slaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1 *
+			     a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &r__1,
+			    &c_b30, x, &c__2, &scaloc, &xnorm, &ierr);
+		    if (ierr != 0) {
+			*info = 1;
+		    }
+		    if (scaloc != 1.f) {
+			i__2 = *n;
+			for (j = 1; j <= i__2; ++j) {
+			    sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L150: */
+			}
+			*scale *= scaloc;
+		    }
+		    c__[k1 + l1 * c_dim1] = x[0];
+		    c__[k2 + l1 * c_dim1] = x[1];
+		} else if (l1 != l2 && k1 == k2) {
+		    i__2 = k1 - 1;
+		    suml = sdot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 *
+			    c_dim1 + 1], &c__1);
+		    i__2 = *n - l2;
+/* Computing MIN */
+		    i__3 = l2 + 1;
+/* Computing MIN */
+		    i__4 = l2 + 1;
+		    sumr = sdot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc,
+			     &b[l1 + min(i__4,*n) * b_dim1], ldb);
+		    vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn *
+			    sumr));
+		    i__2 = k1 - 1;
+		    suml = sdot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 *
+			    c_dim1 + 1], &c__1);
+		    i__2 = *n - l2;
+/* Computing MIN */
+		    i__3 = l2 + 1;
+/* Computing MIN */
+		    i__4 = l2 + 1;
+		    sumr = sdot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc,
+			     &b[l2 + min(i__4,*n) * b_dim1], ldb);
+		    vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn *
+			    sumr));
+		    r__1 = -sgn * a[k1 + k1 * a_dim1];
+		    slaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1
+			    * b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &r__1,
+			     &c_b30, x, &c__2, &scaloc, &xnorm, &ierr);
+		    if (ierr != 0) {
+			*info = 1;
+		    }
+		    if (scaloc != 1.f) {
+			i__2 = *n;
+			for (j = 1; j <= i__2; ++j) {
+			    sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L160: */
+			}
+			*scale *= scaloc;
+		    }
+		    c__[k1 + l1 * c_dim1] = x[0];
+		    c__[k1 + l2 * c_dim1] = x[1];
+		} else if (l1 != l2 && k1 != k2) {
+		    i__2 = k1 - 1;
+		    suml = sdot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 *
+			    c_dim1 + 1], &c__1);
+		    i__2 = *n - l2;
+/* Computing MIN */
+		    i__3 = l2 + 1;
+/* Computing MIN */
+		    i__4 = l2 + 1;
+		    sumr = sdot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc,
+			     &b[l1 + min(i__4,*n) * b_dim1], ldb);
+		    vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr);
+		    i__2 = k1 - 1;
+		    suml = sdot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 *
+			    c_dim1 + 1], &c__1);
+		    i__2 = *n - l2;
+/* Computing MIN */
+		    i__3 = l2 + 1;
+/* Computing MIN */
+		    i__4 = l2 + 1;
+		    sumr = sdot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc,
+			     &b[l2 + min(i__4,*n) * b_dim1], ldb);
+		    vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr);
+		    i__2 = k1 - 1;
+		    suml = sdot_(&i__2, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 *
+			    c_dim1 + 1], &c__1);
+		    i__2 = *n - l2;
+/* Computing MIN */
+		    i__3 = l2 + 1;
+/* Computing MIN */
+		    i__4 = l2 + 1;
+		    sumr = sdot_(&i__2, &c__[k2 + min(i__3,*n) * c_dim1], ldc,
+			     &b[l1 + min(i__4,*n) * b_dim1], ldb);
+		    vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr);
+		    i__2 = k1 - 1;
+		    suml = sdot_(&i__2, &a[k2 * a_dim1 + 1], &c__1, &c__[l2 *
+			    c_dim1 + 1], &c__1);
+		    i__2 = *n - l2;
+/* Computing MIN */
+		    i__3 = l2 + 1;
+/* Computing MIN */
+		    i__4 = l2 + 1;
+		    sumr = sdot_(&i__2, &c__[k2 + min(i__3,*n) * c_dim1], ldc,
+			     &b[l2 + min(i__4,*n) * b_dim1], ldb);
+		    vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr);
+		    slasy2_(&c_true, &c_true, isgn, &c__2, &c__2, &a[k1 + k1 *
+			     a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec, &
+			    c__2, &scaloc, x, &c__2, &xnorm, &ierr);
+		    if (ierr != 0) {
+			*info = 1;
+		    }
+		    if (scaloc != 1.f) {
+			i__2 = *n;
+			for (j = 1; j <= i__2; ++j) {
+			    sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L170: */
+			}
+			*scale *= scaloc;
+		    }
+		    c__[k1 + l1 * c_dim1] = x[0];
+		    c__[k1 + l2 * c_dim1] = x[2];
+		    c__[k2 + l1 * c_dim1] = x[1];
+		    c__[k2 + l2 * c_dim1] = x[3];
+		}
+L180:
+		;
+	    }
+L190:
+	    ;
+	}
+/* Case 4: op(A) = A, op(B) = B'.  Columns right to left, rows bottom-up. */
+    } else if (notrna && ! notrnb) {
+	lnext = *n;
+	for (l = *n; l >= 1; --l) {
+	    if (l > lnext) {
+		goto L250;
+	    }
+	    if (l == 1) {
+		l1 = l;
+		l2 = l;
+	    } else {
+		if (b[l + (l - 1) * b_dim1] != 0.f) {
+		    l1 = l - 1;
+		    l2 = l;
+		    lnext = l - 2;
+		} else {
+		    l1 = l;
+		    l2 = l;
+		    lnext = l - 1;
+		}
+	    }
+	    knext = *m;
+	    for (k = *m; k >= 1; --k) {
+		if (k > knext) {
+		    goto L240;
+		}
+		if (k == 1) {
+		    k1 = k;
+		    k2 = k;
+		} else {
+		    if (a[k + (k - 1) * a_dim1] != 0.f) {
+			k1 = k - 1;
+			k2 = k;
+			knext = k - 2;
+		    } else {
+			k1 = k;
+			k2 = k;
+			knext = k - 1;
+		    }
+		}
+		if (l1 == l2 && k1 == k2) {
+		    i__1 = *m - k1;
+/* Computing MIN */
+		    i__2 = k1 + 1;
+/* Computing MIN */
+		    i__3 = k1 + 1;
+		    suml = sdot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, &
+			    c__[min(i__3,*m) + l1 * c_dim1], &c__1);
+		    i__1 = *n - l1;
+/* Computing MIN */
+		    i__2 = l1 + 1;
+/* Computing MIN */
+		    i__3 = l1 + 1;
+		    sumr = sdot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc,
+			     &b[l1 + min(i__3,*n) * b_dim1], ldb);
+		    vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr);
+		    scaloc = 1.f;
+		    a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1];
+		    da11 = dabs(a11);
+		    if (da11 <= smin) {
+			a11 = smin;
+			da11 = smin;
+			*info = 1;
+		    }
+		    db = dabs(vec[0]);
+		    if (da11 < 1.f && db > 1.f) {
+			if (db > bignum * da11) {
+			    scaloc = 1.f / db;
+			}
+		    }
+		    x[0] = vec[0] * scaloc / a11;
+		    if (scaloc != 1.f) {
+			i__1 = *n;
+			for (j = 1; j <= i__1; ++j) {
+			    sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L200: */
+			}
+			*scale *= scaloc;
+		    }
+		    c__[k1 + l1 * c_dim1] = x[0];
+		} else if (l1 == l2 && k1 != k2) {
+		    i__1 = *m - k2;
+/* Computing MIN */
+		    i__2 = k2 + 1;
+/* Computing MIN */
+		    i__3 = k2 + 1;
+		    suml = sdot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, &
+			    c__[min(i__3,*m) + l1 * c_dim1], &c__1);
+		    i__1 = *n - l2;
+/* Computing MIN */
+		    i__2 = l2 + 1;
+/* Computing MIN */
+		    i__3 = l2 + 1;
+		    sumr = sdot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc,
+			     &b[l1 + min(i__3,*n) * b_dim1], ldb);
+		    vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr);
+		    i__1 = *m - k2;
+/* Computing MIN */
+		    i__2 = k2 + 1;
+/* Computing MIN */
+		    i__3 = k2 + 1;
+		    suml = sdot_(&i__1, &a[k2 + min(i__2,*m) * a_dim1], lda, &
+			    c__[min(i__3,*m) + l1 * c_dim1], &c__1);
+		    i__1 = *n - l2;
+/* Computing MIN */
+		    i__2 = l2 + 1;
+/* Computing MIN */
+		    i__3 = l2 + 1;
+		    sumr = sdot_(&i__1, &c__[k2 + min(i__2,*n) * c_dim1], ldc,
+			     &b[l1 + min(i__3,*n) * b_dim1], ldb);
+		    vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr);
+		    r__1 = -sgn * b[l1 + l1 * b_dim1];
+		    slaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1
+			    * a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &r__1,
+			     &c_b30, x, &c__2, &scaloc, &xnorm, &ierr);
+		    if (ierr != 0) {
+			*info = 1;
+		    }
+		    if (scaloc != 1.f) {
+			i__1 = *n;
+			for (j = 1; j <= i__1; ++j) {
+			    sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L210: */
+			}
+			*scale *= scaloc;
+		    }
+		    c__[k1 + l1 * c_dim1] = x[0];
+		    c__[k2 + l1 * c_dim1] = x[1];
+		} else if (l1 != l2 && k1 == k2) {
+		    i__1 = *m - k1;
+/* Computing MIN */
+		    i__2 = k1 + 1;
+/* Computing MIN */
+		    i__3 = k1 + 1;
+		    suml = sdot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, &
+			    c__[min(i__3,*m) + l1 * c_dim1], &c__1);
+		    i__1 = *n - l2;
+/* Computing MIN */
+		    i__2 = l2 + 1;
+/* Computing MIN */
+		    i__3 = l2 + 1;
+		    sumr = sdot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc,
+			     &b[l1 + min(i__3,*n) * b_dim1], ldb);
+		    vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn *
+			    sumr));
+		    i__1 = *m - k1;
+/* Computing MIN */
+		    i__2 = k1 + 1;
+/* Computing MIN */
+		    i__3 = k1 + 1;
+		    suml = sdot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, &
+			    c__[min(i__3,*m) + l2 * c_dim1], &c__1);
+		    i__1 = *n - l2;
+/* Computing MIN */
+		    i__2 = l2 + 1;
+/* Computing MIN */
+		    i__3 = l2 + 1;
+		    sumr = sdot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc,
+			     &b[l2 + min(i__3,*n) * b_dim1], ldb);
+		    vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn *
+			    sumr));
+		    r__1 = -sgn * a[k1 + k1 * a_dim1];
+		    slaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1
+			    * b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &r__1,
+			     &c_b30, x, &c__2, &scaloc, &xnorm, &ierr);
+		    if (ierr != 0) {
+			*info = 1;
+		    }
+		    if (scaloc != 1.f) {
+			i__1 = *n;
+			for (j = 1; j <= i__1; ++j) {
+			    sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L220: */
+			}
+			*scale *= scaloc;
+		    }
+		    c__[k1 + l1 * c_dim1] = x[0];
+		    c__[k1 + l2 * c_dim1] = x[1];
+		} else if (l1 != l2 && k1 != k2) {
+		    i__1 = *m - k2;
+/* Computing MIN */
+		    i__2 = k2 + 1;
+/* Computing MIN */
+		    i__3 = k2 + 1;
+		    suml = sdot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, &
+			    c__[min(i__3,*m) + l1 * c_dim1], &c__1);
+		    i__1 = *n - l2;
+/* Computing MIN */
+		    i__2 = l2 + 1;
+/* Computing MIN */
+		    i__3 = l2 + 1;
+		    sumr = sdot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc,
+			     &b[l1 + min(i__3,*n) * b_dim1], ldb);
+		    vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr);
+		    i__1 = *m - k2;
+/* Computing MIN */
+		    i__2 = k2 + 1;
+/* Computing MIN */
+		    i__3 = k2 + 1;
+		    suml = sdot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, &
+			    c__[min(i__3,*m) + l2 * c_dim1], &c__1);
+		    i__1 = *n - l2;
+/* Computing MIN */
+		    i__2 = l2 + 1;
+/* Computing MIN */
+		    i__3 = l2 + 1;
+		    sumr = sdot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc,
+			     &b[l2 + min(i__3,*n) * b_dim1], ldb);
+		    vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr);
+		    i__1 = *m - k2;
+/* Computing MIN */
+		    i__2 = k2 + 1;
+/* Computing MIN */
+		    i__3 = k2 + 1;
+		    suml = sdot_(&i__1, &a[k2 + min(i__2,*m) * a_dim1], lda, &
+			    c__[min(i__3,*m) + l1 * c_dim1], &c__1);
+		    i__1 = *n - l2;
+/* Computing MIN */
+		    i__2 = l2 + 1;
+/* Computing MIN */
+		    i__3 = l2 + 1;
+		    sumr = sdot_(&i__1, &c__[k2 + min(i__2,*n) * c_dim1], ldc,
+			     &b[l1 + min(i__3,*n) * b_dim1], ldb);
+		    vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr);
+		    i__1 = *m - k2;
+/* Computing MIN */
+		    i__2 = k2 + 1;
+/* Computing MIN */
+		    i__3 = k2 + 1;
+		    suml = sdot_(&i__1, &a[k2 + min(i__2,*m) * a_dim1], lda, &
+			    c__[min(i__3,*m) + l2 * c_dim1], &c__1);
+		    i__1 = *n - l2;
+/* Computing MIN */
+		    i__2 = l2 + 1;
+/* Computing MIN */
+		    i__3 = l2 + 1;
+		    sumr = sdot_(&i__1, &c__[k2 + min(i__2,*n) * c_dim1], ldc,
+			     &b[l2 + min(i__3,*n) * b_dim1], ldb);
+		    vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr);
+		    slasy2_(&c_false, &c_true, isgn, &c__2, &c__2, &a[k1 + k1
+			    * a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec, &
+			    c__2, &scaloc, x, &c__2, &xnorm, &ierr);
+		    if (ierr != 0) {
+			*info = 1;
+		    }
+		    if (scaloc != 1.f) {
+			i__1 = *n;
+			for (j = 1; j <= i__1; ++j) {
+			    sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L230: */
+			}
+			*scale *= scaloc;
+		    }
+		    c__[k1 + l1 * c_dim1] = x[0];
+		    c__[k1 + l2 * c_dim1] = x[2];
+		    c__[k2 + l1 * c_dim1] = x[1];
+		    c__[k2 + l2 * c_dim1] = x[3];
+		}
+L240:
+		;
+	    }
+L250:
+	    ;
+	}
+    }
+}
diff --git a/relapack/src/strtri.c b/relapack/src/strtri.c
new file mode 100644
index 000000000..d35bbd49f
--- /dev/null
+++ b/relapack/src/strtri.c
@@ -0,0 +1,107 @@
+#include "relapack.h"
+
+static void RELAPACK_strtri_rec(const char *, const char *, const int *,
+ float *, const int *, int *);
+
+
+/** CTRTRI computes the inverse of a real upper or lower triangular matrix A.
+ *
+ * This routine is functionally equivalent to LAPACK's strtri.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/de/d76/strtri_8f.html
+ * */
+/** STRTRI computes the inverse of a real upper or lower triangular matrix A.
+ *
+ * This routine is functionally equivalent to LAPACK's strtri.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/de/d76/strtri_8f.html
+ * */
+void RELAPACK_strtri(
+    const char *uplo, const char *diag, const int *n,
+    float *A, const int *ldA,
+    int *info
+) {
+
+    // Check arguments
+    const int lower = LAPACK(lsame)(uplo, "L");
+    const int upper = LAPACK(lsame)(uplo, "U");
+    const int nounit = LAPACK(lsame)(diag, "N");
+    const int unit = LAPACK(lsame)(diag, "U");
+    *info = 0;
+    if (!lower && !upper)
+        *info = -1;
+    else if (!nounit && !unit)
+        *info = -2;
+    else if (*n < 0)
+        *info = -3;
+    else if (*ldA < MAX(1, *n))
+        *info = -5;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("STRTRI", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleanuplo = lower ? 'L' : 'U';
+    const char cleandiag = nounit ? 'N' : 'U';
+
+    // check for singularity
+    if (nounit) {
+        int i;
+        for (i = 0; i < *n; i++)
+            if (A[i + *ldA * i] == 0) {
+                // LAPACK convention: info > 0 is the 1-based index of the
+                // zero diagonal entry.  The loop index i is 0-based, so
+                // report i + 1 (plain "i" would return info == 0 -- i.e.
+                // success -- for a zero at A(1,1)).
+                *info = i + 1;
+                return;
+            }
+    }
+
+    // Recursive kernel
+    RELAPACK_strtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info);
+}
+
+
+/** strtri's recursive compute kernel */
+/** strtri's recursive compute kernel.
+ *
+ * Splits A into a 2x2 block partition with triangular diagonal blocks,
+ * inverts A_TL recursively, updates the off-diagonal block with two
+ * level-3 BLAS calls, then inverts A_BR recursively.  Falls back to
+ * LAPACK's unblocked strti2 below the crossover size. */
+static void RELAPACK_strtri_rec(
+    const char *uplo, const char *diag, const int *n,
+    float *A, const int *ldA,
+    int *info
+){
+
+    if (*n <= MAX(CROSSOVER_STRTRI, 1)) {
+        // Unblocked base case
+        LAPACK(strti2)(uplo, diag, n, A, ldA, info);
+        return;
+    }
+
+    // Constants
+    const float ONE[]  = { 1. };
+    const float MONE[] = { -1. };
+
+    // Splitting
+    const int n1 = SREC_SPLIT(*n);
+    const int n2 = *n - n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    // (column-major: A_TR/A_BL offsets are *ldA * n1 and n1 respectively)
+    float *const A_TL = A;
+    float *const A_TR = A + *ldA * n1;
+    float *const A_BL = A + n1;
+    float *const A_BR = A + *ldA * n1 + n1;
+
+    // recursion(A_TL)
+    RELAPACK_strtri_rec(uplo, diag, &n1, A_TL, ldA, info);
+    if (*info)
+        return;
+
+    if (*uplo == 'L') {
+        // A_BL = - A_BL * A_TL   (A_TL already holds inv(A_TL) here)
+        BLAS(strmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA);
+        // A_BL = A_BR \ A_BL
+        BLAS(strsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
+    } else {
+        // A_TR = - A_TL * A_TR
+        BLAS(strmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA);
+        // A_TR = A_TR / A_BR
+        BLAS(strsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
+    }
+
+    // recursion(A_BR)
+    RELAPACK_strtri_rec(uplo, diag, &n2, A_BR, ldA, info);
+    if (*info)
+        // shift a sub-block failure index into full-matrix coordinates
+        *info += n1;
+}
diff --git a/relapack/src/zgbtrf.c b/relapack/src/zgbtrf.c
new file mode 100644
index 000000000..3aa6bf531
--- /dev/null
+++ b/relapack/src/zgbtrf.c
@@ -0,0 +1,230 @@
+#include "relapack.h"
+#include "stdlib.h"
+
+static void RELAPACK_zgbtrf_rec(const int *, const int *, const int *,
+ const int *, double *, const int *, int *, double *, const int *, double *,
+ const int *, int *);
+
+
+/** ZGBTRF computes an LU factorization of a complex m-by-n band matrix A using partial pivoting with row interchanges.
+ *
+ * This routine is functionally equivalent to LAPACK's zgbtrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/dc/dcb/zgbtrf_8f.html
+ * */
+void RELAPACK_zgbtrf(
+    const int *m, const int *n, const int *kl, const int *ku,
+    double *Ab, const int *ldAb, int *ipiv,
+    int *info
+) {
+
+    // Check arguments
+    *info = 0;
+    if (*m < 0)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*kl < 0)
+        *info = -3;
+    else if (*ku < 0)
+        *info = -4;
+    else if (*ldAb < 2 * *kl + *ku + 1)
+        *info = -6;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("ZGBTRF", &minfo);
+        return;
+    }
+
+    // Constant
+    const double ZERO[] = { 0., 0. };
+
+    // Result upper band width
+    const int kv = *ku + *kl;
+
+    // Unskew A
+    // (complex entries are stored as pairs of doubles, hence the factor 2)
+    const int ldA[] = { *ldAb - 1 };
+    double *const A = Ab + 2 * kv;
+
+    // Zero upper diagonal fill-in elements
+    int i, j;
+    for (j = 0; j < *n; j++) {
+        double *const A_j = A + 2 * *ldA * j;
+        for (i = MAX(0, j - kv); i < j - *ku; i++)
+            A_j[2 * i] = A_j[2 * i + 1] = 0.;
+    }
+
+    // Allocate work space
+    // (sizes depend on whether the band is wider than the recursive split)
+    const int n1 = ZREC_SPLIT(*n);
+    const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
+    const int nWorkl = (kv > n1) ? n1 : kv;
+    const int mWorku = (*kl > n1) ? n1 : *kl;
+    const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
+    // NOTE(review): malloc results are not checked; a failed allocation would
+    // dereference NULL in zlaset below — consider adding NULL checks.
+    double *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(double));
+    double *Worku = malloc(mWorku * nWorku * 2 * sizeof(double));
+    LAPACK(zlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
+    LAPACK(zlaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku);
+
+    // Recursive kernel
+    RELAPACK_zgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info);
+
+    // Free work space
+    free(Workl);
+    free(Worku);
+}
+
+
+/** zgbtrf's recursive compute kernel */
+static void RELAPACK_zgbtrf_rec(
+    const int *m, const int *n, const int *kl, const int *ku,
+    double *Ab, const int *ldAb, int *ipiv,
+    double *Workl, const int *ldWorkl, double *Worku, const int *ldWorku,
+    int *info
+) {
+
+    if (*n <= MAX(CROSSOVER_ZGBTRF, 1)) {
+        // Unblocked
+        LAPACK(zgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[] = { 1., 0. };
+    const double MONE[] = { -1., 0. };
+    const int iONE[] = { 1 };
+
+    // Loop iterators
+    int i, j;
+
+    // Output upper band width
+    const int kv = *ku + *kl;
+
+    // Unskew A
+    const int ldA[] = { *ldAb - 1 };
+    double *const A = Ab + 2 * kv;
+
+    // Splitting
+    // (the left panel is capped at *kl so its pivoting stays within the band)
+    const int n1 = MIN(ZREC_SPLIT(*n), *kl);
+    const int n2 = *n - n1;
+    const int m1 = MIN(n1, *m);
+    const int m2 = *m - m1;
+    const int mn1 = MIN(m1, n1);
+    const int mn2 = MIN(m2, n2);
+
+    // Ab_L *
+    //      Ab_BR
+    double *const Ab_L = Ab;
+    double *const Ab_BR = Ab + 2 * *ldAb * n1;
+
+    // A_L A_R
+    double *const A_L = A;
+    double *const A_R = A + 2 * *ldA * n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    double *const A_TL = A;
+    double *const A_TR = A + 2 * *ldA * n1;
+    double *const A_BL = A + 2 * m1;
+    double *const A_BR = A + 2 * *ldA * n1 + 2 * m1;
+
+    // ipiv_T
+    // ipiv_B
+    int *const ipiv_T = ipiv;
+    int *const ipiv_B = ipiv + n1;
+
+    // Banded splitting
+    // (sub-blocks that fall outside the band are never touched)
+    const int n21 = MIN(n2, kv - n1);
+    const int n22 = MIN(n2 - n21, n1);
+    const int m21 = MIN(m2, *kl - m1);
+    const int m22 = MIN(m2 - m21, m1);
+
+    //     n1    n21    n22
+    // m    *    A_Rl   ARr
+    double *const A_Rl = A_R;
+    double *const A_Rr = A_R + 2 * *ldA * n21;
+
+    //     n1    n21    n22
+    // m1   *    A_TRl  A_TRr
+    // m21 A_BLt A_BRtl A_BRtr
+    // m22 A_BLb A_BRbl A_BRbr
+    double *const A_TRl = A_TR;
+    double *const A_TRr = A_TR + 2 * *ldA * n21;
+    double *const A_BLt = A_BL;
+    double *const A_BLb = A_BL + 2 * m21;
+    double *const A_BRtl = A_BR;
+    double *const A_BRtr = A_BR + 2 * *ldA * n21;
+    double *const A_BRbl = A_BR + 2 * m21;
+    double *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * m21;
+
+    // recursion(Ab_L, ipiv_T)
+    RELAPACK_zgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info);
+
+    // Workl = A_BLb
+    // (rows below the band are staged in Workl so swaps can reach them)
+    LAPACK(zlacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl);
+
+    // partially redo swaps in A_L
+    for (i = 0; i < mn1; i++) {
+        const int ip = ipiv_T[i] - 1;
+        if (ip != i) {
+            if (ip < *kl)
+                BLAS(zswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA);
+            else
+                BLAS(zswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl);
+        }
+    }
+
+    // apply pivots to A_Rl
+    LAPACK(zlaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE);
+
+    // apply pivots to A_Rr columnwise
+    // (zlaswp cannot be used: each column starts at a different band row)
+    for (j = 0; j < n22; j++) {
+        double *const A_Rrj = A_Rr + 2 * *ldA * j;
+        for (i = j; i < mn1; i++) {
+            const int ip = ipiv_T[i] - 1;
+            if (ip != i) {
+                const double tmpr = A_Rrj[2 * i];
+                const double tmpc = A_Rrj[2 * i + 1];
+                A_Rrj[2 * i] = A_Rrj[2 * ip];
+                A_Rrj[2 * i + 1] = A_Rrj[2 * ip + 1];
+                A_Rrj[2 * ip] = tmpr;
+                A_Rrj[2 * ip + 1] = tmpc;
+            }
+        }
+    }
+
+    // A_TRl = A_TL \ A_TRl
+    BLAS(ztrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
+    // Worku = A_TRr
+    LAPACK(zlacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku);
+    // Worku = A_TL \ Worku
+    BLAS(ztrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku);
+    // A_TRr = Worku
+    LAPACK(zlacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA);
+    // A_BRtl = A_BRtl - A_BLt * A_TRl
+    BLAS(zgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA);
+    // A_BRbl = A_BRbl - Workl * A_TRl
+    BLAS(zgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA);
+    // A_BRtr = A_BRtr - A_BLt * Worku
+    BLAS(zgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA);
+    // A_BRbr = A_BRbr - Workl * Worku
+    BLAS(zgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA);
+
+    // partially undo swaps in A_L
+    for (i = mn1 - 1; i >= 0; i--) {
+        const int ip = ipiv_T[i] - 1;
+        if (ip != i) {
+            if (ip < *kl)
+                BLAS(zswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA);
+            else
+                BLAS(zswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl);
+        }
+    }
+
+    // recursion(Ab_BR, ipiv_B)
+    RELAPACK_zgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
+    if (*info)
+        *info += n1;
+    // shift pivots
+    for (i = 0; i < mn2; i++)
+        ipiv_B[i] += n1;
+}
diff --git a/relapack/src/zgemmt.c b/relapack/src/zgemmt.c
new file mode 100644
index 000000000..aa5930238
--- /dev/null
+++ b/relapack/src/zgemmt.c
@@ -0,0 +1,167 @@
+#include "relapack.h"
+
+static void RELAPACK_zgemmt_rec(const char *, const char *, const char *,
+ const int *, const int *, const double *, const double *, const int *,
+ const double *, const int *, const double *, double *, const int *);
+
+static void RELAPACK_zgemmt_rec2(const char *, const char *, const char *,
+ const int *, const int *, const double *, const double *, const int *,
+ const double *, const int *, const double *, double *, const int *);
+
+
+/** ZGEMMT computes a matrix-matrix product with general matrices but updates
+ * only the upper or lower triangular part of the result matrix.
+ *
+ * This routine performs the same operation as the BLAS routine
+ * zgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC)
+ * but only updates the triangular part of C specified by uplo:
+ * If (*uplo == 'L'), only the lower triangular part of C is updated,
+ * otherwise the upper triangular part is updated.
+ * */
+void RELAPACK_zgemmt(
+    const char *uplo, const char *transA, const char *transB,
+    const int *n, const int *k,
+    const double *alpha, const double *A, const int *ldA,
+    const double *B, const int *ldB,
+    const double *beta, double *C, const int *ldC
+) {
+
+#if HAVE_XGEMMT
+    // The BLAS library provides zgemmt natively; just forward the call.
+    BLAS(zgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
+    return;
+#else
+
+    // Check arguments
+    const int lower = LAPACK(lsame)(uplo, "L");
+    const int upper = LAPACK(lsame)(uplo, "U");
+    const int notransA = LAPACK(lsame)(transA, "N");
+    const int tranA = LAPACK(lsame)(transA, "T");
+    const int ctransA = LAPACK(lsame)(transA, "C");
+    const int notransB = LAPACK(lsame)(transB, "N");
+    const int tranB = LAPACK(lsame)(transB, "T");
+    const int ctransB = LAPACK(lsame)(transB, "C");
+    // info follows the zgemmt argument positions (alpha/beta are never invalid)
+    int info = 0;
+    if (!lower && !upper)
+        info = 1;
+    else if (!tranA && !ctransA && !notransA)
+        info = 2;
+    else if (!tranB && !ctransB && !notransB)
+        info = 3;
+    else if (*n < 0)
+        info = 4;
+    else if (*k < 0)
+        info = 5;
+    else if (*ldA < MAX(1, notransA ? *n : *k))
+        info = 8;
+    else if (*ldB < MAX(1, notransB ? *k : *n))
+        info = 10;
+    else if (*ldC < MAX(1, *n))
+        info = 13;
+    if (info) {
+        LAPACK(xerbla)("ZGEMMT", &info);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleanuplo = lower ? 'L' : 'U';
+    const char cleantransA = notransA ? 'N' : (tranA ? 'T' : 'C');
+    const char cleantransB = notransB ? 'N' : (tranB ? 'T' : 'C');
+
+    // Recursive kernel
+    RELAPACK_zgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
+#endif
+}
+
+
+/** zgemmt's recursive compute kernel */
+static void RELAPACK_zgemmt_rec(
+    const char *uplo, const char *transA, const char *transB,
+    const int *n, const int *k,
+    const double *alpha, const double *A, const int *ldA,
+    const double *B, const int *ldB,
+    const double *beta, double *C, const int *ldC
+) {
+
+    // Below the crossover size, use the unblocked kernel
+    if (*n <= MAX(CROSSOVER_ZGEMMT, 1)) {
+        RELAPACK_zgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC);
+        return;
+    }
+
+    // Split C into an nTop x nTop and an nBot x nBot diagonal part
+    const int nTop = ZREC_SPLIT(*n);
+    const int nBot = *n - nTop;
+
+    // Partition A by rows (columns when transposed):
+    // A_T
+    // A_B
+    const double *const A_T = A;
+    const double *const A_B = A + 2 * ((*transA == 'N') ? nTop : *ldA * nTop);
+
+    // Partition B by columns (rows when transposed): B_L B_R
+    const double *const B_L = B;
+    const double *const B_R = B + 2 * ((*transB == 'N') ? *ldB * nTop : nTop);
+
+    // Quadrants of C:
+    // C_TL C_TR
+    // C_BL C_BR
+    double *const C_TL = C;
+    double *const C_TR = C + 2 * *ldC * nTop;
+    double *const C_BL = C + 2 * nTop;
+    double *const C_BR = C + 2 * *ldC * nTop + 2 * nTop;
+
+    // Triangular update of the top-left quadrant
+    RELAPACK_zgemmt_rec(uplo, transA, transB, &nTop, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC);
+
+    // Full GEMM update of the off-diagonal quadrant on the stored side:
+    // lower: C_BL = alpha A_B B_L + beta C_BL
+    // upper: C_TR = alpha A_T B_R + beta C_TR
+    if (*uplo == 'L')
+        BLAS(zgemm)(transA, transB, &nBot, &nTop, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC);
+    else
+        BLAS(zgemm)(transA, transB, &nTop, &nBot, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC);
+
+    // Triangular update of the bottom-right quadrant
+    RELAPACK_zgemmt_rec(uplo, transA, transB, &nBot, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC);
+}
+
+
+/** zgemmt's unblocked compute kernel */
+static void RELAPACK_zgemmt_rec2(
+    const char *uplo, const char *transA, const char *transB,
+    const int *n, const int *k,
+    const double *alpha, const double *A, const int *ldA,
+    const double *B, const int *ldB,
+    const double *beta, double *C, const int *ldC
+) {
+
+    // Vector strides: B is traversed along a row when transposed
+    const int incB = (*transB == 'N') ? 1 : *ldB;
+    const int incC = 1;
+
+    const int lower = (*uplo == 'L');
+
+    // Update C one column at a time with ZGEMV.
+    // For the lower triangle, column i covers rows i..n-1 (start at the
+    // diagonal); for the upper triangle it covers rows 0..i.
+    int i;
+    for (i = 0; i < *n; i++) {
+        // number of C entries touched in column i
+        const int len = lower ? *n - i : i + 1;
+
+        // matching row block of A (all of A's leading rows for upper)
+        const double *const A_blk = lower
+            ? A + 2 * ((*transA == 'N') ? i : *ldA * i)
+            : A;
+
+        // i-th column (or row) of B
+        const double *const B_i = B + 2 * ((*transB == 'N') ? *ldB * i : i);
+
+        // start of the updated part of column i of C
+        double *const C_i = C + 2 * *ldC * i + (lower ? 2 * i : 0);
+
+        if (*transA == 'N')
+            BLAS(zgemv)(transA, &len, k, alpha, A_blk, ldA, B_i, &incB, beta, C_i, &incC);
+        else
+            BLAS(zgemv)(transA, k, &len, alpha, A_blk, ldA, B_i, &incB, beta, C_i, &incC);
+    }
+}
diff --git a/relapack/src/zgetrf.c b/relapack/src/zgetrf.c
new file mode 100644
index 000000000..cf8921e1f
--- /dev/null
+++ b/relapack/src/zgetrf.c
@@ -0,0 +1,117 @@
+#include "relapack.h"
+
+static void RELAPACK_zgetrf_rec(const int *, const int *, double *,
+ const int *, int *, int *);
+
+
+/** ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
+ *
+ * This routine is functionally equivalent to LAPACK's zgetrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/dd/dd1/zgetrf_8f.html
+ * */
+void RELAPACK_zgetrf(
+    const int *m, const int *n,
+    double *A, const int *ldA, int *ipiv,
+    int *info
+) {
+
+    // Check arguments
+    *info = 0;
+    if (*m < 0)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*ldA < MAX(1, *m))
+        // LAPACK contract: LDA >= max(1,M) — A is an m-by-n matrix
+        *info = -4;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("ZGETRF", &minfo);
+        return;
+    }
+
+    // Factor the leading m-by-min(m,n) panel recursively
+    const int sn = MIN(*m, *n);
+
+    RELAPACK_zgetrf_rec(m, &sn, A, ldA, ipiv, info);
+
+    // Right remainder: columns m+1..n exist only when m < n
+    if (*m < *n) {
+        // Constants
+        const double ONE[] = { 1., 0. };
+        const int iONE[] = { 1 };
+
+        // Splitting
+        const int rn = *n - *m;
+
+        // A_L A_R
+        const double *const A_L = A;
+        double *const A_R = A + 2 * *ldA * *m;
+
+        // A_R = apply(ipiv, A_R)
+        LAPACK(zlaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE);
+        // A_R = A_L \ A_R (unit lower triangular solve with the L factor)
+        BLAS(ztrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA);
+    }
+}
+
+
+/** zgetrf's recursive compute kernel */
+static void RELAPACK_zgetrf_rec(
+    const int *m, const int *n,
+    double *A, const int *ldA, int *ipiv,
+    int *info
+) {
+
+    if (*n <= MAX(CROSSOVER_ZGETRF, 1)) {
+        // Unblocked
+        LAPACK(zgetf2)(m, n, A, ldA, ipiv, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[] = { 1., 0. };
+    const double MONE[] = { -1., 0. };
+    const int iONE[] = { 1 };  // was { 1. }: int array must use an int literal
+
+    // Splitting
+    const int n1 = ZREC_SPLIT(*n);
+    const int n2 = *n - n1;
+    const int m2 = *m - n1;
+
+    // A_L A_R
+    double *const A_L = A;
+    double *const A_R = A + 2 * *ldA * n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    double *const A_TL = A;
+    double *const A_TR = A + 2 * *ldA * n1;
+    double *const A_BL = A + 2 * n1;
+    double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
+
+    // ipiv_T
+    // ipiv_B
+    int *const ipiv_T = ipiv;
+    int *const ipiv_B = ipiv + n1;
+
+    // recursion(A_L, ipiv_T)
+    RELAPACK_zgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info);
+    // apply pivots to A_R
+    LAPACK(zlaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE);
+
+    // A_TR = A_TL \ A_TR
+    BLAS(ztrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
+    // A_BR = A_BR - A_BL * A_TR
+    BLAS(zgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA);
+
+    // recursion(A_BR, ipiv_B)
+    RELAPACK_zgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info);
+    if (*info)
+        *info += n1;
+    // apply pivots to A_BL
+    LAPACK(zlaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE);
+    // shift pivots to global (1-based) row indices
+    int i;
+    for (i = 0; i < n2; i++)
+        ipiv_B[i] += n1;
+}
diff --git a/relapack/src/zhegst.c b/relapack/src/zhegst.c
new file mode 100644
index 000000000..d0ece2148
--- /dev/null
+++ b/relapack/src/zhegst.c
@@ -0,0 +1,212 @@
+#include "relapack.h"
+#if XSYGST_ALLOW_MALLOC
+#include "stdlib.h"
+#endif
+
+static void RELAPACK_zhegst_rec(const int *, const char *, const int *,
+ double *, const int *, const double *, const int *,
+ double *, const int *, int *);
+
+
+/** ZHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form.
+ *
+ * This routine is functionally equivalent to LAPACK's zhegst.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/dc/d68/zhegst_8f.html
+ * */
+void RELAPACK_zhegst(
+    const int *itype, const char *uplo, const int *n,
+    double *A, const int *ldA, const double *B, const int *ldB,
+    int *info
+) {
+
+    // Check arguments
+    const int lower = LAPACK(lsame)(uplo, "L");
+    const int upper = LAPACK(lsame)(uplo, "U");
+    *info = 0;
+    if (*itype < 1 || *itype > 3)
+        *info = -1;
+    else if (!lower && !upper)
+        *info = -2;
+    else if (*n < 0)
+        *info = -3;
+    else if (*ldA < MAX(1, *n))
+        *info = -5;
+    else if (*ldB < MAX(1, *n))
+        *info = -7;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("ZHEGST", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleanuplo = lower ? 'L' : 'U';
+
+    // Allocate work space
+    // (the workspace is optional: with lWork == 0 the recursive kernel falls
+    // back to in-place updates, so a failed malloc only costs performance)
+    double *Work = NULL;
+    int lWork = 0;
+#if XSYGST_ALLOW_MALLOC
+    const int n1 = ZREC_SPLIT(*n);
+    lWork = n1 * (*n - n1);
+    Work = malloc(lWork * 2 * sizeof(double));
+    if (!Work)
+        lWork = 0;
+#endif
+
+    // recursive kernel
+    RELAPACK_zhegst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info);
+
+    // Free work space
+#if XSYGST_ALLOW_MALLOC
+    if (Work)
+        free(Work);
+#endif
+}
+
+
+/** zhegst's recursive compute kernel */
+static void RELAPACK_zhegst_rec(
+    const int *itype, const char *uplo, const int *n,
+    double *A, const int *ldA, const double *B, const int *ldB,
+    double *Work, const int *lWork, int *info
+) {
+
+    if (*n <= MAX(CROSSOVER_ZHEGST, 1)) {
+        // Unblocked
+        LAPACK(zhegs2)(itype, uplo, n, A, ldA, B, ldB, info);
+        return;
+    }
+
+    // Constants
+    const double ZERO[] = { 0., 0. };
+    const double ONE[] = { 1., 0. };
+    const double MONE[] = { -1., 0. };
+    const double HALF[] = { .5, 0. };
+    const double MHALF[] = { -.5, 0. };
+    const int iONE[] = { 1 };
+
+    // Loop iterator
+    int i;
+
+    // Splitting
+    const int n1 = ZREC_SPLIT(*n);
+    const int n2 = *n - n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    double *const A_TL = A;
+    double *const A_TR = A + 2 * *ldA * n1;
+    double *const A_BL = A + 2 * n1;
+    double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
+
+    // B_TL B_TR
+    // B_BL B_BR
+    const double *const B_TL = B;
+    const double *const B_TR = B + 2 * *ldB * n1;
+    const double *const B_BL = B + 2 * n1;
+    const double *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
+
+    // recursion(A_TL, B_TL)
+    RELAPACK_zhegst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info);
+
+    // In each branch: with enough workspace the -1/2 (or +1/2) hemm update is
+    // computed once into Work (T) and applied twice; otherwise it is computed
+    // twice directly into A (same result, more flops).
+    if (*itype == 1)
+        if (*uplo == 'L') {
+            // A_BL = A_BL / B_TL'
+            BLAS(ztrsm)("R", "L", "C", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
+            if (*lWork >= n2 * n1) {
+                // T = -1/2 * B_BL * A_TL
+                BLAS(zhemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2);
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
+            } else
+                // A_BL = A_BL - 1/2 B_BL * A_TL
+                BLAS(zhemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL'
+            BLAS(zher2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA);
+            if (*lWork >= n2 * n1)
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
+            else
+                // A_BL = A_BL - 1/2 B_BL * A_TL
+                BLAS(zhemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_BL = B_BR \ A_BL
+            BLAS(ztrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
+        } else {
+            // A_TR = B_TL' \ A_TR
+            BLAS(ztrsm)("L", "U", "C", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
+            if (*lWork >= n2 * n1) {
+                // T = -1/2 * A_TL * B_TR
+                BLAS(zhemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1);
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
+            } else
+                // A_TR = A_TR - 1/2 A_TL * B_TR
+                BLAS(zhemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR
+            BLAS(zher2k)("U", "C", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA);
+            if (*lWork >= n2 * n1)
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
+            else
+                // A_TR = A_TR - 1/2 A_TL * B_TR
+                BLAS(zhemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TR = A_TR / B_BR
+            BLAS(ztrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
+        }
+    else
+        if (*uplo == 'L') {
+            // A_BL = A_BL * B_TL
+            BLAS(ztrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
+            if (*lWork >= n2 * n1) {
+                // T = 1/2 * A_BR * B_BL
+                BLAS(zhemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2);
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
+            } else
+                // A_BL = A_BL + 1/2 A_BR * B_BL
+                BLAS(zhemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL
+            BLAS(zher2k)("L", "C", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA);
+            if (*lWork >= n2 * n1)
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
+            else
+                // A_BL = A_BL + 1/2 A_BR * B_BL
+                BLAS(zhemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_BL = B_BR * A_BL
+            BLAS(ztrmm)("L", "L", "C", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
+        } else {
+            // A_TR = B_TL * A_TR
+            BLAS(ztrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
+            if (*lWork >= n2 * n1) {
+                // T = 1/2 * B_TR * A_BR
+                BLAS(zhemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1);
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
+            } else
+                // A_TR = A_TR + 1/2 B_TR A_BR
+                BLAS(zhemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR'
+            BLAS(zher2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA);
+            if (*lWork >= n2 * n1)
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
+            else
+                // A_TR = A_TR + 1/2 B_TR * A_BR
+                BLAS(zhemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TR = A_TR * B_BR
+            BLAS(ztrmm)("R", "U", "C", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
+        }
+
+    // recursion(A_BR, B_BR)
+    RELAPACK_zhegst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info);
+}
diff --git a/relapack/src/zhetrf.c b/relapack/src/zhetrf.c
new file mode 100644
index 000000000..ef4e1f5d5
--- /dev/null
+++ b/relapack/src/zhetrf.c
@@ -0,0 +1,236 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include <stdlib.h>
+#endif
+
+static void RELAPACK_zhetrf_rec(const char *, const int *, const int *, int *,
+ double *, const int *, int *, double *, const int *, int *);
+
+
+/** ZHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's zhetrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d6/dd3/zhetrf_8f.html
+ * */
+void RELAPACK_zhetrf(
+    const char *uplo, const int *n,
+    double *A, const int *ldA, int *ipiv,
+    double *Work, const int *lWork, int *info
+) {
+
+    // Required work size
+    const int cleanlWork = *n * (*n / 2);
+    int minlWork = cleanlWork;
+#if XSYTRF_ALLOW_MALLOC
+    // with malloc allowed, any caller-provided size is acceptable
+    minlWork = 1;
+#endif
+
+    // Check arguments
+    const int lower = LAPACK(lsame)(uplo, "L");
+    const int upper = LAPACK(lsame)(uplo, "U");
+    *info = 0;
+    if (!lower && !upper)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*ldA < MAX(1, *n))
+        *info = -4;
+    else if (*lWork < minlWork && *lWork != -1)
+        *info = -7;
+    else if (*lWork == -1) {
+        // Work size query: report the required size in Work[0], LAPACK-style
+        *Work = cleanlWork;
+        return;
+    }
+
+    // Ensure Work size
+    // (if the caller's buffer is too small, allocate a full-size one instead)
+    double *cleanWork = Work;
+#if XSYTRF_ALLOW_MALLOC
+    if (!*info && *lWork < cleanlWork) {
+        cleanWork = malloc(cleanlWork * 2 * sizeof(double));
+        if (!cleanWork)
+            *info = -7;
+    }
+#endif
+
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("ZHETRF", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleanuplo = lower ? 'L' : 'U';
+
+    // Dummy argument
+    int nout;
+
+    // Recursive kernel
+    RELAPACK_zhetrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
+
+#if XSYTRF_ALLOW_MALLOC
+    if (cleanWork != Work)
+        free(cleanWork);
+#endif
+}
+
+
+/** zhetrf's recursive compute kernel */
+static void RELAPACK_zhetrf_rec(
+    const char *uplo, const int *n_full, const int *n, int *n_out,
+    double *A, const int *ldA, int *ipiv,
+    double *Work, const int *ldWork, int *info
+) {
+
+    // top recursion level?
+    const int top = *n_full == *n;
+
+    if (*n <= MAX(CROSSOVER_ZHETRF, 3)) {
+        // Unblocked
+        // (RELAPACK_zhetrf_rec2 is presumably declared in relapack.h — no
+        // prototype is visible in this file; verify)
+        if (top) {
+            LAPACK(zhetf2)(uplo, n, A, ldA, ipiv, info);
+            *n_out = *n;
+        } else
+            RELAPACK_zhetrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
+        return;
+    }
+
+    int info1, info2;
+
+    // Constants
+    const double ONE[] = { 1., 0. };
+    const double MONE[] = { -1., 0. };
+    const int iONE[] = { 1 };
+
+    const int n_rest = *n_full - *n;
+
+    if (*uplo == 'L') {
+        // Splitting (setup)
+        int n1 = ZREC_SPLIT(*n);
+        int n2 = *n - n1;
+
+        // Work_L *
+        double *const Work_L = Work;
+
+        // recursion(A_L)
+        // (n1_out may be n1 - 1 if a 2x2 pivot would straddle the split)
+        int n1_out;
+        RELAPACK_zhetrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
+        n1 = n1_out;
+
+        // Splitting (continued)
+        n2 = *n - n1;
+        const int n_full2 = *n_full - n1;
+
+        // *      *
+        // A_BL   A_BR
+        // A_BL_B A_BR_B
+        double *const A_BL = A + 2 * n1;
+        double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
+        double *const A_BL_B = A + 2 * *n;
+        double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
+
+        // *       *
+        // Work_BL Work_BR
+        // *       *
+        // (top recursion level: use Work as Work_BR)
+        double *const Work_BL = Work + 2 * n1;
+        double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
+        const int ldWork_BR = top ? n2 : *ldWork;
+
+        // ipiv_T
+        // ipiv_B
+        int *const ipiv_B = ipiv + n1;
+
+        // A_BR = A_BR - A_BL Work_BL'
+        // NOTE(review): transB is "T", not "C", although the matrix is
+        // Hermitian — presumably Work rows are stored pre-conjugated by the
+        // rec2 kernel; verify against zhetrf_rec2.
+        RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
+        BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
+
+        // recursion(A_BR)
+        int n2_out;
+        RELAPACK_zhetrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
+
+        if (n2_out != n2) {
+            // undo 1 column of updates
+            const int n_restp1 = n_rest + 1;
+
+            // last column of A_BR
+            double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
+
+            // last row of A_BL
+            double *const A_BL_b = A_BL + 2 * n2_out;
+
+            // last row of Work_BL
+            double *const Work_BL_b = Work_BL + 2 * n2_out;
+
+            // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
+            BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
+        }
+        n2 = n2_out;
+
+        // shift pivots
+        // (negative entries encode 2x2 pivot blocks and keep their sign)
+        int i;
+        for (i = 0; i < n2; i++)
+            if (ipiv_B[i] > 0)
+                ipiv_B[i] += n1;
+            else
+                ipiv_B[i] -= n1;
+
+        *info = info1 || info2;
+        *n_out = n1 + n2;
+    } else {
+        // Splitting (setup)
+        int n2 = ZREC_SPLIT(*n);
+        int n1 = *n - n2;
+
+        // * Work_R
+        // (top recursion level: use Work as Work_R)
+        double *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
+
+        // recursion(A_R)
+        int n2_out;
+        RELAPACK_zhetrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
+        const int n2_diff = n2 - n2_out;
+        n2 = n2_out;
+
+        // Splitting (continued)
+        n1 = *n - n2;
+        const int n_full1 = *n_full - n2;
+
+        // * A_TL_T A_TR_T
+        // * A_TL   A_TR
+        // * *      *
+        double *const A_TL_T = A + 2 * *ldA * n_rest;
+        double *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
+        double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
+        double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
+
+        // Work_L *
+        // *      Work_TR
+        // *      *
+        // (top recursion level: Work_R was Work)
+        double *const Work_L = Work;
+        double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
+        const int ldWork_L = top ? n1 : *ldWork;
+
+        // A_TL = A_TL - A_TR Work_TR'
+        RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
+        BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
+
+        // recursion(A_TL)
+        int n1_out;
+        RELAPACK_zhetrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
+
+        if (n1_out != n1) {
+            // undo 1 column of updates
+            const int n_restp1 = n_rest + 1;
+
+            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
+            BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
+        }
+        n1 = n1_out;
+
+        *info = info2 || info1;
+        *n_out = n1 + n2;
+    }
+}
diff --git a/relapack/src/zhetrf_rec2.c b/relapack/src/zhetrf_rec2.c
new file mode 100644
index 000000000..867ea64e1
--- /dev/null
+++ b/relapack/src/zhetrf_rec2.c
@@ -0,0 +1,524 @@
+/* -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static doublecomplex c_b1 = {1.,0.};
+static int c__1 = 1;
+
+/** ZHETRF_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the Bunch-Kaufman diagonal pivoting method
+ *
+ * This routine is a minor modification of LAPACK's zlahef.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
+ * */
+/* Subroutine */ void RELAPACK_zhetrf_rec2(char *uplo, int *n, int *
+ nb, int *kb, doublecomplex *a, int *lda, int *ipiv,
+ doublecomplex *w, int *ldw, int *info, ftnlen uplo_len)
+{
+ /* System generated locals */
+ int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
+ double d__1, d__2, d__3, d__4;
+ doublecomplex z__1, z__2, z__3, z__4;
+
+ /* Builtin functions */
+ double sqrt(double), d_imag(doublecomplex *);
+ void d_cnjg(doublecomplex *, doublecomplex *), z_div(doublecomplex *,
+ doublecomplex *, doublecomplex *);
+
+ /* Local variables */
+ static int j, k;
+ static double t, r1;
+ static doublecomplex d11, d21, d22;
+ static int jj, kk, jp, kp, kw, kkw, imax, jmax;
+ static double alpha;
+ extern logical lsame_(char *, char *, ftnlen, ftnlen);
+ static int kstep;
+ extern /* Subroutine */ int zgemv_(char *, int *, int *,
+ doublecomplex *, doublecomplex *, int *, doublecomplex *,
+ int *, doublecomplex *, doublecomplex *, int *, ftnlen),
+ zcopy_(int *, doublecomplex *, int *, doublecomplex *,
+ int *), zswap_(int *, doublecomplex *, int *,
+ doublecomplex *, int *);
+ static double absakk;
+ extern /* Subroutine */ int zdscal_(int *, double *,
+ doublecomplex *, int *);
+ static double colmax;
+ extern /* Subroutine */ int zlacgv_(int *, doublecomplex *, int *)
+ ;
+ extern int izamax_(int *, doublecomplex *, int *);
+ static double rowmax;
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+ w_dim1 = *ldw;
+ w_offset = 1 + w_dim1;
+ w -= w_offset;
+
+ /* Function Body */
+ *info = 0;
+/* alpha = (1 + sqrt(17)) / 8: the classic Bunch-Kaufman pivot threshold */
+ alpha = (sqrt(17.) + 1.) / 8.;
+/* Upper case: factorize the trailing columns k = n, n-1, ...; the */
+/* factored panel is mirrored in the last columns of W, and kb returns */
+/* the number of columns factorized (see *kb = *n - k below L30). */
+ if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+ k = *n;
+L10:
+/* kw: column of W corresponding to column k of A */
+ kw = *nb + k - *n;
+ if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
+ goto L30;
+ }
+/* Copy column k of A into W and apply the updates from the columns */
+/* already factored; the diagonal entry is forced real (Hermitian). */
+ kstep = 1;
+ i__1 = k - 1;
+ zcopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
+ i__1 = k + kw * w_dim1;
+ i__2 = k + k * a_dim1;
+ d__1 = a[i__2].r;
+ w[i__1].r = d__1, w[i__1].i = 0.;
+ if (k < *n) {
+ i__1 = *n - k;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1],
+ lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
+ w_dim1 + 1], &c__1, (ftnlen)12);
+ i__1 = k + kw * w_dim1;
+ i__2 = k + kw * w_dim1;
+ d__1 = w[i__2].r;
+ w[i__1].r = d__1, w[i__1].i = 0.;
+ }
+ i__1 = k + kw * w_dim1;
+ absakk = (d__1 = w[i__1].r, abs(d__1));
+/* imax: row of the largest off-diagonal entry in the updated column k */
+ if (k > 1) {
+ i__1 = k - 1;
+ imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+ i__1 = imax + kw * w_dim1;
+ colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
+ kw * w_dim1]), abs(d__2));
+ } else {
+ colmax = 0.;
+ }
+/* Column k is exactly zero: record the first such k in info and */
+/* keep a trivial 1x1 pivot. */
+ if (max(absakk,colmax) == 0.) {
+ if (*info == 0) {
+ *info = k;
+ }
+ kp = k;
+ i__1 = k + k * a_dim1;
+ i__2 = k + k * a_dim1;
+ d__1 = a[i__2].r;
+ a[i__1].r = d__1, a[i__1].i = 0.;
+ } else {
+/* Bunch-Kaufman pivot selection: 1x1 diagonal pivot, 1x1 with row */
+/* interchange, or 2x2 pivot (kstep = 2). */
+ if (absakk >= alpha * colmax) {
+ kp = k;
+ } else {
+ i__1 = imax - 1;
+ zcopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
+ w_dim1 + 1], &c__1);
+ i__1 = imax + (kw - 1) * w_dim1;
+ i__2 = imax + imax * a_dim1;
+ d__1 = a[i__2].r;
+ w[i__1].r = d__1, w[i__1].i = 0.;
+ i__1 = k - imax;
+ zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
+ 1 + (kw - 1) * w_dim1], &c__1);
+ i__1 = k - imax;
+ zlacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1);
+ if (k < *n) {
+ i__1 = *n - k;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) *
+ a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
+ ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
+ ftnlen)12);
+ i__1 = imax + (kw - 1) * w_dim1;
+ i__2 = imax + (kw - 1) * w_dim1;
+ d__1 = w[i__2].r;
+ w[i__1].r = d__1, w[i__1].i = 0.;
+ }
+ i__1 = k - imax;
+ jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
+ &c__1);
+ i__1 = jmax + (kw - 1) * w_dim1;
+ rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
+ jmax + (kw - 1) * w_dim1]), abs(d__2));
+ if (imax > 1) {
+ i__1 = imax - 1;
+ jmax = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
+/* Computing MAX */
+ i__1 = jmax + (kw - 1) * w_dim1;
+ d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + (
+ d__2 = d_imag(&w[jmax + (kw - 1) * w_dim1]), abs(
+ d__2));
+ rowmax = max(d__3,d__4);
+ }
+ if (absakk >= alpha * colmax * (colmax / rowmax)) {
+ kp = k;
+ } else /* if(complicated condition) */ {
+ i__1 = imax + (kw - 1) * w_dim1;
+ if ((d__1 = w[i__1].r, abs(d__1)) >= alpha * rowmax) {
+ kp = imax;
+ zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
+ w_dim1 + 1], &c__1);
+ } else {
+ kp = imax;
+ kstep = 2;
+ }
+ }
+ }
+ kk = k - kstep + 1;
+ kkw = *nb + kk - *n;
+/* Interchange rows/columns kk and kp in A and W, keeping the */
+/* diagonal entries real. */
+ if (kp != kk) {
+ i__1 = kp + kp * a_dim1;
+ i__2 = kk + kk * a_dim1;
+ d__1 = a[i__2].r;
+ a[i__1].r = d__1, a[i__1].i = 0.;
+ i__1 = kk - 1 - kp;
+ zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
+ 1) * a_dim1], lda);
+ i__1 = kk - 1 - kp;
+ zlacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda);
+ if (kp > 1) {
+ i__1 = kp - 1;
+ zcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
+ + 1], &c__1);
+ }
+ if (k < *n) {
+ i__1 = *n - k;
+ zswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
+ + 1) * a_dim1], lda);
+ }
+ i__1 = *n - kk + 1;
+ zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
+ w_dim1], ldw);
+ }
+/* Store the factored column(s) back into A: 1x1 pivot (scale by the */
+/* real reciprocal) or 2x2 diagonal block. */
+ if (kstep == 1) {
+ zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
+ c__1);
+ if (k > 1) {
+ i__1 = k + k * a_dim1;
+ r1 = 1. / a[i__1].r;
+ i__1 = k - 1;
+ zdscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
+ i__1 = k - 1;
+ zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+ }
+ } else {
+ if (k > 2) {
+ i__1 = k - 1 + kw * w_dim1;
+ d21.r = w[i__1].r, d21.i = w[i__1].i;
+ d_cnjg(&z__2, &d21);
+ z_div(&z__1, &w[k + kw * w_dim1], &z__2);
+ d11.r = z__1.r, d11.i = z__1.i;
+ z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
+ d22.r = z__1.r, d22.i = z__1.i;
+ z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r *
+ d22.i + d11.i * d22.r;
+ t = 1. / (z__1.r - 1.);
+ z__2.r = t, z__2.i = 0.;
+ z_div(&z__1, &z__2, &d21);
+ d21.r = z__1.r, d21.i = z__1.i;
+ i__1 = k - 2;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j + (k - 1) * a_dim1;
+ i__3 = j + (kw - 1) * w_dim1;
+ z__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
+ z__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
+ .r;
+ i__4 = j + kw * w_dim1;
+ z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4]
+ .i;
+ z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i =
+ d21.r * z__2.i + d21.i * z__2.r;
+ a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+ i__2 = j + k * a_dim1;
+ d_cnjg(&z__2, &d21);
+ i__3 = j + kw * w_dim1;
+ z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
+ z__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
+ .r;
+ i__4 = j + (kw - 1) * w_dim1;
+ z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
+ .i;
+ z__1.r = z__2.r * z__3.r - z__2.i * z__3.i, z__1.i =
+ z__2.r * z__3.i + z__2.i * z__3.r;
+ a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+/* L20: */
+ }
+ }
+ i__1 = k - 1 + (k - 1) * a_dim1;
+ i__2 = k - 1 + (kw - 1) * w_dim1;
+ a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+ i__1 = k - 1 + k * a_dim1;
+ i__2 = k - 1 + kw * w_dim1;
+ a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+ i__1 = k + k * a_dim1;
+ i__2 = k + kw * w_dim1;
+ a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+ i__1 = k - 1;
+ zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+ i__1 = k - 2;
+ zlacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
+ }
+ }
+/* Record the pivot(s); negative entries mark the two columns of a */
+/* 2x2 pivot block. */
+ if (kstep == 1) {
+ ipiv[k] = kp;
+ } else {
+ ipiv[k] = -kp;
+ ipiv[k - 1] = -kp;
+ }
+ k -= kstep;
+ goto L10;
+L30:
+/* Apply the accumulated row interchanges to the unfactored columns */
+ j = k + 1;
+L60:
+ jj = j;
+ jp = ipiv[j];
+ if (jp < 0) {
+ jp = -jp;
+ ++j;
+ }
+ ++j;
+ if (jp != jj && j <= *n) {
+ i__1 = *n - j + 1;
+ zswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
+ }
+ if (j < *n) {
+ goto L60;
+ }
+ *kb = *n - k;
+ } else {
+/* Lower case: factorize the leading columns, k counts up from 1 */
+ k = 1;
+L70:
+ if ((k >= *nb && *nb < *n) || k > *n) {
+ goto L90;
+ }
+ kstep = 1;
+ i__1 = k + k * w_dim1;
+ i__2 = k + k * a_dim1;
+ d__1 = a[i__2].r;
+ w[i__1].r = d__1, w[i__1].i = 0.;
+ if (k < *n) {
+ i__1 = *n - k;
+ zcopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k *
+ w_dim1], &c__1);
+ }
+ i__1 = *n - k + 1;
+ i__2 = k - 1;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, &w[k
+ + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12);
+ i__1 = k + k * w_dim1;
+ i__2 = k + k * w_dim1;
+ d__1 = w[i__2].r;
+ w[i__1].r = d__1, w[i__1].i = 0.;
+ i__1 = k + k * w_dim1;
+ absakk = (d__1 = w[i__1].r, abs(d__1));
+ if (k < *n) {
+ i__1 = *n - k;
+ imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
+ i__1 = imax + k * w_dim1;
+ colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
+ k * w_dim1]), abs(d__2));
+ } else {
+ colmax = 0.;
+ }
+ if (max(absakk,colmax) == 0.) {
+ if (*info == 0) {
+ *info = k;
+ }
+ kp = k;
+ i__1 = k + k * a_dim1;
+ i__2 = k + k * a_dim1;
+ d__1 = a[i__2].r;
+ a[i__1].r = d__1, a[i__1].i = 0.;
+ } else {
+ if (absakk >= alpha * colmax) {
+ kp = k;
+ } else {
+ i__1 = imax - k;
+ zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
+ w_dim1], &c__1);
+ i__1 = imax - k;
+ zlacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1);
+ i__1 = imax + (k + 1) * w_dim1;
+ i__2 = imax + imax * a_dim1;
+ d__1 = a[i__2].r;
+ w[i__1].r = d__1, w[i__1].i = 0.;
+ if (imax < *n) {
+ i__1 = *n - imax;
+ zcopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[
+ imax + 1 + (k + 1) * w_dim1], &c__1);
+ }
+ i__1 = *n - k + 1;
+ i__2 = k - 1;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1],
+ lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) *
+ w_dim1], &c__1, (ftnlen)12);
+ i__1 = imax + (k + 1) * w_dim1;
+ i__2 = imax + (k + 1) * w_dim1;
+ d__1 = w[i__2].r;
+ w[i__1].r = d__1, w[i__1].i = 0.;
+ i__1 = imax - k;
+ jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
+ ;
+ i__1 = jmax + (k + 1) * w_dim1;
+ rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
+ jmax + (k + 1) * w_dim1]), abs(d__2));
+ if (imax < *n) {
+ i__1 = *n - imax;
+ jmax = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) *
+ w_dim1], &c__1);
+/* Computing MAX */
+ i__1 = jmax + (k + 1) * w_dim1;
+ d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + (
+ d__2 = d_imag(&w[jmax + (k + 1) * w_dim1]), abs(
+ d__2));
+ rowmax = max(d__3,d__4);
+ }
+ if (absakk >= alpha * colmax * (colmax / rowmax)) {
+ kp = k;
+ } else /* if(complicated condition) */ {
+ i__1 = imax + (k + 1) * w_dim1;
+ if ((d__1 = w[i__1].r, abs(d__1)) >= alpha * rowmax) {
+ kp = imax;
+ i__1 = *n - k + 1;
+ zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k +
+ k * w_dim1], &c__1);
+ } else {
+ kp = imax;
+ kstep = 2;
+ }
+ }
+ }
+ kk = k + kstep - 1;
+ if (kp != kk) {
+ i__1 = kp + kp * a_dim1;
+ i__2 = kk + kk * a_dim1;
+ d__1 = a[i__2].r;
+ a[i__1].r = d__1, a[i__1].i = 0.;
+ i__1 = kp - kk - 1;
+ zcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
+ 1) * a_dim1], lda);
+ i__1 = kp - kk - 1;
+ zlacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda);
+ if (kp < *n) {
+ i__1 = *n - kp;
+ zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
+ + kp * a_dim1], &c__1);
+ }
+ if (k > 1) {
+ i__1 = k - 1;
+ zswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
+ }
+ zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
+ }
+ if (kstep == 1) {
+ i__1 = *n - k + 1;
+ zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
+ c__1);
+ if (k < *n) {
+ i__1 = k + k * a_dim1;
+ r1 = 1. / a[i__1].r;
+ i__1 = *n - k;
+ zdscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
+ i__1 = *n - k;
+ zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
+ }
+ } else {
+ if (k < *n - 1) {
+ i__1 = k + 1 + k * w_dim1;
+ d21.r = w[i__1].r, d21.i = w[i__1].i;
+ z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
+ d11.r = z__1.r, d11.i = z__1.i;
+ d_cnjg(&z__2, &d21);
+ z_div(&z__1, &w[k + k * w_dim1], &z__2);
+ d22.r = z__1.r, d22.i = z__1.i;
+ z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r *
+ d22.i + d11.i * d22.r;
+ t = 1. / (z__1.r - 1.);
+ z__2.r = t, z__2.i = 0.;
+ z_div(&z__1, &z__2, &d21);
+ d21.r = z__1.r, d21.i = z__1.i;
+ i__1 = *n;
+ for (j = k + 2; j <= i__1; ++j) {
+ i__2 = j + k * a_dim1;
+ d_cnjg(&z__2, &d21);
+ i__3 = j + k * w_dim1;
+ z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
+ z__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
+ .r;
+ i__4 = j + (k + 1) * w_dim1;
+ z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
+ .i;
+ z__1.r = z__2.r * z__3.r - z__2.i * z__3.i, z__1.i =
+ z__2.r * z__3.i + z__2.i * z__3.r;
+ a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+ i__2 = j + (k + 1) * a_dim1;
+ i__3 = j + (k + 1) * w_dim1;
+ z__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
+ z__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
+ .r;
+ i__4 = j + k * w_dim1;
+ z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4]
+ .i;
+ z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i =
+ d21.r * z__2.i + d21.i * z__2.r;
+ a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+/* L80: */
+ }
+ }
+ i__1 = k + k * a_dim1;
+ i__2 = k + k * w_dim1;
+ a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+ i__1 = k + 1 + k * a_dim1;
+ i__2 = k + 1 + k * w_dim1;
+ a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+ i__1 = k + 1 + (k + 1) * a_dim1;
+ i__2 = k + 1 + (k + 1) * w_dim1;
+ a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+ i__1 = *n - k;
+ zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
+ i__1 = *n - k - 1;
+ zlacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1);
+ }
+ }
+ if (kstep == 1) {
+ ipiv[k] = kp;
+ } else {
+ ipiv[k] = -kp;
+ ipiv[k + 1] = -kp;
+ }
+ k += kstep;
+ goto L70;
+L90:
+/* Apply the accumulated row interchanges (lower case) */
+ j = k - 1;
+L120:
+ jj = j;
+ jp = ipiv[j];
+ if (jp < 0) {
+ jp = -jp;
+ --j;
+ }
+ --j;
+ if (jp != jj && j >= 1) {
+ zswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
+ }
+ if (j > 1) {
+ goto L120;
+ }
+ *kb = k - 1;
+ }
+ return;
+}
diff --git a/relapack/src/zhetrf_rook.c b/relapack/src/zhetrf_rook.c
new file mode 100644
index 000000000..15ceaeae7
--- /dev/null
+++ b/relapack/src/zhetrf_rook.c
@@ -0,0 +1,236 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include <stdlib.h>
+#endif
+
+static void RELAPACK_zhetrf_rook_rec(const char *, const int *, const int *, int *,
+ double *, const int *, int *, double *, const int *, int *);
+
+
+/** ZHETRF_ROOK computes the factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's zhetrf_rook.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d6/d6f/zhetrf__rook_8f.html
+ * */
+void RELAPACK_zhetrf_rook(
+ const char *uplo, const int *n,
+ double *A, const int *ldA, int *ipiv,
+ double *Work, const int *lWork, int *info
+) {
+
+ // Required work size
+ // (workspace holds complex entries as consecutive double pairs,
+ // hence the "* 2" in the malloc fallback below)
+ const int cleanlWork = *n * (*n / 2);
+ int minlWork = cleanlWork;
+#if XSYTRF_ALLOW_MALLOC
+ // with malloc allowed, any positive user workspace is acceptable
+ minlWork = 1;
+#endif
+
+ // Check arguments
+ const int lower = LAPACK(lsame)(uplo, "L");
+ const int upper = LAPACK(lsame)(uplo, "U");
+ *info = 0;
+ if (!lower && !upper)
+ *info = -1;
+ else if (*n < 0)
+ *info = -2;
+ else if (*ldA < MAX(1, *n))
+ *info = -4;
+ else if (*lWork < minlWork && *lWork != -1)
+ *info = -7;
+ else if (*lWork == -1) {
+ // Work size query: return the required size in Work[0]
+ *Work = cleanlWork;
+ return;
+ }
+
+ // Ensure Work size
+ double *cleanWork = Work;
+#if XSYTRF_ALLOW_MALLOC
+ if (!*info && *lWork < cleanlWork) {
+ cleanWork = malloc(cleanlWork * 2 * sizeof(double));
+ if (!cleanWork)
+ // report as if the user-supplied workspace were too small
+ *info = -7;
+ }
+#endif
+
+ if (*info) {
+ const int minfo = -*info;
+ // NOTE(review): error is reported under the name "ZHETRF",
+ // not "ZHETRF_ROOK" -- confirm this is intentional
+ LAPACK(xerbla)("ZHETRF", &minfo);
+ return;
+ }
+
+ // Clean char * arguments
+ const char cleanuplo = lower ? 'L' : 'U';
+
+ // Dummy argument
+ int nout;
+
+ // Recursive kernel
+ RELAPACK_zhetrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
+
+#if XSYTRF_ALLOW_MALLOC
+ if (cleanWork != Work)
+ free(cleanWork);
+#endif
+}
+
+
+/** zhetrf_rook's recursive compute kernel.
+ *
+ * Factorizes *n columns of an n_full x n_full Hermitian matrix
+ * (complex entries stored as pairs of doubles, hence the "2 *"
+ * pointer strides). *n_out returns how many columns were actually
+ * factorized; when the recursion falls one column short (a 2x2 pivot
+ * straddling the split), one column of updates is undone below. */
+static void RELAPACK_zhetrf_rook_rec(
+ const char *uplo, const int *n_full, const int *n, int *n_out,
+ double *A, const int *ldA, int *ipiv,
+ double *Work, const int *ldWork, int *info
+) {
+
+ // top recursion level?
+ const int top = *n_full == *n;
+
+ if (*n <= MAX(CROSSOVER_ZHETRF_ROOK, 3)) {
+ // Unblocked
+ if (top) {
+ // NOTE(review): falls back to LAPACK's zhetf2 (plain
+ // Bunch-Kaufman), not a rook variant -- confirm the pivoting
+ // convention matches what callers of zhetrf_rook expect
+ LAPACK(zhetf2)(uplo, n, A, ldA, ipiv, info);
+ *n_out = *n;
+ } else
+ RELAPACK_zhetrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
+ return;
+ }
+
+ int info1, info2;
+
+ // Constants
+ const double ONE[] = { 1., 0. };
+ const double MONE[] = { -1., 0. };
+ const int iONE[] = { 1 };
+
+ // columns of the full matrix not covered by this call
+ const int n_rest = *n_full - *n;
+
+ if (*uplo == 'L') {
+ // Splitting (setup)
+ int n1 = ZREC_SPLIT(*n);
+ int n2 = *n - n1;
+
+ // Work_L *
+ double *const Work_L = Work;
+
+ // recursion(A_L)
+ int n1_out;
+ RELAPACK_zhetrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
+ n1 = n1_out;
+
+ // Splitting (continued)
+ n2 = *n - n1;
+ const int n_full2 = *n_full - n1;
+
+ // * *
+ // A_BL A_BR
+ // A_BL_B A_BR_B
+ double *const A_BL = A + 2 * n1;
+ double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
+ double *const A_BL_B = A + 2 * *n;
+ double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
+
+ // * *
+ // Work_BL Work_BR
+ // * *
+ // (top recursion level: use Work as Work_BR)
+ double *const Work_BL = Work + 2 * n1;
+ double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
+ const int ldWork_BR = top ? n2 : *ldWork;
+
+ // ipiv_T
+ // ipiv_B
+ int *const ipiv_B = ipiv + n1;
+
+ // A_BR = A_BR - A_BL Work_BL'
+ RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
+ BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
+
+ // recursion(A_BR)
+ int n2_out;
+ RELAPACK_zhetrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
+
+ if (n2_out != n2) {
+ // undo 1 column of updates
+ const int n_restp1 = n_rest + 1;
+
+ // last column of A_BR
+ double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
+
+ // last row of A_BL
+ double *const A_BL_b = A_BL + 2 * n2_out;
+
+ // last row of Work_BL
+ double *const Work_BL_b = Work_BL + 2 * n2_out;
+
+ // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
+ BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
+ }
+ n2 = n2_out;
+
+ // shift pivots
+ // (ipiv_B entries are relative to A_BR; make them global. Negative
+ // entries mark 2x2 pivot blocks, so the shift is applied signed.)
+ int i;
+ for (i = 0; i < n2; i++)
+ if (ipiv_B[i] > 0)
+ ipiv_B[i] += n1;
+ else
+ ipiv_B[i] -= n1;
+
+ // NOTE(review): info collapses to a zero/nonzero flag; the
+ // LAPACK-style index of the first zero pivot is not propagated
+ *info = info1 || info2;
+ *n_out = n1 + n2;
+ } else {
+ // Splitting (setup)
+ int n2 = ZREC_SPLIT(*n);
+ int n1 = *n - n2;
+
+ // * Work_R
+ // (top recursion level: use Work as Work_R)
+ double *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
+
+ // recursion(A_R)
+ int n2_out;
+ RELAPACK_zhetrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
+ const int n2_diff = n2 - n2_out;
+ n2 = n2_out;
+
+ // Splitting (continued)
+ n1 = *n - n2;
+ const int n_full1 = *n_full - n2;
+
+ // * A_TL_T A_TR_T
+ // * A_TL A_TR
+ // * * *
+ double *const A_TL_T = A + 2 * *ldA * n_rest;
+ double *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
+ double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
+ double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
+
+ // Work_L *
+ // * Work_TR
+ // * *
+ // (top recursion level: Work_R was Work)
+ double *const Work_L = Work;
+ double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
+ const int ldWork_L = top ? n1 : *ldWork;
+
+ // A_TL = A_TL - A_TR Work_TR'
+ RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
+ BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
+
+ // recursion(A_TL)
+ int n1_out;
+ RELAPACK_zhetrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
+
+ if (n1_out != n1) {
+ // undo 1 column of updates
+ const int n_restp1 = n_rest + 1;
+
+ // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
+ BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
+ }
+ n1 = n1_out;
+
+ *info = info2 || info1;
+ *n_out = n1 + n2;
+ }
+}
diff --git a/relapack/src/zhetrf_rook_rec2.c b/relapack/src/zhetrf_rook_rec2.c
new file mode 100644
index 000000000..a56ad710b
--- /dev/null
+++ b/relapack/src/zhetrf_rook_rec2.c
@@ -0,0 +1,662 @@
+/* -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static doublecomplex c_b1 = {1.,0.};
+static int c__1 = 1;
+
+/** ZHETRF_ROOK_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method
+ *
+ * This routine is a minor modification of LAPACK's zlahef_rook.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
+ * */
+/* Subroutine */ void RELAPACK_zhetrf_rook_rec2(char *uplo, int *n,
+ int *nb, int *kb, doublecomplex *a, int *lda, int *
+ ipiv, doublecomplex *w, int *ldw, int *info, ftnlen uplo_len)
+{
+ /* System generated locals */
+ int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
+ double d__1, d__2;
+ doublecomplex z__1, z__2, z__3, z__4, z__5;
+
+ /* Builtin functions */
+ double sqrt(double), d_imag(doublecomplex *);
+ void d_cnjg(doublecomplex *, doublecomplex *), z_div(doublecomplex *,
+ doublecomplex *, doublecomplex *);
+
+ /* Local variables */
+ static int j, k, p;
+ static double t, r1;
+ static doublecomplex d11, d21, d22;
+ static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
+ static logical done;
+ static int imax, jmax;
+ static double alpha;
+ extern logical lsame_(char *, char *, ftnlen, ftnlen);
+ static double dtemp, sfmin;
+ static int itemp, kstep;
+ extern /* Subroutine */ int zgemv_(char *, int *, int *,
+ doublecomplex *, doublecomplex *, int *, doublecomplex *,
+ int *, doublecomplex *, doublecomplex *, int *, ftnlen),
+ zcopy_(int *, doublecomplex *, int *, doublecomplex *,
+ int *), zswap_(int *, doublecomplex *, int *,
+ doublecomplex *, int *);
+ extern double dlamch_(char *, ftnlen);
+ static double absakk;
+ extern /* Subroutine */ int zdscal_(int *, double *,
+ doublecomplex *, int *);
+ static double colmax;
+ extern /* Subroutine */ int zlacgv_(int *, doublecomplex *, int *)
+ ;
+ extern int izamax_(int *, doublecomplex *, int *);
+ static double rowmax;
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+ w_dim1 = *ldw;
+ w_offset = 1 + w_dim1;
+ w -= w_offset;
+
+ /* Function Body */
+ *info = 0;
+ alpha = (sqrt(17.) + 1.) / 8.;
+ sfmin = dlamch_("S", (ftnlen)1);
+ if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+ k = *n;
+L10:
+ kw = *nb + k - *n;
+ if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
+ goto L30;
+ }
+ kstep = 1;
+ p = k;
+ if (k > 1) {
+ i__1 = k - 1;
+ zcopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &
+ c__1);
+ }
+ i__1 = k + kw * w_dim1;
+ i__2 = k + k * a_dim1;
+ d__1 = a[i__2].r;
+ w[i__1].r = d__1, w[i__1].i = 0.;
+ if (k < *n) {
+ i__1 = *n - k;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1],
+ lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
+ w_dim1 + 1], &c__1, (ftnlen)12);
+ i__1 = k + kw * w_dim1;
+ i__2 = k + kw * w_dim1;
+ d__1 = w[i__2].r;
+ w[i__1].r = d__1, w[i__1].i = 0.;
+ }
+ i__1 = k + kw * w_dim1;
+ absakk = (d__1 = w[i__1].r, abs(d__1));
+ if (k > 1) {
+ i__1 = k - 1;
+ imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+ i__1 = imax + kw * w_dim1;
+ colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
+ kw * w_dim1]), abs(d__2));
+ } else {
+ colmax = 0.;
+ }
+ if (max(absakk,colmax) == 0.) {
+ if (*info == 0) {
+ *info = k;
+ }
+ kp = k;
+ i__1 = k + k * a_dim1;
+ i__2 = k + kw * w_dim1;
+ d__1 = w[i__2].r;
+ a[i__1].r = d__1, a[i__1].i = 0.;
+ if (k > 1) {
+ i__1 = k - 1;
+ zcopy_(&i__1, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1],
+ &c__1);
+ }
+ } else {
+ if (! (absakk < alpha * colmax)) {
+ kp = k;
+ } else {
+ done = FALSE_;
+L12:
+ if (imax > 1) {
+ i__1 = imax - 1;
+ zcopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
+ w_dim1 + 1], &c__1);
+ }
+ i__1 = imax + (kw - 1) * w_dim1;
+ i__2 = imax + imax * a_dim1;
+ d__1 = a[i__2].r;
+ w[i__1].r = d__1, w[i__1].i = 0.;
+ i__1 = k - imax;
+ zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
+ 1 + (kw - 1) * w_dim1], &c__1);
+ i__1 = k - imax;
+ zlacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1);
+ if (k < *n) {
+ i__1 = *n - k;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) *
+ a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
+ ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
+ ftnlen)12);
+ i__1 = imax + (kw - 1) * w_dim1;
+ i__2 = imax + (kw - 1) * w_dim1;
+ d__1 = w[i__2].r;
+ w[i__1].r = d__1, w[i__1].i = 0.;
+ }
+ if (imax != k) {
+ i__1 = k - imax;
+ jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) *
+ w_dim1], &c__1);
+ i__1 = jmax + (kw - 1) * w_dim1;
+ rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&
+ w[jmax + (kw - 1) * w_dim1]), abs(d__2));
+ } else {
+ rowmax = 0.;
+ }
+ if (imax > 1) {
+ i__1 = imax - 1;
+ itemp = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
+ i__1 = itemp + (kw - 1) * w_dim1;
+ dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
+ itemp + (kw - 1) * w_dim1]), abs(d__2));
+ if (dtemp > rowmax) {
+ rowmax = dtemp;
+ jmax = itemp;
+ }
+ }
+ i__1 = imax + (kw - 1) * w_dim1;
+ if (! ((d__1 = w[i__1].r, abs(d__1)) < alpha * rowmax)) {
+ kp = imax;
+ zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
+ w_dim1 + 1], &c__1);
+ done = TRUE_;
+ } else if (p == jmax || rowmax <= colmax) {
+ kp = imax;
+ kstep = 2;
+ done = TRUE_;
+ } else {
+ p = imax;
+ colmax = rowmax;
+ imax = jmax;
+ zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
+ w_dim1 + 1], &c__1);
+ }
+ if (! done) {
+ goto L12;
+ }
+ }
+ kk = k - kstep + 1;
+ kkw = *nb + kk - *n;
+ if (kstep == 2 && p != k) {
+ i__1 = p + p * a_dim1;
+ i__2 = k + k * a_dim1;
+ d__1 = a[i__2].r;
+ a[i__1].r = d__1, a[i__1].i = 0.;
+ i__1 = k - 1 - p;
+ zcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) *
+ a_dim1], lda);
+ i__1 = k - 1 - p;
+ zlacgv_(&i__1, &a[p + (p + 1) * a_dim1], lda);
+ if (p > 1) {
+ i__1 = p - 1;
+ zcopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 +
+ 1], &c__1);
+ }
+ if (k < *n) {
+ i__1 = *n - k;
+ zswap_(&i__1, &a[k + (k + 1) * a_dim1], lda, &a[p + (k +
+ 1) * a_dim1], lda);
+ }
+ i__1 = *n - kk + 1;
+ zswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1],
+ ldw);
+ }
+ if (kp != kk) {
+ i__1 = kp + kp * a_dim1;
+ i__2 = kk + kk * a_dim1;
+ d__1 = a[i__2].r;
+ a[i__1].r = d__1, a[i__1].i = 0.;
+ i__1 = kk - 1 - kp;
+ zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
+ 1) * a_dim1], lda);
+ i__1 = kk - 1 - kp;
+ zlacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda);
+ if (kp > 1) {
+ i__1 = kp - 1;
+ zcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
+ + 1], &c__1);
+ }
+ if (k < *n) {
+ i__1 = *n - k;
+ zswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
+ + 1) * a_dim1], lda);
+ }
+ i__1 = *n - kk + 1;
+ zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
+ w_dim1], ldw);
+ }
+ if (kstep == 1) {
+ zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
+ c__1);
+ if (k > 1) {
+ i__1 = k + k * a_dim1;
+ t = a[i__1].r;
+ if (abs(t) >= sfmin) {
+ r1 = 1. / t;
+ i__1 = k - 1;
+ zdscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
+ } else {
+ i__1 = k - 1;
+ for (ii = 1; ii <= i__1; ++ii) {
+ i__2 = ii + k * a_dim1;
+ i__3 = ii + k * a_dim1;
+ z__1.r = a[i__3].r / t, z__1.i = a[i__3].i / t;
+ a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+/* L14: */
+ }
+ }
+ i__1 = k - 1;
+ zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+ }
+ } else {
+ if (k > 2) {
+ i__1 = k - 1 + kw * w_dim1;
+ d21.r = w[i__1].r, d21.i = w[i__1].i;
+ d_cnjg(&z__2, &d21);
+ z_div(&z__1, &w[k + kw * w_dim1], &z__2);
+ d11.r = z__1.r, d11.i = z__1.i;
+ z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
+ d22.r = z__1.r, d22.i = z__1.i;
+ z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r *
+ d22.i + d11.i * d22.r;
+ t = 1. / (z__1.r - 1.);
+ i__1 = k - 2;
+ for (j = 1; j <= i__1; ++j) {
+ i__2 = j + (k - 1) * a_dim1;
+ i__3 = j + (kw - 1) * w_dim1;
+ z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
+ z__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
+ .r;
+ i__4 = j + kw * w_dim1;
+ z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
+ .i;
+ z_div(&z__2, &z__3, &d21);
+ z__1.r = t * z__2.r, z__1.i = t * z__2.i;
+ a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+ i__2 = j + k * a_dim1;
+ i__3 = j + kw * w_dim1;
+ z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
+ z__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
+ .r;
+ i__4 = j + (kw - 1) * w_dim1;
+ z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
+ .i;
+ d_cnjg(&z__5, &d21);
+ z_div(&z__2, &z__3, &z__5);
+ z__1.r = t * z__2.r, z__1.i = t * z__2.i;
+ a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+/* L20: */
+ }
+ }
+ i__1 = k - 1 + (k - 1) * a_dim1;
+ i__2 = k - 1 + (kw - 1) * w_dim1;
+ a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+ i__1 = k - 1 + k * a_dim1;
+ i__2 = k - 1 + kw * w_dim1;
+ a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+ i__1 = k + k * a_dim1;
+ i__2 = k + kw * w_dim1;
+ a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+ i__1 = k - 1;
+ zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+ i__1 = k - 2;
+ zlacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
+ }
+ }
+ if (kstep == 1) {
+ ipiv[k] = kp;
+ } else {
+ ipiv[k] = -p;
+ ipiv[k - 1] = -kp;
+ }
+ k -= kstep;
+ goto L10;
+L30:
+ j = k + 1;
+L60:
+ kstep = 1;
+ jp1 = 1;
+ jj = j;
+ jp2 = ipiv[j];
+ if (jp2 < 0) {
+ jp2 = -jp2;
+ ++j;
+ jp1 = -ipiv[j];
+ kstep = 2;
+ }
+ ++j;
+ if (jp2 != jj && j <= *n) {
+ i__1 = *n - j + 1;
+ zswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
+ ;
+ }
+ ++jj;
+ if (kstep == 2 && jp1 != jj && j <= *n) {
+ i__1 = *n - j + 1;
+ zswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
+ ;
+ }
+ if (j < *n) {
+ goto L60;
+ }
+ *kb = *n - k;
+ } else {
+ k = 1;
+L70:
+ if ((k >= *nb && *nb < *n) || k > *n) {
+ goto L90;
+ }
+ kstep = 1;
+ p = k;
+ i__1 = k + k * w_dim1;
+ i__2 = k + k * a_dim1;
+ d__1 = a[i__2].r;
+ w[i__1].r = d__1, w[i__1].i = 0.;
+ if (k < *n) {
+ i__1 = *n - k;
+ zcopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k *
+ w_dim1], &c__1);
+ }
+ if (k > 1) {
+ i__1 = *n - k + 1;
+ i__2 = k - 1;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, &
+ w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (
+ ftnlen)12);
+ i__1 = k + k * w_dim1;
+ i__2 = k + k * w_dim1;
+ d__1 = w[i__2].r;
+ w[i__1].r = d__1, w[i__1].i = 0.;
+ }
+ i__1 = k + k * w_dim1;
+ absakk = (d__1 = w[i__1].r, abs(d__1));
+ if (k < *n) {
+ i__1 = *n - k;
+ imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
+ i__1 = imax + k * w_dim1;
+ colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
+ k * w_dim1]), abs(d__2));
+ } else {
+ colmax = 0.;
+ }
+ if (max(absakk,colmax) == 0.) {
+ if (*info == 0) {
+ *info = k;
+ }
+ kp = k;
+ i__1 = k + k * a_dim1;
+ i__2 = k + k * w_dim1;
+ d__1 = w[i__2].r;
+ a[i__1].r = d__1, a[i__1].i = 0.;
+ if (k < *n) {
+ i__1 = *n - k;
+ zcopy_(&i__1, &w[k + 1 + k * w_dim1], &c__1, &a[k + 1 + k *
+ a_dim1], &c__1);
+ }
+ } else {
+ if (! (absakk < alpha * colmax)) {
+ kp = k;
+ } else {
+ done = FALSE_;
+L72:
+ i__1 = imax - k;
+ zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
+ w_dim1], &c__1);
+ i__1 = imax - k;
+ zlacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1);
+ i__1 = imax + (k + 1) * w_dim1;
+ i__2 = imax + imax * a_dim1;
+ d__1 = a[i__2].r;
+ w[i__1].r = d__1, w[i__1].i = 0.;
+ if (imax < *n) {
+ i__1 = *n - imax;
+ zcopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[
+ imax + 1 + (k + 1) * w_dim1], &c__1);
+ }
+ if (k > 1) {
+ i__1 = *n - k + 1;
+ i__2 = k - 1;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1]
+ , lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k +
+ 1) * w_dim1], &c__1, (ftnlen)12);
+ i__1 = imax + (k + 1) * w_dim1;
+ i__2 = imax + (k + 1) * w_dim1;
+ d__1 = w[i__2].r;
+ w[i__1].r = d__1, w[i__1].i = 0.;
+ }
+ if (imax != k) {
+ i__1 = imax - k;
+ jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], &
+ c__1);
+ i__1 = jmax + (k + 1) * w_dim1;
+ rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&
+ w[jmax + (k + 1) * w_dim1]), abs(d__2));
+ } else {
+ rowmax = 0.;
+ }
+ if (imax < *n) {
+ i__1 = *n - imax;
+ itemp = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) *
+ w_dim1], &c__1);
+ i__1 = itemp + (k + 1) * w_dim1;
+ dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
+ itemp + (k + 1) * w_dim1]), abs(d__2));
+ if (dtemp > rowmax) {
+ rowmax = dtemp;
+ jmax = itemp;
+ }
+ }
+ i__1 = imax + (k + 1) * w_dim1;
+ if (! ((d__1 = w[i__1].r, abs(d__1)) < alpha * rowmax)) {
+ kp = imax;
+ i__1 = *n - k + 1;
+ zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
+ w_dim1], &c__1);
+ done = TRUE_;
+ } else if (p == jmax || rowmax <= colmax) {
+ kp = imax;
+ kstep = 2;
+ done = TRUE_;
+ } else {
+ p = imax;
+ colmax = rowmax;
+ imax = jmax;
+ i__1 = *n - k + 1;
+ zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
+ w_dim1], &c__1);
+ }
+ if (! done) {
+ goto L72;
+ }
+ }
+ kk = k + kstep - 1;
+ if (kstep == 2 && p != k) {
+ i__1 = p + p * a_dim1;
+ i__2 = k + k * a_dim1;
+ d__1 = a[i__2].r;
+ a[i__1].r = d__1, a[i__1].i = 0.;
+ i__1 = p - k - 1;
+ zcopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &a[p + (k + 1) *
+ a_dim1], lda);
+ i__1 = p - k - 1;
+ zlacgv_(&i__1, &a[p + (k + 1) * a_dim1], lda);
+ if (p < *n) {
+ i__1 = *n - p;
+ zcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + 1 + p
+ * a_dim1], &c__1);
+ }
+ if (k > 1) {
+ i__1 = k - 1;
+ zswap_(&i__1, &a[k + a_dim1], lda, &a[p + a_dim1], lda);
+ }
+ zswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw);
+ }
+ if (kp != kk) {
+ i__1 = kp + kp * a_dim1;
+ i__2 = kk + kk * a_dim1;
+ d__1 = a[i__2].r;
+ a[i__1].r = d__1, a[i__1].i = 0.;
+ i__1 = kp - kk - 1;
+ zcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
+ 1) * a_dim1], lda);
+ i__1 = kp - kk - 1;
+ zlacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda);
+ if (kp < *n) {
+ i__1 = *n - kp;
+ zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
+ + kp * a_dim1], &c__1);
+ }
+ if (k > 1) {
+ i__1 = k - 1;
+ zswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
+ }
+ zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
+ }
+ if (kstep == 1) {
+ i__1 = *n - k + 1;
+ zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
+ c__1);
+ if (k < *n) {
+ i__1 = k + k * a_dim1;
+ t = a[i__1].r;
+ if (abs(t) >= sfmin) {
+ r1 = 1. / t;
+ i__1 = *n - k;
+ zdscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
+ } else {
+ i__1 = *n;
+ for (ii = k + 1; ii <= i__1; ++ii) {
+ i__2 = ii + k * a_dim1;
+ i__3 = ii + k * a_dim1;
+ z__1.r = a[i__3].r / t, z__1.i = a[i__3].i / t;
+ a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+/* L74: */
+ }
+ }
+ i__1 = *n - k;
+ zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
+ }
+ } else {
+ if (k < *n - 1) {
+ i__1 = k + 1 + k * w_dim1;
+ d21.r = w[i__1].r, d21.i = w[i__1].i;
+ z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
+ d11.r = z__1.r, d11.i = z__1.i;
+ d_cnjg(&z__2, &d21);
+ z_div(&z__1, &w[k + k * w_dim1], &z__2);
+ d22.r = z__1.r, d22.i = z__1.i;
+ z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r *
+ d22.i + d11.i * d22.r;
+ t = 1. / (z__1.r - 1.);
+ i__1 = *n;
+ for (j = k + 2; j <= i__1; ++j) {
+ i__2 = j + k * a_dim1;
+ i__3 = j + k * w_dim1;
+ z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
+ z__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
+ .r;
+ i__4 = j + (k + 1) * w_dim1;
+ z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
+ .i;
+ d_cnjg(&z__5, &d21);
+ z_div(&z__2, &z__3, &z__5);
+ z__1.r = t * z__2.r, z__1.i = t * z__2.i;
+ a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+ i__2 = j + (k + 1) * a_dim1;
+ i__3 = j + (k + 1) * w_dim1;
+ z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
+ z__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
+ .r;
+ i__4 = j + k * w_dim1;
+ z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
+ .i;
+ z_div(&z__2, &z__3, &d21);
+ z__1.r = t * z__2.r, z__1.i = t * z__2.i;
+ a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+/* L80: */
+ }
+ }
+ i__1 = k + k * a_dim1;
+ i__2 = k + k * w_dim1;
+ a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+ i__1 = k + 1 + k * a_dim1;
+ i__2 = k + 1 + k * w_dim1;
+ a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+ i__1 = k + 1 + (k + 1) * a_dim1;
+ i__2 = k + 1 + (k + 1) * w_dim1;
+ a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+ i__1 = *n - k;
+ zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
+ i__1 = *n - k - 1;
+ zlacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1);
+ }
+ }
+ if (kstep == 1) {
+ ipiv[k] = kp;
+ } else {
+ ipiv[k] = -p;
+ ipiv[k + 1] = -kp;
+ }
+ k += kstep;
+ goto L70;
+L90:
+ j = k - 1;
+L120:
+ kstep = 1;
+ jp1 = 1;
+ jj = j;
+ jp2 = ipiv[j];
+ if (jp2 < 0) {
+ jp2 = -jp2;
+ --j;
+ jp1 = -ipiv[j];
+ kstep = 2;
+ }
+ --j;
+ if (jp2 != jj && j >= 1) {
+ zswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
+ }
+ --jj;
+ if (kstep == 2 && jp1 != jj && j >= 1) {
+ zswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
+ }
+ if (j > 1) {
+ goto L120;
+ }
+ *kb = k - 1;
+ }
+ return;
+}
diff --git a/relapack/src/zlauum.c b/relapack/src/zlauum.c
new file mode 100644
index 000000000..490dcc82e
--- /dev/null
+++ b/relapack/src/zlauum.c
@@ -0,0 +1,87 @@
+#include "relapack.h"
+
+static void RELAPACK_zlauum_rec(const char *, const int *, double *,
+ const int *, int *);
+
+
+/** ZLAUUM computes the product U * U**H or L**H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A.
+ *
+ * This routine is functionally equivalent to LAPACK's zlauum.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d8/d45/zlauum_8f.html
+ * */
+void RELAPACK_zlauum(
+    const char *uplo, const int *n,
+    double *A, const int *ldA,
+    int *info
+) {
+
+    // Validate arguments; info records the (negated) index of the first bad one
+    const int is_lower = LAPACK(lsame)(uplo, "L");
+    const int is_upper = LAPACK(lsame)(uplo, "U");
+    *info = 0;
+    if (!is_lower && !is_upper)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*ldA < MAX(1, *n))
+        *info = -4;
+    if (*info != 0) {
+        const int errarg = -*info;
+        LAPACK(xerbla)("ZLAUUM", &errarg);
+        return;
+    }
+
+    // Normalize uplo to a single clean character for the recursive kernel
+    const char normuplo = is_lower ? 'L' : 'U';
+
+    // All actual work happens in the recursive kernel
+    RELAPACK_zlauum_rec(&normuplo, n, A, ldA, info);
+}
+
+
+/** zlauum's recursive compute kernel */
+static void RELAPACK_zlauum_rec(
+    const char *uplo, const int *n,
+    double *A, const int *ldA,
+    int *info
+) {
+
+    // A holds double-complex entries as interleaved re/im double pairs,
+    // hence the factors of 2 in the pointer arithmetic below.
+    if (*n <= MAX(CROSSOVER_ZLAUUM, 1)) {
+        // Unblocked: below the crossover size, defer to LAPACK's zlauu2
+        LAPACK(zlauu2)(uplo, n, A, ldA, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[] = { 1., 0. };
+
+    // Splitting
+    const int n1 = ZREC_SPLIT(*n);
+    const int n2 = *n - n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    double *const A_TL = A;
+    double *const A_TR = A + 2 * *ldA * n1;
+    double *const A_BL = A + 2 * n1;
+    double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
+
+    // recursion(A_TL)
+    RELAPACK_zlauum_rec(uplo, &n1, A_TL, ldA, info);
+
+    if (*uplo == 'L') {
+        // A_TL = A_TL + A_BL' * A_BL
+        BLAS(zherk)("L", "C", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA);
+        // A_BL = A_BR' * A_BL
+        BLAS(ztrmm)("L", "L", "C", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
+    } else {
+        // A_TL = A_TL + A_TR * A_TR'
+        BLAS(zherk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA);
+        // A_TR = A_TR * A_BR'
+        BLAS(ztrmm)("R", "U", "C", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
+    }
+
+    // recursion(A_BR)
+    RELAPACK_zlauum_rec(uplo, &n2, A_BR, ldA, info);
+}
diff --git a/relapack/src/zpbtrf.c b/relapack/src/zpbtrf.c
new file mode 100644
index 000000000..37e711c9d
--- /dev/null
+++ b/relapack/src/zpbtrf.c
@@ -0,0 +1,157 @@
+#include "relapack.h"
+#include "stdlib.h"
+
+static void RELAPACK_zpbtrf_rec(const char *, const int *, const int *,
+ double *, const int *, double *, const int *, int *);
+
+
+/** ZPBTRF computes the Cholesky factorization of a complex Hermitian positive definite band matrix A.
+ *
+ * This routine is functionally equivalent to LAPACK's zpbtrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/db/da9/zpbtrf_8f.html
+ * */
+void RELAPACK_zpbtrf(
+    const char *uplo, const int *n, const int *kd,
+    double *Ab, const int *ldAb,
+    int *info
+) {
+
+    // Check arguments
+    const int lower = LAPACK(lsame)(uplo, "L");
+    const int upper = LAPACK(lsame)(uplo, "U");
+    *info = 0;
+    if (!lower && !upper)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*kd < 0)
+        *info = -3;
+    else if (*ldAb < *kd + 1)
+        *info = -5;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("ZPBTRF", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleanuplo = lower ? 'L' : 'U';
+
+    // Constant
+    const double ZERO[] = { 0., 0. };
+
+    // Allocate zero-initialized work space for the fill-in blocks.
+    // Complex entries are stored as interleaved re/im pairs, hence "* 2".
+    const int n1 = ZREC_SPLIT(*n);
+    const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd;
+    const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd;
+    double *Work = malloc(mWork * nWork * 2 * sizeof(double));
+    if (!Work) {
+        // Allocation failed (also possible for a zero-sized request when
+        // *kd == 0): fall back to LAPACK's unblocked algorithm, which
+        // needs no workspace (CERT MEM32-C: check malloc results).
+        LAPACK(zpbtf2)(&cleanuplo, n, kd, Ab, ldAb, info);
+        return;
+    }
+    LAPACK(zlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork);
+
+    // Recursive kernel
+    RELAPACK_zpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info);
+
+    // Free work space
+    free(Work);
+}
+
+
+/** zpbtrf's recursive compute kernel */
+static void RELAPACK_zpbtrf_rec(
+    const char *uplo, const int *n, const int *kd,
+    double *Ab, const int *ldAb,
+    double *Work, const int *ldWork,
+    int *info
+){
+
+    if (*n <= MAX(CROSSOVER_ZPBTRF, 1)) {
+        // Unblocked: small problems go straight to LAPACK's zpbtf2
+        LAPACK(zpbtf2)(uplo, n, kd, Ab, ldAb, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[] = { 1., 0. };
+    const double MONE[] = { -1., 0. };
+
+    // Unskew A: view the banded storage Ab as a conventional (leading
+    // dimension ldAb-1) matrix A whose diagonal coincides with Ab's
+    // stored diagonal (row *kd of Ab in the upper case, row 0 otherwise).
+    const int ldA[] = { *ldAb - 1 };
+    double *const A = Ab + 2 * ((*uplo == 'L') ? 0 : *kd);
+
+    // Splitting (n1 capped at the bandwidth so blocks stay inside the band)
+    const int n1 = MIN(ZREC_SPLIT(*n), *kd);
+    const int n2 = *n - n1;
+
+    // * *
+    // * Ab_BR
+    double *const Ab_BR = Ab + 2 * *ldAb * n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    double *const A_TL = A;
+    double *const A_TR = A + 2 * *ldA * n1;
+    double *const A_BL = A + 2 * n1;
+    double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
+
+    // recursion(A_TL): full (unbanded) Cholesky of the top-left block
+    RELAPACK_zpotrf(uplo, &n1, A_TL, ldA, info);
+    if (*info)
+        return;
+
+    // Banded splitting: n21 columns/rows still inside the band of A_TL,
+    // n22 beyond it (handled via the zeroed Work buffer)
+    const int n21 = MIN(n2, *kd - n1);
+    const int n22 = MIN(n2 - n21, *kd);
+
+    //     n1    n21    n22
+    // n1  *     A_TRl  A_TRr
+    // n21 A_BLt A_BRtl A_BRtr
+    // n22 A_BLb A_BRbl A_BRbr
+    double *const A_TRl = A_TR;
+    double *const A_TRr = A_TR + 2 * *ldA * n21;
+    double *const A_BLt = A_BL;
+    double *const A_BLb = A_BL + 2 * n21;
+    double *const A_BRtl = A_BR;
+    double *const A_BRtr = A_BR + 2 * *ldA * n21;
+    double *const A_BRbl = A_BR + 2 * n21;
+    double *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * n21;
+
+    if (*uplo == 'L') {
+        // A_BLt = ABLt / A_TL'
+        BLAS(ztrsm)("R", "L", "C", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA);
+        // A_BRtl = A_BRtl - A_BLt * A_BLt'
+        BLAS(zherk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA);
+        // Work = A_BLb (only the in-band triangle is copied; the rest of
+        // Work stays zero from the caller's zlaset)
+        LAPACK(zlacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork);
+        // Work = Work / A_TL'
+        BLAS(ztrsm)("R", "L", "C", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork);
+        // A_BRbl = A_BRbl - Work * A_BLt'
+        BLAS(zgemm)("N", "C", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA);
+        // A_BRbr = A_BRbr - Work * Work'
+        BLAS(zherk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
+        // A_BLb = Work
+        LAPACK(zlacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA);
+    } else {
+        // A_TRl = A_TL' \ A_TRl
+        BLAS(ztrsm)("L", "U", "C", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
+        // A_BRtl = A_BRtl - A_TRl' * A_TRl
+        BLAS(zherk)("U", "C", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA);
+        // Work = A_TRr
+        LAPACK(zlacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork);
+        // Work = A_TL' \ Work
+        BLAS(ztrsm)("L", "U", "C", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork);
+        // A_BRtr = A_BRtr - A_TRl' * Work
+        BLAS(zgemm)("C", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA);
+        // A_BRbr = A_BRbr - Work' * Work
+        BLAS(zherk)("U", "C", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
+        // A_TRr = Work
+        LAPACK(zlacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA);
+    }
+
+    // recursion(A_BR): if the band is wider than n1 the remainder is a
+    // full matrix; otherwise recurse on the banded remainder
+    if (*kd > n1)
+        RELAPACK_zpotrf(uplo, &n2, A_BR, ldA, info);
+    else
+        RELAPACK_zpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info);
+    if (*info)
+        *info += n1;
+}
diff --git a/relapack/src/zpotrf.c b/relapack/src/zpotrf.c
new file mode 100644
index 000000000..411ac5fc0
--- /dev/null
+++ b/relapack/src/zpotrf.c
@@ -0,0 +1,92 @@
+#include "relapack.h"
+
+static void RELAPACK_zpotrf_rec(const char *, const int *, double *,
+ const int *, int *);
+
+
+/** ZPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix A.
+ *
+ * This routine is functionally equivalent to LAPACK's zpotrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d1/db9/zpotrf_8f.html
+ * */
+void RELAPACK_zpotrf(
+    const char *uplo, const int *n,
+    double *A, const int *ldA,
+    int *info
+) {
+
+    // Validate arguments; info records the (negated) index of the first bad one
+    const int is_lower = LAPACK(lsame)(uplo, "L");
+    const int is_upper = LAPACK(lsame)(uplo, "U");
+    *info = 0;
+    if (!is_lower && !is_upper)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*ldA < MAX(1, *n))
+        *info = -4;
+    if (*info != 0) {
+        const int errarg = -*info;
+        LAPACK(xerbla)("ZPOTRF", &errarg);
+        return;
+    }
+
+    // Normalize uplo to a single clean character for the recursive kernel
+    const char normuplo = is_lower ? 'L' : 'U';
+
+    // All actual work happens in the recursive kernel
+    RELAPACK_zpotrf_rec(&normuplo, n, A, ldA, info);
+}
+
+
+/** zpotrf's recursive compute kernel */
+static void RELAPACK_zpotrf_rec(
+    const char *uplo, const int *n,
+    double *A, const int *ldA,
+    int *info
+) {
+
+    // A holds double-complex entries as interleaved re/im double pairs,
+    // hence the factors of 2 in the pointer arithmetic below.
+    if (*n <= MAX(CROSSOVER_ZPOTRF, 1)) {
+        // Unblocked: below the crossover size, defer to LAPACK's zpotf2
+        LAPACK(zpotf2)(uplo, n, A, ldA, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[] = { 1., 0. };
+    const double MONE[] = { -1., 0. };
+
+    // Splitting
+    const int n1 = ZREC_SPLIT(*n);
+    const int n2 = *n - n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    double *const A_TL = A;
+    double *const A_TR = A + 2 * *ldA * n1;
+    double *const A_BL = A + 2 * n1;
+    double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
+
+    // recursion(A_TL)
+    RELAPACK_zpotrf_rec(uplo, &n1, A_TL, ldA, info);
+    if (*info)
+        return;
+
+    if (*uplo == 'L') {
+        // A_BL = A_BL / A_TL'
+        BLAS(ztrsm)("R", "L", "C", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA);
+        // A_BR = A_BR - A_BL * A_BL'
+        BLAS(zherk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA);
+    } else {
+        // A_TR = A_TL' \ A_TR
+        BLAS(ztrsm)("L", "U", "C", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
+        // A_BR = A_BR - A_TR' * A_TR
+        BLAS(zherk)("U", "C", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA);
+    }
+
+    // recursion(A_BR); a failure there is reported relative to the full
+    // matrix, so shift the returned pivot index by n1
+    RELAPACK_zpotrf_rec(uplo, &n2, A_BR, ldA, info);
+    if (*info)
+        *info += n1;
+}
diff --git a/relapack/src/zsytrf.c b/relapack/src/zsytrf.c
new file mode 100644
index 000000000..3be21563a
--- /dev/null
+++ b/relapack/src/zsytrf.c
@@ -0,0 +1,238 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include <stdlib.h>
+#endif
+
+static void RELAPACK_zsytrf_rec(const char *, const int *, const int *, int *,
+ double *, const int *, int *, double *, const int *, int *);
+
+
+/** ZSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's zsytrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/da/d94/zsytrf_8f.html
+ * */
+void RELAPACK_zsytrf(
+    const char *uplo, const int *n,
+    double *A, const int *ldA, int *ipiv,
+    double *Work, const int *lWork, int *info
+) {
+
+    // Required work size, counted in complex elements (each stored as an
+    // interleaved re/im double pair, hence the "* 2" in the malloc below)
+    const int cleanlWork = *n * (*n / 2);
+    int minlWork = cleanlWork;
+#if XSYTRF_ALLOW_MALLOC
+    minlWork = 1;
+#endif
+
+    // Check arguments
+    const int lower = LAPACK(lsame)(uplo, "L");
+    const int upper = LAPACK(lsame)(uplo, "U");
+    *info = 0;
+    if (!lower && !upper)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*ldA < MAX(1, *n))
+        *info = -4;
+    else if (*lWork < minlWork && *lWork != -1)
+        *info = -7;
+    else if (*lWork == -1) {
+        // Work size query: report the required size in Work[0]
+        // (presumably following LAPACK's workspace-query convention of
+        // returning it in the real part — TODO confirm against callers)
+        *Work = cleanlWork;
+        return;
+    }
+
+    // Ensure Work size: if the caller's buffer is too small, allocate our
+    // own; a failed allocation is reported like an invalid lWork argument
+    double *cleanWork = Work;
+#if XSYTRF_ALLOW_MALLOC
+    if (!*info && *lWork < cleanlWork) {
+        cleanWork = malloc(cleanlWork * 2 * sizeof(double));
+        if (!cleanWork)
+            *info = -7;
+    }
+#endif
+
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("ZSYTRF", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleanuplo = lower ? 'L' : 'U';
+
+    // Dummy arguments
+    int nout;
+
+    // Recursive kernel
+    RELAPACK_zsytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
+
+#if XSYTRF_ALLOW_MALLOC
+    if (cleanWork != Work)
+        free(cleanWork);
+#endif
+}
+
+
+/** zsytrf's recursive compute kernel */
+static void RELAPACK_zsytrf_rec(
+    const char *uplo, const int *n_full, const int *n, int *n_out,
+    double *A, const int *ldA, int *ipiv,
+    double *Work, const int *ldWork, int *info
+) {
+
+    // NOTE(review): n_full appears to be the size of the trailing submatrix
+    // and n the width of the panel factored by this call; n_out returns the
+    // number of columns actually factored (a 2x2 pivot straddling the panel
+    // edge may leave one column undone) — confirm against RELAPACK_zsytrf.
+
+    // top recursion level?
+    const int top = *n_full == *n;
+
+    if (*n <= MAX(CROSSOVER_ZSYTRF, 3)) {
+        // Unblocked
+        if (top) {
+            LAPACK(zsytf2)(uplo, n, A, ldA, ipiv, info);
+            *n_out = *n;
+        } else
+            RELAPACK_zsytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
+        return;
+    }
+
+    int info1, info2;
+
+    // Constants
+    const double ONE[] = { 1., 0. };
+    const double MONE[] = { -1., 0. };
+    const int iONE[] = { 1 };
+
+    // Loop iterator
+    int i;
+
+    const int n_rest = *n_full - *n;
+
+    if (*uplo == 'L') {
+        // Splitting (setup)
+        int n1 = ZREC_SPLIT(*n);
+        int n2 = *n - n1;
+
+        // Work_L *
+        double *const Work_L = Work;
+
+        // recursion(A_L)
+        int n1_out;
+        RELAPACK_zsytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
+        n1 = n1_out;
+
+        // Splitting (continued): n1 may have shrunk by one column
+        n2 = *n - n1;
+        const int n_full2 = *n_full - n1;
+
+        // * *
+        // A_BL A_BR
+        // A_BL_B A_BR_B
+        double *const A_BL = A + 2 * n1;
+        double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
+        double *const A_BL_B = A + 2 * *n;
+        double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
+
+        // * *
+        // Work_BL Work_BR
+        // * *
+        // (top recursion level: use Work as Work_BR)
+        double *const Work_BL = Work + 2 * n1;
+        double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
+        const int ldWork_BR = top ? n2 : *ldWork;
+
+        // ipiv_T
+        // ipiv_B
+        int *const ipiv_B = ipiv + n1;
+
+        // A_BR = A_BR - A_BL Work_BL'
+        RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
+        BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
+
+        // recursion(A_BR)
+        int n2_out;
+        RELAPACK_zsytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
+
+        if (n2_out != n2) {
+            // undo 1 column of updates: the last column was not factored,
+            // so restore it to its pre-update state for the caller
+            const int n_restp1 = n_rest + 1;
+
+            // last column of A_BR
+            double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
+
+            // last row of A_BL
+            double *const A_BL_b = A_BL + 2 * n2_out;
+
+            // last row of Work_BL
+            double *const Work_BL_b = Work_BL + 2 * n2_out;
+
+            // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
+            BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
+        }
+        n2 = n2_out;
+
+        // shift pivots: ipiv_B entries are relative to A_BR, make them
+        // relative to A (negative entries encode 2x2 pivots)
+        for (i = 0; i < n2; i++)
+            if (ipiv_B[i] > 0)
+                ipiv_B[i] += n1;
+            else
+                ipiv_B[i] -= n1;
+
+        *info = info1 || info2;
+        *n_out = n1 + n2;
+    } else {
+        // Splitting (setup)
+        int n2 = ZREC_SPLIT(*n);
+        int n1 = *n - n2;
+
+        // * Work_R
+        // (top recursion level: use Work as Work_R)
+        double *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
+
+        // recursion(A_R)
+        int n2_out;
+        RELAPACK_zsytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
+        const int n2_diff = n2 - n2_out;
+        n2 = n2_out;
+
+        // Splitting (continued)
+        n1 = *n - n2;
+        const int n_full1 = *n_full - n2;
+
+        // * A_TL_T A_TR_T
+        // * A_TL A_TR
+        // * * *
+        double *const A_TL_T = A + 2 * *ldA * n_rest;
+        double *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
+        double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
+        double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
+
+        // Work_L *
+        // * Work_TR
+        // * *
+        // (top recursion level: Work_R was Work)
+        double *const Work_L = Work;
+        double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
+        const int ldWork_L = top ? n1 : *ldWork;
+
+        // A_TL = A_TL - A_TR Work_TR'
+        RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
+        BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
+
+        // recursion(A_TL)
+        int n1_out;
+        RELAPACK_zsytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
+
+        if (n1_out != n1) {
+            // undo 1 column of updates
+            const int n_restp1 = n_rest + 1;
+
+            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
+            BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
+        }
+        n1 = n1_out;
+
+        *info = info2 || info1;
+        *n_out = n1 + n2;
+    }
+}
diff --git a/relapack/src/zsytrf_rec2.c b/relapack/src/zsytrf_rec2.c
new file mode 100644
index 000000000..33902ee9e
--- /dev/null
+++ b/relapack/src/zsytrf_rec2.c
@@ -0,0 +1,452 @@
+/* -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static doublecomplex c_b1 = {1.,0.};
+static int c__1 = 1;
+
+/** ZSYTRF_REC2 computes a partial factorization of a complex symmetric matrix using the Bunch-Kaufman diagonal pivoting method.
+ *
+ * This routine is a minor modification of LAPACK's zlasyf.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
+ * */
+/* Subroutine */ void RELAPACK_zsytrf_rec2(char *uplo, int *n, int *
+	nb, int *kb, doublecomplex *a, int *lda, int *ipiv,
+	doublecomplex *w, int *ldw, int *info, ftnlen uplo_len)
+{
+    /* System generated locals */
+    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
+    double d__1, d__2, d__3, d__4;
+    doublecomplex z__1, z__2, z__3;
+
+    /* Builtin functions */
+    double sqrt(double), d_imag(doublecomplex *);
+    void z_div(doublecomplex *, doublecomplex *, doublecomplex *);
+
+    /* Local variables */
+    static int j, k;
+    static doublecomplex t, r1, d11, d21, d22;
+    static int jj, kk, jp, kp, kw, kkw, imax, jmax;
+    static double alpha;
+    extern logical lsame_(char *, char *, ftnlen, ftnlen);
+    extern /* Subroutine */ int zscal_(int *, doublecomplex *,
+	    doublecomplex *, int *);
+    static int kstep;
+    extern /* Subroutine */ int zgemv_(char *, int *, int *,
+	    doublecomplex *, doublecomplex *, int *, doublecomplex *,
+	    int *, doublecomplex *, doublecomplex *, int *, ftnlen),
+	    zcopy_(int *, doublecomplex *, int *, doublecomplex *,
+	    int *), zswap_(int *, doublecomplex *, int *,
+	    doublecomplex *, int *);
+    static double absakk, colmax;
+    extern int izamax_(int *, doublecomplex *, int *);
+    static double rowmax;
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --ipiv;
+    w_dim1 = *ldw;
+    w_offset = 1 + w_dim1;
+    w -= w_offset;
+
+    /* Function Body */
+    *info = 0;
+/* alpha = (1 + sqrt(17))/8 is the classical Bunch-Kaufman pivot threshold */
+    alpha = (sqrt(17.) + 1.) / 8.;
+    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+/* Upper case: columns are processed from k = n downward */
+	k = *n;
+L10:
+	kw = *nb + k - *n;
+	if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
+	    goto L30;
+	}
+	zcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
+	if (k < *n) {
+	    i__1 = *n - k;
+	    z__1.r = -1., z__1.i = -0.;
+	    zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1],
+		     lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
+		    w_dim1 + 1], &c__1, (ftnlen)12);
+	}
+	kstep = 1;
+	i__1 = k + kw * w_dim1;
+	absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + kw *
+		w_dim1]), abs(d__2));
+	if (k > 1) {
+	    i__1 = k - 1;
+	    imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+	    i__1 = imax + kw * w_dim1;
+	    colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
+		    kw * w_dim1]), abs(d__2));
+	} else {
+	    colmax = 0.;
+	}
+	if (max(absakk,colmax) == 0.) {
+/* Column is exactly zero: record singularity in info, keep a 1x1 pivot */
+	    if (*info == 0) {
+		*info = k;
+	    }
+	    kp = k;
+	} else {
+	    if (absakk >= alpha * colmax) {
+		kp = k;
+	    } else {
+		zcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
+			w_dim1 + 1], &c__1);
+		i__1 = k - imax;
+		zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
+			1 + (kw - 1) * w_dim1], &c__1);
+		if (k < *n) {
+		    i__1 = *n - k;
+		    z__1.r = -1., z__1.i = -0.;
+		    zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) *
+			    a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
+			    ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
+			    ftnlen)12);
+		}
+		i__1 = k - imax;
+		jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
+			 &c__1);
+		i__1 = jmax + (kw - 1) * w_dim1;
+		rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
+			jmax + (kw - 1) * w_dim1]), abs(d__2));
+		if (imax > 1) {
+		    i__1 = imax - 1;
+		    jmax = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
+/* Computing MAX */
+		    i__1 = jmax + (kw - 1) * w_dim1;
+		    d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + (
+			    d__2 = d_imag(&w[jmax + (kw - 1) * w_dim1]), abs(
+			    d__2));
+		    rowmax = max(d__3,d__4);
+		}
+		if (absakk >= alpha * colmax * (colmax / rowmax)) {
+		    kp = k;
+		} else /* if(complicated condition) */ {
+		    i__1 = imax + (kw - 1) * w_dim1;
+		    if ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
+			    imax + (kw - 1) * w_dim1]), abs(d__2)) >= alpha *
+			    rowmax) {
+			kp = imax;
+			zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
+				w_dim1 + 1], &c__1);
+		    } else {
+			kp = imax;
+			kstep = 2;
+		    }
+		}
+	    }
+	    kk = k - kstep + 1;
+	    kkw = *nb + kk - *n;
+	    if (kp != kk) {
+/* Interchange rows/columns kk and kp in A and in the workspace W */
+		i__1 = kp + kp * a_dim1;
+		i__2 = kk + kk * a_dim1;
+		a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
+		i__1 = kk - 1 - kp;
+		zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
+			1) * a_dim1], lda);
+		if (kp > 1) {
+		    i__1 = kp - 1;
+		    zcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
+			    + 1], &c__1);
+		}
+		if (k < *n) {
+		    i__1 = *n - k;
+		    zswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
+			    + 1) * a_dim1], lda);
+		}
+		i__1 = *n - kk + 1;
+		zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
+			w_dim1], ldw);
+	    }
+	    if (kstep == 1) {
+		zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
+			c__1);
+		z_div(&z__1, &c_b1, &a[k + k * a_dim1]);
+		r1.r = z__1.r, r1.i = z__1.i;
+		i__1 = k - 1;
+		zscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
+	    } else {
+		if (k > 2) {
+		    i__1 = k - 1 + kw * w_dim1;
+		    d21.r = w[i__1].r, d21.i = w[i__1].i;
+		    z_div(&z__1, &w[k + kw * w_dim1], &d21);
+		    d11.r = z__1.r, d11.i = z__1.i;
+		    z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
+		    d22.r = z__1.r, d22.i = z__1.i;
+		    z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r *
+			    d22.i + d11.i * d22.r;
+		    z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.;
+		    z_div(&z__1, &c_b1, &z__2);
+		    t.r = z__1.r, t.i = z__1.i;
+		    z_div(&z__1, &t, &d21);
+		    d21.r = z__1.r, d21.i = z__1.i;
+		    i__1 = k - 2;
+		    for (j = 1; j <= i__1; ++j) {
+			i__2 = j + (k - 1) * a_dim1;
+			i__3 = j + (kw - 1) * w_dim1;
+			z__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
+				z__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
+				.r;
+			i__4 = j + kw * w_dim1;
+			z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4]
+				.i;
+			z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i =
+				d21.r * z__2.i + d21.i * z__2.r;
+			a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+			i__2 = j + k * a_dim1;
+			i__3 = j + kw * w_dim1;
+			z__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
+				z__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
+				.r;
+			i__4 = j + (kw - 1) * w_dim1;
+			z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4]
+				.i;
+			z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i =
+				d21.r * z__2.i + d21.i * z__2.r;
+			a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+/* L20: */
+		    }
+		}
+		i__1 = k - 1 + (k - 1) * a_dim1;
+		i__2 = k - 1 + (kw - 1) * w_dim1;
+		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+		i__1 = k - 1 + k * a_dim1;
+		i__2 = k - 1 + kw * w_dim1;
+		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+		i__1 = k + k * a_dim1;
+		i__2 = k + kw * w_dim1;
+		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+	    }
+	}
+	if (kstep == 1) {
+	    ipiv[k] = kp;
+	} else {
+	    ipiv[k] = -kp;
+	    ipiv[k - 1] = -kp;
+	}
+	k -= kstep;
+	goto L10;
+L30:
+/* Apply the row interchanges recorded in ipiv to the unfactored columns */
+	j = k + 1;
+L60:
+	jj = j;
+	jp = ipiv[j];
+	if (jp < 0) {
+	    jp = -jp;
+	    ++j;
+	}
+	++j;
+	if (jp != jj && j <= *n) {
+	    i__1 = *n - j + 1;
+	    zswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
+	}
+	if (j < *n) {
+	    goto L60;
+	}
+	*kb = *n - k;
+    } else {
+/* Lower case: columns are processed from k = 1 upward */
+	k = 1;
+L70:
+	if ((k >= *nb && *nb < *n) || k > *n) {
+	    goto L90;
+	}
+	i__1 = *n - k + 1;
+	zcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
+	i__1 = *n - k + 1;
+	i__2 = k - 1;
+	z__1.r = -1., z__1.i = -0.;
+	zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, &w[k
+		+ w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12);
+	kstep = 1;
+	i__1 = k + k * w_dim1;
+	absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + k *
+		w_dim1]), abs(d__2));
+	if (k < *n) {
+	    i__1 = *n - k;
+	    imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
+	    i__1 = imax + k * w_dim1;
+	    colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
+		    k * w_dim1]), abs(d__2));
+	} else {
+	    colmax = 0.;
+	}
+	if (max(absakk,colmax) == 0.) {
+	    if (*info == 0) {
+		*info = k;
+	    }
+	    kp = k;
+	} else {
+	    if (absakk >= alpha * colmax) {
+		kp = k;
+	    } else {
+		i__1 = imax - k;
+		zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
+			w_dim1], &c__1);
+		i__1 = *n - imax + 1;
+		zcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
+			1) * w_dim1], &c__1);
+		i__1 = *n - k + 1;
+		i__2 = k - 1;
+		z__1.r = -1., z__1.i = -0.;
+		zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1],
+			lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) *
+			w_dim1], &c__1, (ftnlen)12);
+		i__1 = imax - k;
+		jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
+			;
+		i__1 = jmax + (k + 1) * w_dim1;
+		rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
+			jmax + (k + 1) * w_dim1]), abs(d__2));
+		if (imax < *n) {
+		    i__1 = *n - imax;
+		    jmax = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) *
+			    w_dim1], &c__1);
+/* Computing MAX */
+		    i__1 = jmax + (k + 1) * w_dim1;
+		    d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + (
+			    d__2 = d_imag(&w[jmax + (k + 1) * w_dim1]), abs(
+			    d__2));
+		    rowmax = max(d__3,d__4);
+		}
+		if (absakk >= alpha * colmax * (colmax / rowmax)) {
+		    kp = k;
+		} else /* if(complicated condition) */ {
+		    i__1 = imax + (k + 1) * w_dim1;
+		    if ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
+			    imax + (k + 1) * w_dim1]), abs(d__2)) >= alpha *
+			    rowmax) {
+			kp = imax;
+			i__1 = *n - k + 1;
+			zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k +
+				k * w_dim1], &c__1);
+		    } else {
+			kp = imax;
+			kstep = 2;
+		    }
+		}
+	    }
+	    kk = k + kstep - 1;
+	    if (kp != kk) {
+		i__1 = kp + kp * a_dim1;
+		i__2 = kk + kk * a_dim1;
+		a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
+		i__1 = kp - kk - 1;
+		zcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
+			1) * a_dim1], lda);
+		if (kp < *n) {
+		    i__1 = *n - kp;
+		    zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
+			    + kp * a_dim1], &c__1);
+		}
+		if (k > 1) {
+		    i__1 = k - 1;
+		    zswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
+		}
+		zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
+	    }
+	    if (kstep == 1) {
+		i__1 = *n - k + 1;
+		zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
+			c__1);
+		if (k < *n) {
+		    z_div(&z__1, &c_b1, &a[k + k * a_dim1]);
+		    r1.r = z__1.r, r1.i = z__1.i;
+		    i__1 = *n - k;
+		    zscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
+		}
+	    } else {
+		if (k < *n - 1) {
+		    i__1 = k + 1 + k * w_dim1;
+		    d21.r = w[i__1].r, d21.i = w[i__1].i;
+		    z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
+		    d11.r = z__1.r, d11.i = z__1.i;
+		    z_div(&z__1, &w[k + k * w_dim1], &d21);
+		    d22.r = z__1.r, d22.i = z__1.i;
+		    z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r *
+			    d22.i + d11.i * d22.r;
+		    z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.;
+		    z_div(&z__1, &c_b1, &z__2);
+		    t.r = z__1.r, t.i = z__1.i;
+		    z_div(&z__1, &t, &d21);
+		    d21.r = z__1.r, d21.i = z__1.i;
+		    i__1 = *n;
+		    for (j = k + 2; j <= i__1; ++j) {
+			i__2 = j + k * a_dim1;
+			i__3 = j + k * w_dim1;
+			z__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
+				z__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
+				.r;
+			i__4 = j + (k + 1) * w_dim1;
+			z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4]
+				.i;
+			z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i =
+				d21.r * z__2.i + d21.i * z__2.r;
+			a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+			i__2 = j + (k + 1) * a_dim1;
+			i__3 = j + (k + 1) * w_dim1;
+			z__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
+				z__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
+				.r;
+			i__4 = j + k * w_dim1;
+			z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4]
+				.i;
+			z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i =
+				d21.r * z__2.i + d21.i * z__2.r;
+			a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+/* L80: */
+		    }
+		}
+		i__1 = k + k * a_dim1;
+		i__2 = k + k * w_dim1;
+		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+		i__1 = k + 1 + k * a_dim1;
+		i__2 = k + 1 + k * w_dim1;
+		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+		i__1 = k + 1 + (k + 1) * a_dim1;
+		i__2 = k + 1 + (k + 1) * w_dim1;
+		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+	    }
+	}
+	if (kstep == 1) {
+	    ipiv[k] = kp;
+	} else {
+	    ipiv[k] = -kp;
+	    ipiv[k + 1] = -kp;
+	}
+	k += kstep;
+	goto L70;
+L90:
+/* Apply the row interchanges recorded in ipiv to the unfactored columns */
+	j = k - 1;
+L120:
+	jj = j;
+	jp = ipiv[j];
+	if (jp < 0) {
+	    jp = -jp;
+	    --j;
+	}
+	--j;
+	if (jp != jj && j >= 1) {
+	    zswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
+	}
+	if (j > 1) {
+	    goto L120;
+	}
+	*kb = k - 1;
+    }
+    return;
+}
diff --git a/relapack/src/zsytrf_rook.c b/relapack/src/zsytrf_rook.c
new file mode 100644
index 000000000..c598f7b1e
--- /dev/null
+++ b/relapack/src/zsytrf_rook.c
@@ -0,0 +1,236 @@
+#include "relapack.h"
+#if XSYTRF_ALLOW_MALLOC
+#include <stdlib.h>
+#endif
+
+static void RELAPACK_zsytrf_rook_rec(const char *, const int *, const int *, int *,
+ double *, const int *, int *, double *, const int *, int *);
+
+
+/** ZSYTRF_ROOK computes the factorization of a complex symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+ *
+ * This routine is functionally equivalent to LAPACK's zsytrf_rook.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d6/d6e/zsytrf__rook_8f.html
+ * */
+void RELAPACK_zsytrf_rook(
+    const char *uplo, const int *n,
+    double *A, const int *ldA, int *ipiv,
+    double *Work, const int *lWork, int *info
+) {
+
+    // Workspace actually needed by the recursive kernel
+    const int neededWork = *n * (*n / 2);
+    // Smallest workspace a caller may legally pass
+    int minWork = neededWork;
+#if XSYTRF_ALLOW_MALLOC
+    minWork = 1;
+#endif
+
+    // Argument checks (LAPACK convention: *info = -<argument index>)
+    const int isLower = LAPACK(lsame)(uplo, "L");
+    const int isUpper = LAPACK(lsame)(uplo, "U");
+    *info = 0;
+    if (!(isLower || isUpper))
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*ldA < MAX(1, *n))
+        *info = -4;
+    else if (*lWork < minWork && *lWork != -1)
+        *info = -7;
+    else if (*lWork == -1) {
+        // Workspace query: report the required size and return
+        *Work = neededWork;
+        return;
+    }
+
+    // If the provided buffer is too small, allocate a sufficient one
+    double *workBuf = Work;
+#if XSYTRF_ALLOW_MALLOC
+    if (!*info && *lWork < neededWork) {
+        workBuf = malloc(neededWork * 2 * sizeof(double));
+        if (!workBuf)
+            *info = -7;
+    }
+#endif
+
+    if (*info) {
+        const int negInfo = -*info;
+        LAPACK(xerbla)("ZSYTRF", &negInfo);
+        return;
+    }
+
+    // Normalized uplo flag
+    const char uploClean = isLower ? 'L' : 'U';
+
+    // Receives the number of columns factorized (not used by the wrapper)
+    int nOut;
+
+    // Recursive kernel
+    RELAPACK_zsytrf_rook_rec(&uploClean, n, n, &nOut, A, ldA, ipiv, workBuf, n, info);
+
+#if XSYTRF_ALLOW_MALLOC
+    if (workBuf != Work)
+        free(workBuf);
+#endif
+}
+
+
+/** zsytrf_rook's recursive compute kernel */
+static void RELAPACK_zsytrf_rook_rec(
+    const char *uplo, const int *n_full, const int *n, int *n_out,
+    double *A, const int *ldA, int *ipiv,
+    double *Work, const int *ldWork, int *info
+) {
+
+    // *n_full is the size of the full panel containing the *n columns to be
+    // factorized; *n_out reports how many of the requested *n columns were
+    // actually factorized -- the rook-pivoted unblocked kernel may complete
+    // one column less (see the "undo 1 column" blocks below).
+
+    // top recursion level?
+    const int top = *n_full == *n;
+
+    if (*n <= MAX(CROSSOVER_ZSYTRF_ROOK, 3)) {
+        // Unblocked
+        if (top) {
+            LAPACK(zsytf2)(uplo, n, A, ldA, ipiv, info);
+            *n_out = *n;
+        } else
+            RELAPACK_zsytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
+        return;
+    }
+
+    int info1, info2;
+
+    // Constants (complex 1 and -1; integer increment 1 for BLAS calls)
+    const double ONE[] = { 1., 0. };
+    const double MONE[] = { -1., 0. };
+    const int iONE[] = { 1 };
+
+    // number of rows of the full panel outside the *n columns handled here
+    const int n_rest = *n_full - *n;
+
+    if (*uplo == 'L') {
+        // Splitting (setup)
+        int n1 = ZREC_SPLIT(*n);
+        int n2 = *n - n1;
+
+        // Work_L *
+        double *const Work_L = Work;
+
+        // recursion(A_L)
+        int n1_out;
+        RELAPACK_zsytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
+        n1 = n1_out;
+
+        // Splitting (continued)
+        // re-split using the number of columns actually factorized
+        n2 = *n - n1;
+        const int n_full2 = *n_full - n1;
+
+        // *      *
+        // A_BL   A_BR
+        // A_BL_B A_BR_B
+        // (factor of 2: interleaved real/imaginary parts of complex entries)
+        double *const A_BL = A + 2 * n1;
+        double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
+        double *const A_BL_B = A + 2 * *n;
+        double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
+
+        // *       *
+        // Work_BL Work_BR
+        // *       *
+        // (top recursion level: use Work as Work_BR)
+        double *const Work_BL = Work + 2 * n1;
+        double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
+        const int ldWork_BR = top ? n2 : *ldWork;
+
+        // ipiv_T
+        // ipiv_B
+        int *const ipiv_B = ipiv + n1;
+
+        // A_BR = A_BR - A_BL Work_BL'
+        RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
+        BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
+
+        // recursion(A_BR)
+        int n2_out;
+        RELAPACK_zsytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
+
+        if (n2_out != n2) {
+            // undo 1 column of updates
+            // (the recursion factored one column less than requested, so the
+            // trailing update applied to that column must be reverted)
+            const int n_restp1 = n_rest + 1;
+
+            // last column of A_BR
+            double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
+
+            // last row of A_BL
+            double *const A_BL_b = A_BL + 2 * n2_out;
+
+            // last row of Work_BL
+            double *const Work_BL_b = Work_BL + 2 * n2_out;
+
+            // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
+            BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
+        }
+        n2 = n2_out;
+
+        // shift pivots
+        // (translate the recursion's local pivot indices; sign encodes 2x2 pivots)
+        int i;
+        for (i = 0; i < n2; i++)
+            if (ipiv_B[i] > 0)
+                ipiv_B[i] += n1;
+            else
+                ipiv_B[i] -= n1;
+
+        *info = info1 || info2;
+        *n_out = n1 + n2;
+    } else {
+        // Splitting (setup)
+        int n2 = ZREC_SPLIT(*n);
+        int n1 = *n - n2;
+
+        // * Work_R
+        // (top recursion level: use Work as Work_R)
+        double *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
+
+        // recursion(A_R)
+        int n2_out;
+        RELAPACK_zsytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
+        const int n2_diff = n2 - n2_out;
+        n2 = n2_out;
+
+        // Splitting (continued)
+        // re-split using the number of columns actually factorized
+        n1 = *n - n2;
+        const int n_full1 = *n_full - n2;
+
+        // * A_TL_T A_TR_T
+        // * A_TL   A_TR
+        // * *      *
+        double *const A_TL_T = A + 2 * *ldA * n_rest;
+        double *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
+        double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
+        double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
+
+        // Work_L *
+        // *      Work_TR
+        // *      *
+        // (top recursion level: Work_R was Work)
+        double *const Work_L = Work;
+        double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
+        const int ldWork_L = top ? n1 : *ldWork;
+
+        // A_TL = A_TL - A_TR Work_TR'
+        RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
+        BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
+
+        // recursion(A_TL)
+        int n1_out;
+        RELAPACK_zsytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
+
+        if (n1_out != n1) {
+            // undo 1 column of updates
+            const int n_restp1 = n_rest + 1;
+
+            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
+            BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
+        }
+        n1 = n1_out;
+
+        *info = info2 || info1;
+        *n_out = n1 + n2;
+    }
+}
diff --git a/relapack/src/zsytrf_rook_rec2.c b/relapack/src/zsytrf_rook_rec2.c
new file mode 100644
index 000000000..9e111fe0c
--- /dev/null
+++ b/relapack/src/zsytrf_rook_rec2.c
@@ -0,0 +1,561 @@
+/* -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static doublecomplex c_b1 = {1.,0.};
+static int c__1 = 1;
+
+/** ZSYTRF_ROOK_REC2 computes a partial factorization of a complex symmetric matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+ *
+ * This routine is a minor modification of LAPACK's zlasyf_rook.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
+ * */
+/* Subroutine */ void RELAPACK_zsytrf_rook_rec2(char *uplo, int *n,
+	int *nb, int *kb, doublecomplex *a, int *lda, int *
+	ipiv, doublecomplex *w, int *ldw, int *info, ftnlen uplo_len)
+{
+    /* System generated locals */
+    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
+    double d__1, d__2;
+    doublecomplex z__1, z__2, z__3, z__4;
+
+    /* Builtin functions */
+    double sqrt(double), d_imag(doublecomplex *);
+    void z_div(doublecomplex *, doublecomplex *, doublecomplex *);
+
+    /* Local variables */
+/* NOTE(review): f2c emits these locals as "static", so this routine is not */
+/* reentrant and not thread-safe. */
+    static int j, k, p;
+    static doublecomplex t, r1, d11, d12, d21, d22;
+    static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
+    static logical done;
+    static int imax, jmax;
+    static double alpha;
+    extern logical lsame_(char *, char *, ftnlen, ftnlen);
+    static double dtemp, sfmin;
+    extern /* Subroutine */ int zscal_(int *, doublecomplex *,
+	    doublecomplex *, int *);
+    static int itemp, kstep;
+    extern /* Subroutine */ int zgemv_(char *, int *, int *,
+	    doublecomplex *, doublecomplex *, int *, doublecomplex *,
+	    int *, doublecomplex *, doublecomplex *, int *, ftnlen),
+	    zcopy_(int *, doublecomplex *, int *, doublecomplex *,
+	    int *), zswap_(int *, doublecomplex *, int *,
+	    doublecomplex *, int *);
+    extern double dlamch_(char *, ftnlen);
+    static double absakk, colmax;
+    extern int izamax_(int *, doublecomplex *, int *);
+    static double rowmax;
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --ipiv;
+    w_dim1 = *ldw;
+    w_offset = 1 + w_dim1;
+    w -= w_offset;
+
+    /* Function Body */
+    *info = 0;
+/* alpha = (1 + sqrt(17)) / 8 is the Bunch-Kaufman pivot-selection threshold */
+    alpha = (sqrt(17.) + 1.) / 8.;
+    sfmin = dlamch_("S", (ftnlen)1);
+    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+/* Upper triangular storage: factorize columns k = n, n-1, ...; column */
+/* updates are accumulated in the workspace W (cf. LAPACK's zlasyf_rook). */
+	k = *n;
+L10:
+	kw = *nb + k - *n;
+	if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
+	    goto L30;
+	}
+	kstep = 1;
+	p = k;
+	zcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
+	if (k < *n) {
+	    i__1 = *n - k;
+	    z__1.r = -1., z__1.i = -0.;
+	    zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1],
+		     lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
+		    w_dim1 + 1], &c__1, (ftnlen)12);
+	}
+	i__1 = k + kw * w_dim1;
+	absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + kw *
+		w_dim1]), abs(d__2));
+	if (k > 1) {
+	    i__1 = k - 1;
+	    imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+	    i__1 = imax + kw * w_dim1;
+	    colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
+		    kw * w_dim1]), abs(d__2));
+	} else {
+	    colmax = 0.;
+	}
+	if (max(absakk,colmax) == 0.) {
+	    if (*info == 0) {
+		*info = k;
+	    }
+	    kp = k;
+	    zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
+	} else {
+	    if (! (absakk < alpha * colmax)) {
+		kp = k;
+	    } else {
+		done = FALSE_;
+L12:
+		zcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
+			w_dim1 + 1], &c__1);
+		i__1 = k - imax;
+		zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
+			1 + (kw - 1) * w_dim1], &c__1);
+		if (k < *n) {
+		    i__1 = *n - k;
+		    z__1.r = -1., z__1.i = -0.;
+		    zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) *
+			    a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
+			    ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
+			    ftnlen)12);
+		}
+		if (imax != k) {
+		    i__1 = k - imax;
+		    jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) *
+			    w_dim1], &c__1);
+		    i__1 = jmax + (kw - 1) * w_dim1;
+		    rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&
+			    w[jmax + (kw - 1) * w_dim1]), abs(d__2));
+		} else {
+		    rowmax = 0.;
+		}
+		if (imax > 1) {
+		    i__1 = imax - 1;
+		    itemp = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
+		    i__1 = itemp + (kw - 1) * w_dim1;
+		    dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
+			    itemp + (kw - 1) * w_dim1]), abs(d__2));
+		    if (dtemp > rowmax) {
+			rowmax = dtemp;
+			jmax = itemp;
+		    }
+		}
+		i__1 = imax + (kw - 1) * w_dim1;
+		if (! ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax
+			+ (kw - 1) * w_dim1]), abs(d__2)) < alpha * rowmax)) {
+		    kp = imax;
+		    zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
+			    w_dim1 + 1], &c__1);
+		    done = TRUE_;
+		} else if (p == jmax || rowmax <= colmax) {
+		    kp = imax;
+		    kstep = 2;
+		    done = TRUE_;
+		} else {
+		    p = imax;
+		    colmax = rowmax;
+		    imax = jmax;
+		    zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
+			    w_dim1 + 1], &c__1);
+		}
+		if (! done) {
+		    goto L12;
+		}
+	    }
+	    kk = k - kstep + 1;
+	    kkw = *nb + kk - *n;
+	    if (kstep == 2 && p != k) {
+		i__1 = k - p;
+		zcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) *
+			a_dim1], lda);
+		zcopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], &
+			c__1);
+		i__1 = *n - k + 1;
+		zswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1],
+			lda);
+		i__1 = *n - kk + 1;
+		zswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1],
+			 ldw);
+	    }
+	    if (kp != kk) {
+		i__1 = kp + k * a_dim1;
+		i__2 = kk + k * a_dim1;
+		a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
+		i__1 = k - 1 - kp;
+		zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
+			1) * a_dim1], lda);
+		zcopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], &
+			c__1);
+		i__1 = *n - kk + 1;
+		zswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1],
+			 lda);
+		i__1 = *n - kk + 1;
+		zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
+			w_dim1], ldw);
+	    }
+	    if (kstep == 1) {
+		zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
+			c__1);
+		if (k > 1) {
+		    i__1 = k + k * a_dim1;
+		    if ((d__1 = a[i__1].r, abs(d__1)) + (d__2 = d_imag(&a[k +
+			    k * a_dim1]), abs(d__2)) >= sfmin) {
+			z_div(&z__1, &c_b1, &a[k + k * a_dim1]);
+			r1.r = z__1.r, r1.i = z__1.i;
+			i__1 = k - 1;
+			zscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
+		    } else /* if(complicated condition) */ {
+			i__1 = k + k * a_dim1;
+			if (a[i__1].r != 0. || a[i__1].i != 0.) {
+			    i__1 = k - 1;
+			    for (ii = 1; ii <= i__1; ++ii) {
+				i__2 = ii + k * a_dim1;
+				z_div(&z__1, &a[ii + k * a_dim1], &a[k + k *
+					a_dim1]);
+				a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+/* L14: */
+			    }
+			}
+		    }
+		}
+	    } else {
+		if (k > 2) {
+		    i__1 = k - 1 + kw * w_dim1;
+		    d12.r = w[i__1].r, d12.i = w[i__1].i;
+		    z_div(&z__1, &w[k + kw * w_dim1], &d12);
+		    d11.r = z__1.r, d11.i = z__1.i;
+		    z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d12);
+		    d22.r = z__1.r, d22.i = z__1.i;
+		    z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r *
+			    d22.i + d11.i * d22.r;
+		    z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.;
+		    z_div(&z__1, &c_b1, &z__2);
+		    t.r = z__1.r, t.i = z__1.i;
+		    i__1 = k - 2;
+		    for (j = 1; j <= i__1; ++j) {
+			i__2 = j + (k - 1) * a_dim1;
+			i__3 = j + (kw - 1) * w_dim1;
+			z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
+				z__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
+				.r;
+			i__4 = j + kw * w_dim1;
+			z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
+				.i;
+			z_div(&z__2, &z__3, &d12);
+			z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r *
+				z__2.i + t.i * z__2.r;
+			a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+			i__2 = j + k * a_dim1;
+			i__3 = j + kw * w_dim1;
+			z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
+				z__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
+				.r;
+			i__4 = j + (kw - 1) * w_dim1;
+			z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
+				.i;
+			z_div(&z__2, &z__3, &d12);
+			z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r *
+				z__2.i + t.i * z__2.r;
+			a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+/* L20: */
+		    }
+		}
+		i__1 = k - 1 + (k - 1) * a_dim1;
+		i__2 = k - 1 + (kw - 1) * w_dim1;
+		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+		i__1 = k - 1 + k * a_dim1;
+		i__2 = k - 1 + kw * w_dim1;
+		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+		i__1 = k + k * a_dim1;
+		i__2 = k + kw * w_dim1;
+		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+	    }
+	}
+	if (kstep == 1) {
+	    ipiv[k] = kp;
+	} else {
+	    ipiv[k] = -p;
+	    ipiv[k - 1] = -kp;
+	}
+	k -= kstep;
+	goto L10;
+/* Apply the row interchanges recorded in ipiv to the remaining columns */
+/* (mirrors the post-processing loop of LAPACK's zlasyf_rook). */
+L30:
+	j = k + 1;
+L60:
+	kstep = 1;
+	jp1 = 1;
+	jj = j;
+	jp2 = ipiv[j];
+	if (jp2 < 0) {
+	    jp2 = -jp2;
+	    ++j;
+	    jp1 = -ipiv[j];
+	    kstep = 2;
+	}
+	++j;
+	if (jp2 != jj && j <= *n) {
+	    i__1 = *n - j + 1;
+	    zswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
+		    ;
+	}
+	jj = j - 1;
+	if (jp1 != jj && kstep == 2) {
+	    i__1 = *n - j + 1;
+	    zswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda)
+		    ;
+	}
+	if (j <= *n) {
+	    goto L60;
+	}
+	*kb = *n - k;
+    } else {
+/* Lower triangular storage: factorize columns k = 1, 2, ... */
+	k = 1;
+L70:
+	if ((k >= *nb && *nb < *n) || k > *n) {
+	    goto L90;
+	}
+	kstep = 1;
+	p = k;
+	i__1 = *n - k + 1;
+	zcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
+	if (k > 1) {
+	    i__1 = *n - k + 1;
+	    i__2 = k - 1;
+	    z__1.r = -1., z__1.i = -0.;
+	    zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, &
+		    w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (
+		    ftnlen)12);
+	}
+	i__1 = k + k * w_dim1;
+	absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + k *
+		w_dim1]), abs(d__2));
+	if (k < *n) {
+	    i__1 = *n - k;
+	    imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
+	    i__1 = imax + k * w_dim1;
+	    colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
+		    k * w_dim1]), abs(d__2));
+	} else {
+	    colmax = 0.;
+	}
+	if (max(absakk,colmax) == 0.) {
+	    if (*info == 0) {
+		*info = k;
+	    }
+	    kp = k;
+	    i__1 = *n - k + 1;
+	    zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
+		    c__1);
+	} else {
+	    if (! (absakk < alpha * colmax)) {
+		kp = k;
+	    } else {
+		done = FALSE_;
+L72:
+		i__1 = imax - k;
+		zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
+			w_dim1], &c__1);
+		i__1 = *n - imax + 1;
+		zcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
+			1) * w_dim1], &c__1);
+		if (k > 1) {
+		    i__1 = *n - k + 1;
+		    i__2 = k - 1;
+		    z__1.r = -1., z__1.i = -0.;
+		    zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1]
+			    , lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k +
+			    1) * w_dim1], &c__1, (ftnlen)12);
+		}
+		if (imax != k) {
+		    i__1 = imax - k;
+		    jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], &
+			    c__1);
+		    i__1 = jmax + (k + 1) * w_dim1;
+		    rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&
+			    w[jmax + (k + 1) * w_dim1]), abs(d__2));
+		} else {
+		    rowmax = 0.;
+		}
+		if (imax < *n) {
+		    i__1 = *n - imax;
+		    itemp = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) *
+			    w_dim1], &c__1);
+		    i__1 = itemp + (k + 1) * w_dim1;
+		    dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[
+			    itemp + (k + 1) * w_dim1]), abs(d__2));
+		    if (dtemp > rowmax) {
+			rowmax = dtemp;
+			jmax = itemp;
+		    }
+		}
+		i__1 = imax + (k + 1) * w_dim1;
+		if (! ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax
+			+ (k + 1) * w_dim1]), abs(d__2)) < alpha * rowmax)) {
+		    kp = imax;
+		    i__1 = *n - k + 1;
+		    zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
+			    w_dim1], &c__1);
+		    done = TRUE_;
+		} else if (p == jmax || rowmax <= colmax) {
+		    kp = imax;
+		    kstep = 2;
+		    done = TRUE_;
+		} else {
+		    p = imax;
+		    colmax = rowmax;
+		    imax = jmax;
+		    i__1 = *n - k + 1;
+		    zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k *
+			    w_dim1], &c__1);
+		}
+		if (! done) {
+		    goto L72;
+		}
+	    }
+	    kk = k + kstep - 1;
+	    if (kstep == 2 && p != k) {
+		i__1 = p - k;
+		zcopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1],
+			lda);
+		i__1 = *n - p + 1;
+		zcopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], &
+			c__1);
+		zswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda);
+		zswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw);
+	    }
+	    if (kp != kk) {
+		i__1 = kp + k * a_dim1;
+		i__2 = kk + k * a_dim1;
+		a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
+		i__1 = kp - k - 1;
+		zcopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1)
+			* a_dim1], lda);
+		i__1 = *n - kp + 1;
+		zcopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp *
+			a_dim1], &c__1);
+		zswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
+		zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
+	    }
+	    if (kstep == 1) {
+		i__1 = *n - k + 1;
+		zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
+			c__1);
+		if (k < *n) {
+		    i__1 = k + k * a_dim1;
+		    if ((d__1 = a[i__1].r, abs(d__1)) + (d__2 = d_imag(&a[k +
+			    k * a_dim1]), abs(d__2)) >= sfmin) {
+			z_div(&z__1, &c_b1, &a[k + k * a_dim1]);
+			r1.r = z__1.r, r1.i = z__1.i;
+			i__1 = *n - k;
+			zscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
+		    } else /* if(complicated condition) */ {
+			i__1 = k + k * a_dim1;
+			if (a[i__1].r != 0. || a[i__1].i != 0.) {
+			    i__1 = *n;
+			    for (ii = k + 1; ii <= i__1; ++ii) {
+				i__2 = ii + k * a_dim1;
+				z_div(&z__1, &a[ii + k * a_dim1], &a[k + k *
+					a_dim1]);
+				a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+/* L74: */
+			    }
+			}
+		    }
+		}
+	    } else {
+		if (k < *n - 1) {
+		    i__1 = k + 1 + k * w_dim1;
+		    d21.r = w[i__1].r, d21.i = w[i__1].i;
+		    z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
+		    d11.r = z__1.r, d11.i = z__1.i;
+		    z_div(&z__1, &w[k + k * w_dim1], &d21);
+		    d22.r = z__1.r, d22.i = z__1.i;
+		    z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r *
+			    d22.i + d11.i * d22.r;
+		    z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.;
+		    z_div(&z__1, &c_b1, &z__2);
+		    t.r = z__1.r, t.i = z__1.i;
+		    i__1 = *n;
+		    for (j = k + 2; j <= i__1; ++j) {
+			i__2 = j + k * a_dim1;
+			i__3 = j + k * w_dim1;
+			z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
+				z__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
+				.r;
+			i__4 = j + (k + 1) * w_dim1;
+			z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
+				.i;
+			z_div(&z__2, &z__3, &d21);
+			z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r *
+				z__2.i + t.i * z__2.r;
+			a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+			i__2 = j + (k + 1) * a_dim1;
+			i__3 = j + (k + 1) * w_dim1;
+			z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
+				z__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
+				.r;
+			i__4 = j + k * w_dim1;
+			z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4]
+				.i;
+			z_div(&z__2, &z__3, &d21);
+			z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r *
+				z__2.i + t.i * z__2.r;
+			a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+/* L80: */
+		    }
+		}
+		i__1 = k + k * a_dim1;
+		i__2 = k + k * w_dim1;
+		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+		i__1 = k + 1 + k * a_dim1;
+		i__2 = k + 1 + k * w_dim1;
+		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+		i__1 = k + 1 + (k + 1) * a_dim1;
+		i__2 = k + 1 + (k + 1) * w_dim1;
+		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
+	    }
+	}
+	if (kstep == 1) {
+	    ipiv[k] = kp;
+	} else {
+	    ipiv[k] = -p;
+	    ipiv[k + 1] = -kp;
+	}
+	k += kstep;
+	goto L70;
+/* Apply the row interchanges recorded in ipiv to the leading columns */
+/* (mirrors the post-processing loop of LAPACK's zlasyf_rook). */
+L90:
+	j = k - 1;
+L120:
+	kstep = 1;
+	jp1 = 1;
+	jj = j;
+	jp2 = ipiv[j];
+	if (jp2 < 0) {
+	    jp2 = -jp2;
+	    --j;
+	    jp1 = -ipiv[j];
+	    kstep = 2;
+	}
+	--j;
+	if (jp2 != jj && j >= 1) {
+	    zswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda);
+	}
+	jj = j + 1;
+	if (jp1 != jj && kstep == 2) {
+	    zswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda);
+	}
+	if (j >= 1) {
+	    goto L120;
+	}
+	*kb = k - 1;
+    }
+    return;
+}
diff --git a/relapack/src/ztgsyl.c b/relapack/src/ztgsyl.c
new file mode 100644
index 000000000..2c8a35256
--- /dev/null
+++ b/relapack/src/ztgsyl.c
@@ -0,0 +1,268 @@
+#include "relapack.h"
+#include <math.h>
+
+static void RELAPACK_ztgsyl_rec(const char *, const int *, const int *,
+ const int *, const double *, const int *, const double *, const int *,
+ double *, const int *, const double *, const int *, const double *,
+ const int *, double *, const int *, double *, double *, double *, int *);
+
+
+/** ZTGSYL solves the generalized Sylvester equation.
+ *
+ * This routine is functionally equivalent to LAPACK's ztgsyl.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/db/d68/ztgsyl_8f.html
+ * */
+void RELAPACK_ztgsyl(
+    const char *trans, const int *ijob, const int *m, const int *n,
+    const double *A, const int *ldA, const double *B, const int *ldB,
+    double *C, const int *ldC,
+    const double *D, const int *ldD, const double *E, const int *ldE,
+    double *F, const int *ldF,
+    double *scale, double *dif,
+    double *Work, const int *lWork, int *iWork, int *info
+) {
+
+    // Decode the trans argument
+    const int isNoTrans = LAPACK(lsame)(trans, "N");
+    const int isCTrans = LAPACK(lsame)(trans, "C");
+
+    // Minimal workspace (only ijob 1/2 in the untransposed case needs any)
+    int lWorkMin = 1;
+    if (isNoTrans && (*ijob == 1 || *ijob == 2))
+        lWorkMin = MAX(1, 2 * *m * *n);
+    *info = 0;
+
+    // Argument checks (LAPACK convention: *info = -<argument index>)
+    if (!isCTrans && !isNoTrans)
+        *info = -1;
+    else if (isNoTrans && (*ijob < 0 || *ijob > 4))
+        *info = -2;
+    else if (*m <= 0)
+        *info = -3;
+    else if (*n <= 0)
+        *info = -4;
+    else if (*ldA < MAX(1, *m))
+        *info = -6;
+    else if (*ldB < MAX(1, *n))
+        *info = -8;
+    else if (*ldC < MAX(1, *m))
+        *info = -10;
+    else if (*ldD < MAX(1, *m))
+        *info = -12;
+    else if (*ldE < MAX(1, *n))
+        *info = -14;
+    else if (*ldF < MAX(1, *m))
+        *info = -16;
+    else if (*lWork < lWorkMin && *lWork != -1)
+        *info = -20;
+    if (*info) {
+        const int negInfo = -*info;
+        LAPACK(xerbla)("ZTGSYL", &negInfo);
+        return;
+    }
+
+    if (*lWork == -1) {
+        // Workspace query: report the required size and return
+        *Work = lWorkMin;
+        return;
+    }
+
+    // Normalized trans flag
+    const char transClean = isNoTrans ? 'N' : 'C';
+
+    // Complex zero
+    const double CZERO[] = { 0., 0. };
+
+    int nSolves = 1;
+    int funcFlag = 0;
+    if (isNoTrans) {
+        if (*ijob >= 3) {
+            // Dif estimate only: solve with zeroed right-hand sides
+            funcFlag = *ijob - 2;
+            LAPACK(zlaset)("F", m, n, CZERO, CZERO, C, ldC);
+            LAPACK(zlaset)("F", m, n, CZERO, CZERO, F, ldF);
+        } else if (*ijob >= 1)
+            // Solve first, then estimate Dif in a second pass
+            nSolves = 2;
+    }
+
+    double scaleSaved;
+    int round;
+    for (round = 1; round <= nSolves; round++) {
+        *scale = 1;
+        double dScale = 0;
+        double dSum = 1;
+        RELAPACK_ztgsyl_rec(&transClean, &funcFlag, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dSum, &dScale, info);
+        if (dScale != 0) {
+            // Dif^-1 estimate accumulated by the ztgsy2 kernels
+            if (*ijob == 1 || *ijob == 3)
+                *dif = sqrt(2 * *m * *n) / (dScale * sqrt(dSum));
+            else
+                *dif = sqrt(*m * *n) / (dScale * sqrt(dSum));
+        }
+        if (nSolves == 2) {
+            if (round == 1) {
+                // Save the solution, then redo a zero-RHS solve for Dif
+                if (isNoTrans)
+                    funcFlag = *ijob;
+                scaleSaved = *scale;
+                LAPACK(zlacpy)("F", m, n, C, ldC, Work, m);
+                LAPACK(zlacpy)("F", m, n, F, ldF, Work + 2 * *m * *n, m);
+                LAPACK(zlaset)("F", m, n, CZERO, CZERO, C, ldC);
+                LAPACK(zlaset)("F", m, n, CZERO, CZERO, F, ldF);
+            } else {
+                // Restore the saved solution and its scale
+                LAPACK(zlacpy)("F", m, n, Work, m, C, ldC);
+                LAPACK(zlacpy)("F", m, n, Work + 2 * *m * *n, m, F, ldF);
+                *scale = scaleSaved;
+            }
+        }
+    }
+}
+
+
+/** ztgsyl's recursive compute kernel */
+static void RELAPACK_ztgsyl_rec(
+    const char *trans, const int *ifunc, const int *m, const int *n,
+    const double *A, const int *ldA, const double *B, const int *ldB,
+    double *C, const int *ldC,
+    const double *D, const int *ldD, const double *E, const int *ldE,
+    double *F, const int *ldF,
+    double *scale, double *dsum, double *dscale,
+    int *info
+) {
+
+    // Recursively solve the generalized Sylvester equation
+    //   A R - L B = scale C,  D R - L E = scale F   (trans == 'N')
+    // or its conjugate-transposed variant; R overwrites C, L overwrites F.
+    // dsum/dscale accumulate the Dif estimate exactly as in ztgsy2.
+    if (*m <= MAX(CROSSOVER_ZTGSYL, 1) && *n <= MAX(CROSSOVER_ZTGSYL, 1)) {
+        // Unblocked
+        LAPACK(ztgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[] = { 1., 0. };
+    const double MONE[] = { -1., 0. };
+    const int iONE[] = { 1 };
+
+    // Outputs of the two half-problems
+    double scale1[] = { 1., 0. };
+    double scale2[] = { 1., 0. };
+    int info1[] = { 0 };
+    int info2[] = { 0 };
+
+    if (*m > *n) {
+        // Split the row dimension
+        const int m1 = ZREC_SPLIT(*m);
+        const int m2 = *m - m1;
+
+        // A_TL A_TR
+        // 0    A_BR
+        const double *const A_TL = A;
+        const double *const A_TR = A + 2 * *ldA * m1;
+        const double *const A_BR = A + 2 * *ldA * m1 + 2 * m1;
+
+        // C_T
+        // C_B
+        double *const C_T = C;
+        double *const C_B = C + 2 * m1;
+
+        // D_TL D_TR
+        // 0    D_BR
+        const double *const D_TL = D;
+        const double *const D_TR = D + 2 * *ldD * m1;
+        const double *const D_BR = D + 2 * *ldD * m1 + 2 * m1;
+
+        // F_T
+        // F_B
+        double *const F_T = F;
+        double *const F_B = F + 2 * m1;
+
+        if (*trans == 'N') {
+            // recursion(A_BR, B, C_B, D_BR, E, F_B)
+            RELAPACK_ztgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, info1);
+            // C_T = C_T - A_TR * C_B
+            BLAS(zgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
+            // F_T = F_T - D_TR * C_B
+            BLAS(zgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF);
+            // recursion(A_TL, B, C_T, D_TL, E, F_T)
+            RELAPACK_ztgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, info2);
+            // apply scale of the second solve to the first half
+            if (scale2[0] != 1) {
+                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
+                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info);
+            }
+        } else {
+            // recursion(A_TL, B, C_T, D_TL, E, F_T)
+            RELAPACK_ztgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, info1);
+            // apply scale
+            if (scale1[0] != 1)
+                LAPACK(zlascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info);
+            // C_B = C_B - A_TR^H * C_T
+            BLAS(zgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
+            // C_B = C_B - D_TR^H * F_T
+            // BUGFIX: F_T is a sub-block of F, so its leading dimension is
+            // ldF, not ldC (the original passed ldC).
+            BLAS(zgemm)("C", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldF, ONE, C_B, ldC);
+            // recursion(A_BR, B, C_B, D_BR, E, F_B)
+            RELAPACK_ztgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, info2);
+            // apply scale of the second solve to the first half
+            if (scale2[0] != 1) {
+                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
+                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info);
+            }
+        }
+    } else {
+        // Split the column dimension
+        const int n1 = ZREC_SPLIT(*n);
+        const int n2 = *n - n1;
+
+        // B_TL B_TR
+        // 0    B_BR
+        const double *const B_TL = B;
+        const double *const B_TR = B + 2 * *ldB * n1;
+        const double *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
+
+        // C_L C_R
+        double *const C_L = C;
+        double *const C_R = C + 2 * *ldC * n1;
+
+        // E_TL E_TR
+        // 0    E_BR
+        const double *const E_TL = E;
+        const double *const E_TR = E + 2 * *ldE * n1;
+        const double *const E_BR = E + 2 * *ldE * n1 + 2 * n1;
+
+        // F_L F_R
+        double *const F_L = F;
+        double *const F_R = F + 2 * *ldF * n1;
+
+        if (*trans == 'N') {
+            // recursion(A, B_TL, C_L, D, E_TL, F_L)
+            RELAPACK_ztgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, info1);
+            // C_R = C_R + F_L * B_TR
+            BLAS(zgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC);
+            // F_R = F_R + F_L * E_TR
+            BLAS(zgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF);
+            // recursion(A, B_BR, C_R, D, E_BR, F_R)
+            RELAPACK_ztgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, info2);
+            // apply scale of the second solve to the first half
+            if (scale2[0] != 1) {
+                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
+                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info);
+            }
+        } else {
+            // recursion(A, B_BR, C_R, D, E_BR, F_R)
+            RELAPACK_ztgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, info1);
+            // apply scale
+            if (scale1[0] != 1)
+                LAPACK(zlascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info);
+            // F_L = F_L + C_R * B_TR
+            BLAS(zgemm)("N", "C", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF);
+            // F_L = F_L + F_R * E_TR
+            // BUGFIX: E_TR is a sub-block of E, so its leading dimension is
+            // ldE, not ldB (the original passed ldB).
+            BLAS(zgemm)("N", "C", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldE, ONE, F_L, ldF);
+            // recursion(A, B_TL, C_L, D, E_TL, F_L)
+            RELAPACK_ztgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, info2);
+            // apply scale of the second solve to the first half
+            if (scale2[0] != 1) {
+                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
+                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info);
+            }
+        }
+    }
+
+    *scale = scale1[0] * scale2[0];
+    *info = info1[0] || info2[0];
+}
diff --git a/relapack/src/ztrsyl.c b/relapack/src/ztrsyl.c
new file mode 100644
index 000000000..82b2c8803
--- /dev/null
+++ b/relapack/src/ztrsyl.c
@@ -0,0 +1,163 @@
+#include "relapack.h"
+
+static void RELAPACK_ztrsyl_rec(const char *, const char *, const int *,
+ const int *, const int *, const double *, const int *, const double *,
+ const int *, double *, const int *, double *, int *);
+
+
+/** ZTRSYL solves the complex Sylvester matrix equation.
+ *
+ * This routine is functionally equivalent to LAPACK's ztrsyl.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d1/d36/ztrsyl_8f.html
+ * */
+void RELAPACK_ztrsyl(
+    const char *tranA, const char *tranB, const int *isgn,
+    const int *m, const int *n,
+    const double *A, const int *ldA, const double *B, const int *ldB,
+    double *C, const int *ldC, double *scale,
+    int *info
+) {
+
+    // Check arguments (negative info codes follow LAPACK's ztrsyl numbering)
+    const int notransA = LAPACK(lsame)(tranA, "N");
+    const int ctransA = LAPACK(lsame)(tranA, "C");
+    const int notransB = LAPACK(lsame)(tranB, "N");
+    const int ctransB = LAPACK(lsame)(tranB, "C");
+    *info = 0;
+    if (!ctransA && !notransA)
+        *info = -1;
+    else if (!ctransB && !notransB)
+        *info = -2;
+    else if (*isgn != 1 && *isgn != -1)
+        *info = -3;
+    else if (*m < 0)
+        *info = -4;
+    else if (*n < 0)
+        *info = -5;
+    else if (*ldA < MAX(1, *m))
+        *info = -7;
+    else if (*ldB < MAX(1, *n))
+        *info = -9;
+    else if (*ldC < MAX(1, *m))
+        *info = -11;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("ZTRSYL", &minfo);
+        return;
+    }
+
+    // Clean char * arguments: normalize to a single 'N'/'C' character so the
+    // recursive kernel can compare characters directly instead of via lsame
+    const char cleantranA = notransA ? 'N' : 'C';
+    const char cleantranB = notransB ? 'N' : 'C';
+
+    // Recursive kernel
+    RELAPACK_ztrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
+}
+
+
+/** ztrsyl's recursive compute kernel.
+ *
+ * Solves op(A)*X + isgn*X*op(B) = scale*C by splitting the larger of the two
+ * triangular factors, solving the two half-sized equations recursively, and
+ * coupling them with zgemm updates. Whenever the second recursive solve had
+ * to scale down to avoid overflow (scale2 != 1), the part of C computed by
+ * the first solve must be rescaled by the same factor so the whole solution
+ * is consistent. */
+static void RELAPACK_ztrsyl_rec(
+    const char *tranA, const char *tranB, const int *isgn,
+    const int *m, const int *n,
+    const double *A, const int *ldA, const double *B, const int *ldB,
+    double *C, const int *ldC, double *scale,
+    int *info
+) {
+
+    if (*m <= MAX(CROSSOVER_ZTRSYL, 1) && *n <= MAX(CROSSOVER_ZTRSYL, 1)) {
+        // Unblocked base case
+        RELAPACK_ztrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[] = { 1., 0. };
+    const double MONE[] = { -1., 0. };
+    const double MSGN[] = { -*isgn, 0. }; // -isgn as a complex scalar
+    const int iONE[] = { 1 };
+
+    // Outputs of the two recursive solves
+    double scale1[] = { 1., 0. };
+    double scale2[] = { 1., 0. };
+    int info1[] = { 0 };
+    int info2[] = { 0 };
+
+    if (*m > *n) {
+        // Splitting
+        const int m1 = ZREC_SPLIT(*m);
+        const int m2 = *m - m1;
+
+        // A_TL A_TR
+        // 0    A_BR
+        const double *const A_TL = A;
+        const double *const A_TR = A + 2 * *ldA * m1;
+        const double *const A_BR = A + 2 * *ldA * m1 + 2 * m1;
+
+        // C_T
+        // C_B
+        double *const C_T = C;
+        double *const C_B = C + 2 * m1;
+
+        if (*tranA == 'N') {
+            // recursion(A_BR, B, C_B)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1);
+            // C_T = C_T - A_TR * C_B
+            BLAS(zgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
+            // recursion(A_TL, B, C_T)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2);
+            // apply scale2 to the previously computed part (C_B, m2 x n)
+            if (scale2[0] != 1)
+                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
+        } else {
+            // recursion(A_TL, B, C_T)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1);
+            // C_B = C_B - A_TR' * C_T
+            BLAS(zgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
+            // recursion(A_BR, B, C_B)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2);
+            // apply scale2 to the previously computed part: C_T (m1 x n).
+            // (BUGFIX: this previously rescaled C_B here, leaving C_T
+            // inconsistent with the scaled bottom part.)
+            if (scale2[0] != 1)
+                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info);
+        }
+    } else {
+        // Splitting
+        const int n1 = ZREC_SPLIT(*n);
+        const int n2 = *n - n1;
+
+        // B_TL B_TR
+        // 0    B_BR
+        const double *const B_TL = B;
+        const double *const B_TR = B + 2 * *ldB * n1;
+        const double *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
+
+        // C_L C_R
+        double *const C_L = C;
+        double *const C_R = C + 2 * *ldC * n1;
+
+        if (*tranB == 'N') {
+            // recursion(A, B_TL, C_L)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1);
+            // C_R = C_R -/+ C_L * B_TR
+            BLAS(zgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC);
+            // recursion(A, B_BR, C_R)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2);
+            // apply scale2 to the previously computed part (C_L, m x n1)
+            if (scale2[0] != 1)
+                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
+        } else {
+            // recursion(A, B_BR, C_R)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1);
+            // C_L = C_L -/+ C_R * B_TR'
+            BLAS(zgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC);
+            // recursion(A, B_TL, C_L)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2);
+            // apply scale2 to the previously computed part (C_R, m x n2)
+            if (scale2[0] != 1)
+                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
+        }
+    }
+
+    // combined scaling factor and error flag of the two half problems
+    *scale = scale1[0] * scale2[0];
+    *info = info1[0] || info2[0];
+}
diff --git a/relapack/src/ztrsyl_rec2.c b/relapack/src/ztrsyl_rec2.c
new file mode 100644
index 000000000..526ab097c
--- /dev/null
+++ b/relapack/src/ztrsyl_rec2.c
@@ -0,0 +1,394 @@
+/* -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "../config.h"
+#include "f2c.h"
+
+#if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES
+// Value-returning wrapper: some BLAS builds implement the complex dot
+// products as subroutines that write their result through the first
+// argument; this adapts that convention to the function-call style the
+// f2c-translated code below expects.
+doublecomplex zdotu_fun(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy) {
+    extern void zdotu_(doublecomplex *, int *, doublecomplex *, int *, doublecomplex *, int *);
+    doublecomplex result;
+    zdotu_(&result, n, x, incx, y, incy);
+    return result;
+}
+// route all later zdotu_ references in this file through the wrapper
+#define zdotu_ zdotu_fun
+
+// Value-returning wrapper for the conjugated dot product (same
+// subroutine-vs-function convention issue as zdotu above).
+doublecomplex zdotc_fun(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy) {
+    extern void zdotc_(doublecomplex *, int *, doublecomplex *, int *, doublecomplex *, int *);
+    doublecomplex result;
+    zdotc_(&result, n, x, incx, y, incy);
+    return result;
+}
+// route all later zdotc_ references in this file through the wrapper
+#define zdotc_ zdotc_fun
+#endif
+
+#if LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES
+// Value-returning wrapper for LAPACK's complex division helper zladiv
+// (same subroutine-vs-function convention issue as the dot products above).
+doublecomplex zladiv_fun(doublecomplex *a, doublecomplex *b) {
+    extern void zladiv_(doublecomplex *, doublecomplex *, doublecomplex *);
+    doublecomplex result;
+    zladiv_(&result, a, b);
+    return result;
+}
+#define zladiv_ zladiv_fun
+#endif
+
+/* Table of constant values (f2c idiom: literals that must be passable by
+   address); c__1 is the unit stride/increment. */
+
+static int c__1 = 1;
+
+/** RELAPACK_ZTRSYL_REC2 solves the complex Sylvester matrix equation (unblocked algorithm)
+ *
+ * This routine is an exact copy of LAPACK's ztrsyl.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * */
+/* Subroutine */ void RELAPACK_ztrsyl_rec2(char *trana, char *tranb, int
+        *isgn, int *m, int *n, doublecomplex *a, int *lda,
+        doublecomplex *b, int *ldb, doublecomplex *c__, int *ldc,
+        double *scale, int *info, ftnlen trana_len, ftnlen tranb_len)
+{
+    /* System generated locals */
+    int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
+            i__3, i__4;
+    double d__1, d__2;
+    doublecomplex z__1, z__2, z__3, z__4;
+
+    /* Builtin functions */
+    double d_imag(doublecomplex *);
+    void d_cnjg(doublecomplex *, doublecomplex *);
+
+    /* Local variables */
+    /* NOTE(review): f2c emitted these locals as 'static', so this routine is
+       not reentrant and not thread-safe -- confirm callers never invoke it
+       concurrently. */
+    static int j, k, l;
+    static doublecomplex a11;
+    static double db;
+    static doublecomplex x11;
+    static double da11;
+    static doublecomplex vec;
+    static double dum[1], eps, sgn, smin;
+    static doublecomplex suml, sumr;
+    extern int lsame_(char *, char *, ftnlen, ftnlen);
+    /* Double Complex */ doublecomplex zdotc_(int *,
+            doublecomplex *, int *, doublecomplex *, int *), zdotu_(
+            int *, doublecomplex *, int *,
+            doublecomplex *, int *);
+    extern /* Subroutine */ int dlabad_(double *, double *);
+    extern double dlamch_(char *, ftnlen);
+    static double scaloc;
+    extern /* Subroutine */ int xerbla_(char *, int *, ftnlen);
+    extern double zlange_(char *, int *, int *, doublecomplex *,
+            int *, double *, ftnlen);
+    static double bignum;
+    extern /* Subroutine */ int zdscal_(int *, double *,
+            doublecomplex *, int *);
+    /* Double Complex */ doublecomplex zladiv_(doublecomplex *,
+            doublecomplex *);
+    static int notrna, notrnb;
+    static double smlnum;
+
+    /* Parameter adjustments: shift the array bases so the translated code
+       can keep Fortran's 1-based indexing. */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1;
+    b -= b_offset;
+    c_dim1 = *ldc;
+    c_offset = 1 + c_dim1;
+    c__ -= c_offset;
+
+    /* Function Body */
+    notrna = lsame_(trana, "N", (ftnlen)1, (ftnlen)1);
+    notrnb = lsame_(tranb, "N", (ftnlen)1, (ftnlen)1);
+    *info = 0;
+    if (! notrna && ! lsame_(trana, "C", (ftnlen)1, (ftnlen)1)) {
+        *info = -1;
+    } else if (! notrnb && ! lsame_(tranb, "C", (ftnlen)1, (ftnlen)1)) {
+        *info = -2;
+    } else if (*isgn != 1 && *isgn != -1) {
+        *info = -3;
+    } else if (*m < 0) {
+        *info = -4;
+    } else if (*n < 0) {
+        *info = -5;
+    } else if (*lda < max(1,*m)) {
+        *info = -7;
+    } else if (*ldb < max(1,*n)) {
+        *info = -9;
+    } else if (*ldc < max(1,*m)) {
+        *info = -11;
+    }
+    if (*info != 0) {
+        i__1 = -(*info);
+        xerbla_("ZTRSY2", &i__1, (ftnlen)6);
+        return;
+    }
+    *scale = 1.;
+    if (*m == 0 || *n == 0) {
+        return;
+    }
+    /* Machine-dependent thresholds for the overflow guards below. */
+    eps = dlamch_("P", (ftnlen)1);
+    smlnum = dlamch_("S", (ftnlen)1);
+    bignum = 1. / smlnum;
+    dlabad_(&smlnum, &bignum);
+    smlnum = smlnum * (double) (*m * *n) / eps;
+    bignum = 1. / smlnum;
+/* Computing MAX */
+    d__1 = smlnum, d__2 = eps * zlange_("M", m, m, &a[a_offset], lda, dum, (
+            ftnlen)1), d__1 = max(d__1,d__2), d__2 = eps * zlange_("M", n, n,
+            &b[b_offset], ldb, dum, (ftnlen)1);
+    /* smin: any pivot a11 with modulus below this gets perturbed (info = 1). */
+    smin = max(d__1,d__2);
+    sgn = (double) (*isgn);
+    if (notrna && notrnb) {
+        /* Case A*X + sgn*X*B = scale*C: columns l = 1..n left to right,
+           within each column rows k = m..1 bottom-up. */
+        i__1 = *n;
+        for (l = 1; l <= i__1; ++l) {
+            for (k = *m; k >= 1; --k) {
+                i__2 = *m - k;
+/* Computing MIN */
+                i__3 = k + 1;
+/* Computing MIN */
+                i__4 = k + 1;
+                z__1 = zdotu_(&i__2, &a[k + min(i__3,*m) * a_dim1], lda, &c__[
+                        min(i__4,*m) + l * c_dim1], &c__1);
+                suml.r = z__1.r, suml.i = z__1.i;
+                i__2 = l - 1;
+                z__1 = zdotu_(&i__2, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1]
+                        , &c__1);
+                sumr.r = z__1.r, sumr.i = z__1.i;
+                i__2 = k + l * c_dim1;
+                z__3.r = sgn * sumr.r, z__3.i = sgn * sumr.i;
+                z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i;
+                z__1.r = c__[i__2].r - z__2.r, z__1.i = c__[i__2].i - z__2.i;
+                vec.r = z__1.r, vec.i = z__1.i;
+                scaloc = 1.;
+                i__2 = k + k * a_dim1;
+                i__3 = l + l * b_dim1;
+                z__2.r = sgn * b[i__3].r, z__2.i = sgn * b[i__3].i;
+                z__1.r = a[i__2].r + z__2.r, z__1.i = a[i__2].i + z__2.i;
+                a11.r = z__1.r, a11.i = z__1.i;
+                da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs(
+                        d__2));
+                if (da11 <= smin) {
+                    /* near-singular pivot: perturb it and flag info = 1 */
+                    a11.r = smin, a11.i = 0.;
+                    da11 = smin;
+                    *info = 1;
+                }
+                db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs(
+                        d__2));
+                if (da11 < 1. && db > 1.) {
+                    if (db > bignum * da11) {
+                        scaloc = 1. / db;
+                    }
+                }
+                z__3.r = scaloc, z__3.i = 0.;
+                z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r *
+                        z__3.i + vec.i * z__3.r;
+                z__1 = zladiv_(&z__2, &a11);
+                x11.r = z__1.r, x11.i = z__1.i;
+                if (scaloc != 1.) {
+                    /* downscale all of C (and the global scale) to avoid overflow */
+                    i__2 = *n;
+                    for (j = 1; j <= i__2; ++j) {
+                        zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L10: */
+                    }
+                    *scale *= scaloc;
+                }
+                i__2 = k + l * c_dim1;
+                c__[i__2].r = x11.r, c__[i__2].i = x11.i;
+/* L20: */
+            }
+/* L30: */
+        }
+    } else if (! notrna && notrnb) {
+        /* Case A**H*X + sgn*X*B = scale*C: columns l = 1..n, rows k = 1..m
+           top-down. */
+        i__1 = *n;
+        for (l = 1; l <= i__1; ++l) {
+            i__2 = *m;
+            for (k = 1; k <= i__2; ++k) {
+                i__3 = k - 1;
+                z__1 = zdotc_(&i__3, &a[k * a_dim1 + 1], &c__1, &c__[l *
+                        c_dim1 + 1], &c__1);
+                suml.r = z__1.r, suml.i = z__1.i;
+                i__3 = l - 1;
+                z__1 = zdotu_(&i__3, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1]
+                        , &c__1);
+                sumr.r = z__1.r, sumr.i = z__1.i;
+                i__3 = k + l * c_dim1;
+                z__3.r = sgn * sumr.r, z__3.i = sgn * sumr.i;
+                z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i;
+                z__1.r = c__[i__3].r - z__2.r, z__1.i = c__[i__3].i - z__2.i;
+                vec.r = z__1.r, vec.i = z__1.i;
+                scaloc = 1.;
+                d_cnjg(&z__2, &a[k + k * a_dim1]);
+                i__3 = l + l * b_dim1;
+                z__3.r = sgn * b[i__3].r, z__3.i = sgn * b[i__3].i;
+                z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+                a11.r = z__1.r, a11.i = z__1.i;
+                da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs(
+                        d__2));
+                if (da11 <= smin) {
+                    a11.r = smin, a11.i = 0.;
+                    da11 = smin;
+                    *info = 1;
+                }
+                db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs(
+                        d__2));
+                if (da11 < 1. && db > 1.) {
+                    if (db > bignum * da11) {
+                        scaloc = 1. / db;
+                    }
+                }
+                z__3.r = scaloc, z__3.i = 0.;
+                z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r *
+                        z__3.i + vec.i * z__3.r;
+                z__1 = zladiv_(&z__2, &a11);
+                x11.r = z__1.r, x11.i = z__1.i;
+                if (scaloc != 1.) {
+                    i__3 = *n;
+                    for (j = 1; j <= i__3; ++j) {
+                        zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L40: */
+                    }
+                    *scale *= scaloc;
+                }
+                i__3 = k + l * c_dim1;
+                c__[i__3].r = x11.r, c__[i__3].i = x11.i;
+/* L50: */
+            }
+/* L60: */
+        }
+    } else if (! notrna && ! notrnb) {
+        /* Case A**H*X + sgn*X*B**H = scale*C: columns l = n..1 right to
+           left, rows k = 1..m top-down. */
+        for (l = *n; l >= 1; --l) {
+            i__1 = *m;
+            for (k = 1; k <= i__1; ++k) {
+                i__2 = k - 1;
+                z__1 = zdotc_(&i__2, &a[k * a_dim1 + 1], &c__1, &c__[l *
+                        c_dim1 + 1], &c__1);
+                suml.r = z__1.r, suml.i = z__1.i;
+                i__2 = *n - l;
+/* Computing MIN */
+                i__3 = l + 1;
+/* Computing MIN */
+                i__4 = l + 1;
+                z__1 = zdotc_(&i__2, &c__[k + min(i__3,*n) * c_dim1], ldc, &b[
+                        l + min(i__4,*n) * b_dim1], ldb);
+                sumr.r = z__1.r, sumr.i = z__1.i;
+                i__2 = k + l * c_dim1;
+                d_cnjg(&z__4, &sumr);
+                z__3.r = sgn * z__4.r, z__3.i = sgn * z__4.i;
+                z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i;
+                z__1.r = c__[i__2].r - z__2.r, z__1.i = c__[i__2].i - z__2.i;
+                vec.r = z__1.r, vec.i = z__1.i;
+                scaloc = 1.;
+                i__2 = k + k * a_dim1;
+                i__3 = l + l * b_dim1;
+                z__3.r = sgn * b[i__3].r, z__3.i = sgn * b[i__3].i;
+                z__2.r = a[i__2].r + z__3.r, z__2.i = a[i__2].i + z__3.i;
+                d_cnjg(&z__1, &z__2);
+                a11.r = z__1.r, a11.i = z__1.i;
+                da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs(
+                        d__2));
+                if (da11 <= smin) {
+                    a11.r = smin, a11.i = 0.;
+                    da11 = smin;
+                    *info = 1;
+                }
+                db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs(
+                        d__2));
+                if (da11 < 1. && db > 1.) {
+                    if (db > bignum * da11) {
+                        scaloc = 1. / db;
+                    }
+                }
+                z__3.r = scaloc, z__3.i = 0.;
+                z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r *
+                        z__3.i + vec.i * z__3.r;
+                z__1 = zladiv_(&z__2, &a11);
+                x11.r = z__1.r, x11.i = z__1.i;
+                if (scaloc != 1.) {
+                    i__2 = *n;
+                    for (j = 1; j <= i__2; ++j) {
+                        zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L70: */
+                    }
+                    *scale *= scaloc;
+                }
+                i__2 = k + l * c_dim1;
+                c__[i__2].r = x11.r, c__[i__2].i = x11.i;
+/* L80: */
+            }
+/* L90: */
+        }
+    } else if (notrna && ! notrnb) {
+        /* Case A*X + sgn*X*B**H = scale*C: columns l = n..1 right to left,
+           rows k = m..1 bottom-up. */
+        for (l = *n; l >= 1; --l) {
+            for (k = *m; k >= 1; --k) {
+                i__1 = *m - k;
+/* Computing MIN */
+                i__2 = k + 1;
+/* Computing MIN */
+                i__3 = k + 1;
+                z__1 = zdotu_(&i__1, &a[k + min(i__2,*m) * a_dim1], lda, &c__[
+                        min(i__3,*m) + l * c_dim1], &c__1);
+                suml.r = z__1.r, suml.i = z__1.i;
+                i__1 = *n - l;
+/* Computing MIN */
+                i__2 = l + 1;
+/* Computing MIN */
+                i__3 = l + 1;
+                z__1 = zdotc_(&i__1, &c__[k + min(i__2,*n) * c_dim1], ldc, &b[
+                        l + min(i__3,*n) * b_dim1], ldb);
+                sumr.r = z__1.r, sumr.i = z__1.i;
+                i__1 = k + l * c_dim1;
+                d_cnjg(&z__4, &sumr);
+                z__3.r = sgn * z__4.r, z__3.i = sgn * z__4.i;
+                z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i;
+                z__1.r = c__[i__1].r - z__2.r, z__1.i = c__[i__1].i - z__2.i;
+                vec.r = z__1.r, vec.i = z__1.i;
+                scaloc = 1.;
+                i__1 = k + k * a_dim1;
+                d_cnjg(&z__3, &b[l + l * b_dim1]);
+                z__2.r = sgn * z__3.r, z__2.i = sgn * z__3.i;
+                z__1.r = a[i__1].r + z__2.r, z__1.i = a[i__1].i + z__2.i;
+                a11.r = z__1.r, a11.i = z__1.i;
+                da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs(
+                        d__2));
+                if (da11 <= smin) {
+                    a11.r = smin, a11.i = 0.;
+                    da11 = smin;
+                    *info = 1;
+                }
+                db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs(
+                        d__2));
+                if (da11 < 1. && db > 1.) {
+                    if (db > bignum * da11) {
+                        scaloc = 1. / db;
+                    }
+                }
+                z__3.r = scaloc, z__3.i = 0.;
+                z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r *
+                        z__3.i + vec.i * z__3.r;
+                z__1 = zladiv_(&z__2, &a11);
+                x11.r = z__1.r, x11.i = z__1.i;
+                if (scaloc != 1.) {
+                    i__1 = *n;
+                    for (j = 1; j <= i__1; ++j) {
+                        zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L100: */
+                    }
+                    *scale *= scaloc;
+                }
+                i__1 = k + l * c_dim1;
+                c__[i__1].r = x11.r, c__[i__1].i = x11.i;
+/* L110: */
+            }
+/* L120: */
+        }
+    }
+    return;
+}
diff --git a/relapack/src/ztrtri.c b/relapack/src/ztrtri.c
new file mode 100644
index 000000000..ac9fe7bd4
--- /dev/null
+++ b/relapack/src/ztrtri.c
@@ -0,0 +1,107 @@
+#include "relapack.h"
+
+static void RELAPACK_ztrtri_rec(const char *, const char *, const int *,
+ double *, const int *, int *);
+
+
+/** ZTRTRI computes the inverse of a complex upper or lower triangular matrix A.
+ *
+ * This routine is functionally equivalent to LAPACK's ztrtri.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d1/d0e/ztrtri_8f.html
+ * */
+void RELAPACK_ztrtri(
+    const char *uplo, const char *diag, const int *n,
+    double *A, const int *ldA,
+    int *info
+) {
+
+    // Check arguments
+    const int lower = LAPACK(lsame)(uplo, "L");
+    const int upper = LAPACK(lsame)(uplo, "U");
+    const int nounit = LAPACK(lsame)(diag, "N");
+    const int unit = LAPACK(lsame)(diag, "U");
+    *info = 0;
+    if (!lower && !upper)
+        *info = -1;
+    else if (!nounit && !unit)
+        *info = -2;
+    else if (*n < 0)
+        *info = -3;
+    else if (*ldA < MAX(1, *n))
+        *info = -5;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("ZTRTRI", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleanuplo = lower ? 'L' : 'U';
+    const char cleandiag = nounit ? 'N' : 'U';
+
+    // check for singularity: a zero diagonal entry makes A non-invertible.
+    // LAPACK's convention is 1-based: info = i + 1 flags A(i+1,i+1) == 0.
+    // (BUGFIX: "*info = i" reported 0 -- i.e. success -- when the first
+    // diagonal entry was zero.)
+    if (nounit) {
+        int i;
+        for (i = 0; i < *n; i++)
+            if (A[2 * (i + *ldA * i)] == 0 && A[2 * (i + *ldA * i) + 1] == 0) {
+                *info = i + 1;
+                return;
+            }
+    }
+
+    // Recursive kernel
+    RELAPACK_ztrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info);
+}
+
+
+/** ztrtri's recursive compute kernel: invert the diagonal blocks
+ * recursively and fold the off-diagonal block into the inverse. */
+static void RELAPACK_ztrtri_rec(
+    const char *uplo, const char *diag, const int *n,
+    double *A, const int *ldA,
+    int *info
+){
+
+    if (*n <= MAX(CROSSOVER_ZTRTRI, 1)) {
+        // Unblocked base case
+        LAPACK(ztrti2)(uplo, diag, n, A, ldA, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[] = { 1. };
+    const double MONE[] = { -1. };
+
+    // Splitting
+    const int n1 = ZREC_SPLIT(*n);
+    const int n2 = *n - n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    double *const A_TL = A;
+    double *const A_TR = A + 2 * *ldA * n1;
+    double *const A_BL = A + 2 * n1;
+    double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
+
+    // recursion(A_TL): afterwards A_TL holds its own inverse
+    RELAPACK_ztrtri_rec(uplo, diag, &n1, A_TL, ldA, info);
+    if (*info)
+        return;
+
+    if (*uplo == 'L') {
+        // A_BL = - A_BL * A_TL   (A_TL already inverted)
+        BLAS(ztrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA);
+        // A_BL = A_BR \ A_BL
+        BLAS(ztrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
+    } else {
+        // A_TR = - A_TL * A_TR   (A_TL already inverted)
+        BLAS(ztrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA);
+        // A_TR = A_TR / A_BR
+        BLAS(ztrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
+    }
+
+    // recursion(A_BR)
+    RELAPACK_ztrtri_rec(uplo, diag, &n2, A_BR, ldA, info);
+    // offset info into the full matrix (A_BR starts at row/column n1)
+    if (*info)
+        *info += n1;
+}
diff --git a/relapack/test/README.md b/relapack/test/README.md
new file mode 100644
index 000000000..48434b3cf
--- /dev/null
+++ b/relapack/test/README.md
@@ -0,0 +1,48 @@
+ReLAPACK Test Suite
+===================
+This test suite compares ReLAPACK's recursive routines with LAPACK's compute
+routines in terms of accuracy: For each test-case, we execute both ReLAPACK's
+and LAPACK's routine on the same data and consider the numerical difference
+between the two solutions.
+
+This difference is computed as the maximum error across all elements of the
+routine's outputs, where the error for each element is the minimum of the
+absolute error and the relative error (with LAPACK as the reference). If the
+error is below the error bound configured in `config.h` (default: 1e-5 for
+single precision and 1e-14 for double precision) the test-case is considered as
+passed.
+
+For each routine the test-cases cover a variety of input argument combinations
+to ensure that ReLAPACK's routines match the functionality of LAPACK for all use
+cases.
+
+The matrix size for all experiments (default: 100) can also be specified in
+`config.h`.
+
+
+Implementation
+--------------
+`test.h` provides the framework for our tests: It provides macros that allow
+the tests for each operation to be generalized in one file covering all
+data-types. Such a file is structured as follows:
+
+ * All matrices required by the test-cases are declared globally. For each
+ matrix, an array of two pointers is declared; one for the matrix copy passed
+ to ReLAPACK and one passed to LAPACK.
+
+ * `tests()` contains the main control flow: it allocates (and later frees) the
+ copies of the globally declared matrices. It then defines the macro
+ `ROUTINE` to contain the name of the currently tested routine.
+ It then uses the macro `TEST` to perform the test-cases.
+ It receives the arguments of the routine, where matrices of which ReLAPACK
+ and LAPACK receive a copy are indexed with `i`. (Example: `TEST("L", &n, A[i],
+ &n, info);`)
+
+ * The macro `TEST` first calls `pre()`, which initializes all relevant
+ matrices, then executes the ReLAPACK algorithm on the matrices with `i` = `0`
+ and then the LAPACK counterpart with `i` = `1`. It then calls `post()`,
+ which computes the difference between the results, storing it in `error`.
+ Finally, the error is printed out and compared to the error bound.
+
+If all test-cases pass the error bound test, the program will have a `0` return
+value, otherwise it is `1`, indicating an error.
diff --git a/relapack/test/config.h b/relapack/test/config.h
new file mode 100644
index 000000000..ab06a2fff
--- /dev/null
+++ b/relapack/test/config.h
@@ -0,0 +1,13 @@
+#ifndef TEST_CONFIG_H
+#define TEST_CONFIG_H
+
+// error bound for single and single complex routines
+#define SINGLE_ERR_BOUND 1e-4
+
+// error bound for double and double complex routines
+#define DOUBLE_ERR_BOUND 1e-13
+
+// size of test matrices (n in test.h; the secondary size n2 is derived from it)
+#define TEST_SIZE 100
+
+#endif /* TEST_CONFIG_H */
diff --git a/relapack/test/lapack.h b/relapack/test/lapack.h
new file mode 100644
index 000000000..80f5c419e
--- /dev/null
+++ b/relapack/test/lapack.h
@@ -0,0 +1,64 @@
+#ifndef LAPACK_H2
+#define LAPACK_H2
+
+#include "../config.h"
+
+// Prototypes of the (Fortran) LAPACK reference routines the test suite
+// compares ReLAPACK against, grouped by operation.
+
+// lauum: triangular matrix product U*U^H resp. L^H*L
+void LAPACK(slauum)(const char *, const int *, float *, const int *, int *);
+void LAPACK(dlauum)(const char *, const int *, double *, const int *, int *);
+void LAPACK(clauum)(const char *, const int *, float *, const int *, int *);
+void LAPACK(zlauum)(const char *, const int *, double *, const int *, int *);
+
+// trtri: triangular matrix inversion
+void LAPACK(strtri)(const char *, const char *, const int *, float *, const int *, int *);
+void LAPACK(dtrtri)(const char *, const char *, const int *, double *, const int *, int *);
+void LAPACK(ctrtri)(const char *, const char *, const int *, float *, const int *, int *);
+void LAPACK(ztrtri)(const char *, const char *, const int *, double *, const int *, int *);
+
+// potrf: Cholesky factorization
+void LAPACK(spotrf)(const char *, const int *, float *, const int *, int *);
+void LAPACK(dpotrf)(const char *, const int *, double *, const int *, int *);
+void LAPACK(cpotrf)(const char *, const int *, float *, const int *, int *);
+void LAPACK(zpotrf)(const char *, const int *, double *, const int *, int *);
+
+// pbtrf: banded Cholesky factorization
+void LAPACK(spbtrf)(const char *, const int *, const int *, float *, const int *, int *);
+void LAPACK(dpbtrf)(const char *, const int *, const int *, double *, const int *, int *);
+void LAPACK(cpbtrf)(const char *, const int *, const int *, float *, const int *, int *);
+void LAPACK(zpbtrf)(const char *, const int *, const int *, double *, const int *, int *);
+
+// sytrf / hetrf (and _rook pivoting variants): symmetric / Hermitian
+// indefinite factorization
+void LAPACK(ssytrf)(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
+void LAPACK(dsytrf)(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
+void LAPACK(csytrf)(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
+void LAPACK(chetrf)(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
+void LAPACK(zsytrf)(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
+void LAPACK(zhetrf)(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
+void LAPACK(ssytrf_rook)(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
+void LAPACK(dsytrf_rook)(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
+void LAPACK(csytrf_rook)(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
+void LAPACK(chetrf_rook)(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
+void LAPACK(zsytrf_rook)(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
+void LAPACK(zhetrf_rook)(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
+
+// getrf: LU factorization with partial pivoting
+void LAPACK(sgetrf)(const int *, const int *, float *, const int *, int *, int *);
+void LAPACK(dgetrf)(const int *, const int *, double *, const int *, int *, int *);
+void LAPACK(cgetrf)(const int *, const int *, float *, const int *, int *, int *);
+void LAPACK(zgetrf)(const int *, const int *, double *, const int *, int *, int *);
+
+// gbtrf: banded LU factorization
+void LAPACK(sgbtrf)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *);
+void LAPACK(dgbtrf)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *);
+void LAPACK(cgbtrf)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *);
+void LAPACK(zgbtrf)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *);
+
+// sygst / hegst: reduction of a generalized eigenproblem to standard form
+void LAPACK(ssygst)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *);
+void LAPACK(dsygst)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *);
+void LAPACK(chegst)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *);
+void LAPACK(zhegst)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *);
+
+// trsyl: (quasi-)triangular Sylvester equation solver
+void LAPACK(strsyl)(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *);
+void LAPACK(dtrsyl)(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *);
+void LAPACK(ctrsyl)(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *);
+void LAPACK(ztrsyl)(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *);
+
+// tgsyl: generalized Sylvester equation solver
+void LAPACK(stgsyl)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *);
+void LAPACK(dtgsyl)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *);
+void LAPACK(ctgsyl)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *);
+void LAPACK(ztgsyl)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *);
+
+#endif /* LAPACK_H2 */
diff --git a/relapack/test/test.h b/relapack/test/test.h
new file mode 100644
index 000000000..24089f3a8
--- /dev/null
+++ b/relapack/test/test.h
@@ -0,0 +1,136 @@
+#ifndef TEST_H
+#define TEST_H
+
+#include "../config.h"
+#include "config.h"
+
+// BLAS/LAPACK symbol name mangling: append an underscore iff the libraries
+// under test use that convention (configured in ../config.h)
+#if BLAS_UNDERSCORE
+#define BLAS(routine) routine ## _
+#else
+#define BLAS(routine) routine
+#endif
+
+#if LAPACK_UNDERSCORE
+#define LAPACK(routine) routine ## _
+#else
+#define LAPACK(routine) routine
+#endif
+
+#include "../inc/relapack.h"
+#include "lapack.h"
+#include "util.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+// some name mangling macros
+#define CAT(A, B) A ## B
+#define XCAT(A, B) CAT(A, B)  // extra level so macro arguments get expanded
+#define XLAPACK(X) LAPACK(X)
+#define XRELAPACK(X) XCAT(RELAPACK_, X)
+#define STR(X) #X
+#define XSTR(X) STR(X)  // extra level so macro arguments get expanded
+
+// default setup and error computation names: pre() and post()
+#define PRE pre
+#define POST post
+
+// TEST macro:
+// run setup (pre()), ReLAPACK routine (i = 0), LAPACK routine (i = 1), compute
+// error (post()), check error bound, and print setup and error
+#define TEST(...) \
+    PRE(); \
+    i = 0; \
+    XRELAPACK(ROUTINE)(__VA_ARGS__); \
+    i = 1; \
+    XLAPACK(ROUTINE)(__VA_ARGS__); \
+    POST(); \
+    fail |= error > ERR_BOUND; \
+    printf("%s(%s)\t%g\n", XSTR(ROUTINE), #__VA_ARGS__, error);
+
+// generalized datatype treatment: DT_PREFIX determines the type s, d, c, or z
+#define XPREF(A) XCAT(DT_PREFIX, A)
+
+// matrix generation and error computation routines
+#define x2matgen XPREF(2matgen)
+#define x2vecerr XPREF(2vecerr)
+
+// error bounds (per precision, from test/config.h)
+#define ERR_BOUND XPREF(ERR_BOUND_)
+#define sERR_BOUND_ SINGLE_ERR_BOUND
+#define dERR_BOUND_ DOUBLE_ERR_BOUND
+#define cERR_BOUND_ SINGLE_ERR_BOUND
+#define zERR_BOUND_ DOUBLE_ERR_BOUND
+
+// C datatypes
+#define datatype XPREF(datatype_)
+#define sdatatype_ float
+#define ddatatype_ double
+#define cdatatype_ float
+#define zdatatype_ double
+
+// number of C datatype values per matrix element (2 for the complex types)
+#define x1 XPREF(DT_MULT)
+#define sDT_MULT 1
+#define dDT_MULT 1
+#define cDT_MULT 2
+#define zDT_MULT 2
+
+// typed allocations
+#define xmalloc XPREF(malloc)
+#define imalloc(S) malloc((S) * sizeof(int))
+#define smalloc(S) malloc((S) * sizeof(float))
+#define dmalloc(S) malloc((S) * sizeof(double))
+#define cmalloc(S) malloc((S) * 2 * sizeof(float))
+#define zmalloc(S) malloc((S) * 2 * sizeof(double))
+
+// transpositions (conjugate transpose for the complex types)
+#define xCTRANS XPREF(CTRANS)
+#define sCTRANS "T"
+#define dCTRANS "T"
+#define cCTRANS "C"
+#define zCTRANS "C"
+
+// some constants
+// NOTE(review): these are object definitions in a header; assumes test.h is
+// included by exactly one translation unit per test binary -- confirm
+#define MONE XPREF(MONE)
+const float sMONE[] = { -1. };
+const double dMONE[] = { -1. };
+const float cMONE[] = { -1., 0. };
+const double zMONE[] = { -1., 0. };
+
+#define ZERO XPREF(ZERO)
+const float sZERO[] = { 0. };
+const double dZERO[] = { 0. };
+const float cZERO[] = { 0., 0. };
+const double zZERO[] = { 0., 0. };
+
+#define ONE XPREF(ONE)
+const float sONE[] = { 1. };
+const double dONE[] = { 1. };
+const float cONE[] = { 1., 0. };
+const double zONE[] = { 1., 0. };
+
+const int iMONE[] = { -1 };
+const int iZERO[] = { 0 };
+const int iONE[] = { 1 };
+const int iTWO[] = { 2 };
+const int iTHREE[] = { 3 };
+const int iFOUR[] = { 4 };
+
+// implemented by each per-routine test file with its test-cases
+void tests();
+
+// global variables (used in tests(), pre(), and post())
+int i, n, n2, fail;
+double error;
+
+// Entry point: run all test-cases; exit status is 0 iff every error stayed
+// within its bound (fail is accumulated by the TEST macro).
+int main(int argc, char* argv[]) {
+    n = TEST_SIZE;
+    n2 = (3 * n) / 4;  // secondary (non-square) test dimension
+    fail = 0;
+
+    tests();
+
+    return fail;
+}
+
+#endif /* TEST_H */
diff --git a/relapack/test/util.c b/relapack/test/util.c
new file mode 100644
index 000000000..e0fca3eec
--- /dev/null
+++ b/relapack/test/util.c
@@ -0,0 +1,116 @@
+#include "util.h"
+#include <stdlib.h>
+#include <time.h>
+#include <math.h>
+
// Classic two-argument max/min; each argument may be evaluated twice,
// so only side-effect-free expressions should be passed.
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+///////////////////////
+// matrix generation //
+///////////////////////
+// Each routine x2matgen is passed the size (m, n) of the desired matrix and
+// generates two copies of such a matrix in its output arguments A and B.
+// The generated matrices are filled with random entries in [0, 1[ (+i*[0, 1[ in
+// the complex case). Then m is added to the diagonal; this is numerically
+// favorable for routines working with triangular and symmetric matrices. For
+// the same reason the imaginary part of the diagonal is set to 0.
+
// Generate an m x n random single-precision matrix (entries in [0, 1])
// with m added to the diagonal, and store the identical result in both
// A and B (column-major, leading dimension m).
void s2matgen(const int m, const int n, float *A, float *B) {
    // seed with the time plus the buffer address so that distinct target
    // matrices within one run get distinct random streams
    srand(time(NULL) + (size_t) A);
    int row, col;
    for (row = 0; row < m; row++)
        for (col = 0; col < n; col++) {
            float value = (float) rand() / RAND_MAX;
            if (row == col)
                value += m;  // diagonal shift, favors triangular/symmetric solvers
            const int idx = row + m * col;
            B[idx] = value;
            A[idx] = value;
        }
}
+
// Generate an m x n random double-precision matrix (entries in [0, 1])
// with m added to the diagonal, and store the identical result in both
// A and B (column-major, leading dimension m).
void d2matgen(const int m, const int n, double *A, double *B) {
    // seed with the time plus the buffer address so that distinct target
    // matrices within one run get distinct random streams
    srand(time(NULL) + (size_t) A);
    int row, col;
    for (row = 0; row < m; row++)
        for (col = 0; col < n; col++) {
            double value = (double) rand() / RAND_MAX;
            if (row == col)
                value += m;  // diagonal shift, favors triangular/symmetric solvers
            const int idx = row + m * col;
            B[idx] = value;
            A[idx] = value;
        }
}
+
// Generate an m x n random single-precision complex matrix (interleaved
// real/imaginary float pairs) with m added to the diagonal and a zero
// imaginary part on the diagonal; identical copies go into A and B.
void c2matgen(const int m, const int n, float *A, float *B) {
    // seed with the time plus the buffer address so that distinct target
    // matrices within one run get distinct random streams
    srand(time(NULL) + (size_t) A);
    int row, col;
    for (row = 0; row < m; row++)
        for (col = 0; col < n; col++) {
            const int re = 2 * (row + m * col);  // real-part index
            // draw real then imaginary part (two rand() calls per entry)
            float reval = (float) rand() / RAND_MAX;
            float imval = (float) rand() / RAND_MAX;
            if (row == col) {
                reval += m;    // diagonal shift
                imval = 0.0f;  // keep the diagonal real
            }
            A[re] = B[re] = reval;
            A[re + 1] = B[re + 1] = imval;
        }
}
+
// Generate an m x n random double-precision complex matrix (interleaved
// real/imaginary double pairs) with m added to the diagonal and a zero
// imaginary part on the diagonal; identical copies go into A and B.
void z2matgen(const int m, const int n, double *A, double *B) {
    // seed with the time plus the buffer address so that distinct target
    // matrices within one run get distinct random streams
    srand(time(NULL) + (size_t) A);
    int row, col;
    for (row = 0; row < m; row++)
        for (col = 0; col < n; col++) {
            const int re = 2 * (row + m * col);  // real-part index
            // draw real then imaginary part (two rand() calls per entry)
            double reval = (double) rand() / RAND_MAX;
            double imval = (double) rand() / RAND_MAX;
            if (row == col) {
                reval += m;   // diagonal shift
                imval = 0.0;  // keep the diagonal real
            }
            A[re] = B[re] = reval;
            A[re + 1] = B[re + 1] = imval;
        }
}
+
+////////////////////////
+// error computations //
+////////////////////////
+// Each routine x2vecerr is passed a vector length n and two vectors x and y.
+// It returns the maximum of the element-wise error between these two vectors.
+// This error is the minimum of the absolute difference and the relative
+// difference with respect to y.
+
// Maximum element-wise error between two int vectors: per element the
// minimum of the absolute difference |x[i]-y[i]| and the relative
// difference |x[i]-y[i]| / |y[i]| (the latter only when |y[i]| > 0).
double i2vecerr(const int n, const int *x, const int *y) {
    double error = 0;
    int i;
    for (i = 0; i < n; i++) {
        // Compute the difference in double: the previous int-typed
        // abs(x[i] - y[i]) overflows (undefined behavior) for widely
        // separated values, e.g. x[i] = INT_MAX, y[i] < 0.
        double nom = fabs((double) x[i] - y[i]);
        double den = fabs((double) y[i]);
        double err = (den > 0) ? ((nom < nom / den) ? nom : nom / den) : nom;
        if (err > error)
            error = err;
    }
    return error;
}
+
// Maximum element-wise error between two float vectors: per element the
// minimum of the absolute difference |x[i]-y[i]| and the relative
// difference |x[i]-y[i]| / |y[i]| (the latter only when |y[i]| > 0).
double s2vecerr(const int n, const float *x, const float *y) {
    // Accumulate in double: the accumulator was previously float, which
    // truncated the double-precision error before it was returned
    // (inconsistent with d2vecerr/c2vecerr/z2vecerr).
    double error = 0;
    int i;
    for (i = 0; i < n; i++) {
        double nom = fabs((double) x[i] - y[i]);
        double den = fabs(y[i]);
        double err = (den > 0) ? ((nom < nom / den) ? nom : nom / den) : nom;
        if (err > error)
            error = err;
    }
    return error;
}
+
// Maximum element-wise error between two double vectors: per element the
// minimum of the absolute difference |x[i]-y[i]| and the relative
// difference |x[i]-y[i]| / |y[i]| (the latter only when |y[i]| > 0).
double d2vecerr(const int n, const double *x, const double *y) {
    double worst = 0;
    int k;
    for (k = 0; k < n; k++) {
        const double nom = fabs(x[k] - y[k]);
        const double den = fabs(y[k]);
        const double err = (den > 0) ? ((nom < nom / den) ? nom : nom / den) : nom;
        worst = (worst > err) ? worst : err;
    }
    return worst;
}
+
+double c2vecerr(const int n, const float *x, const float *y) {
+ double error = 0;
+ int i;
+ for (i = 0; i < n; i++) {
+ double nom = sqrt(((double) x[2 * i] - y[2 * i]) * ((double) x[2 * i] - y[2 * i]) + ((double) x[2 * i + 1] - y[2 * i + 1]) * ((double) x[2 * i + 1] - y[2 * i + 1]));
+ double den = sqrt((double) y[2 * i] * y[2 * i] + (double) y[2 * i + 1] * y[2 * i + 1]);
+ error = MAX(error, (den > 0) ? MIN(nom, nom / den) : nom);
+ }
+ return error;
+}
+
+double z2vecerr(const int n, const double *x, const double *y) {
+ double error = 0;
+ int i;
+ for (i = 0; i < n; i++) {
+ double nom = sqrt((x[2 * i] - y[2 * i]) * (x[2 * i] - y[2 * i]) + (x[2 * i + 1] - y[2 * i + 1]) * (x[2 * i + 1] - y[2 * i + 1]));
+ double den = sqrt(y[2 * i] * y[2 * i] + y[2 * i + 1] * y[2 * i + 1]);
+ error = MAX(error, (den > 0) ? MIN(nom, nom / den) : nom);
+ }
+ return error;
+}
diff --git a/relapack/test/util.h b/relapack/test/util.h
new file mode 100644
index 000000000..11d2999e0
--- /dev/null
+++ b/relapack/test/util.h
@@ -0,0 +1,15 @@
+#ifndef TEST_UTIL_H
+#define TEST_UTIL_H
+
+void s2matgen(int, int, float *, float *);
+void d2matgen(int, int, double *, double *);
+void c2matgen(int, int, float *, float *);
+void z2matgen(int, int, double *, double *);
+
+double i2vecerr(int, const int *, const int *);
+double s2vecerr(int, const float *, const float *);
+double d2vecerr(int, const double *, const double *);
+double c2vecerr(int, const float *, const float *);
+double z2vecerr(int, const double *, const double *);
+
+#endif /* TEST_UTIL_H */
diff --git a/relapack/test/xgbtrf.c b/relapack/test/xgbtrf.c
new file mode 100644
index 000000000..f255006a5
--- /dev/null
+++ b/relapack/test/xgbtrf.c
@@ -0,0 +1,43 @@
#include "test.h"

// Driver for xGBTRF (LU factorization of a general band matrix):
// runs ReLAPACK (i = 0) and LAPACK (i = 1) on identical inputs and
// compares factors and pivot vectors.
datatype *A[2];      // band-storage matrices for the two implementations
int *ipiv[2], info;  // pivot vectors and LAPACK info code
int kl, ku, ld;      // sub-/super-diagonal counts, band leading dimension

void pre() {
    int i;
    // identical random band storage for both copies
    x2matgen(ld, n, A[0], A[1]);
    for (i = 0; i < n; i++) {
        // set diagonal
        A[0][x1 * (i + ld * i)] =
        A[1][x1 * (i + ld * i)] = (datatype) rand() / RAND_MAX;
    }
    // clear pivots so stale data cannot mask a mismatch
    memset(ipiv[0], 0, n * sizeof(int));
    memset(ipiv[1], 0, n * sizeof(int));
}

void post() {
    // error = factor difference + pivot difference
    error = x2vecerr(ld * n, A[0], A[1]) + i2vecerr(n, ipiv[0], ipiv[1]);
}

void tests() {
    kl = n - 10;           // NOTE(review): assumes n > 10 — confirm TEST_SIZE is large enough
    ku = n;
    ld = 2 * kl + ku + 1;  // band storage with room for fill-in (xGBTRF convention)

    A[0] = xmalloc(ld * n);
    A[1] = xmalloc(ld * n);
    ipiv[0] = imalloc(n);
    ipiv[1] = imalloc(n);

    #define ROUTINE XPREF(gbtrf)

    // square, wide, and tall cases
    TEST(&n, &n, &kl, &ku, A[i], &ld, ipiv[i], &info);
    TEST(&n, &n2, &kl, &ku, A[i], &ld, ipiv[i], &info);
    TEST(&n2, &n, &kl, &ku, A[i], &ld, ipiv[i], &info);

    free(A[0]);
    free(A[1]);
    free(ipiv[0]);
    free(ipiv[1]);
}
diff --git a/relapack/test/xgemmt.c b/relapack/test/xgemmt.c
new file mode 100644
index 000000000..ffc37049d
--- /dev/null
+++ b/relapack/test/xgemmt.c
@@ -0,0 +1,65 @@
#include "test.h"

// Driver for xGEMMT (GEMM variant updating only one triangle of C).
datatype *A[2], *B[2], *C[2], *Ctmp;  // Ctmp: scratch for the reference version
int info;

void pre() {
    // identical random n x n operands for both runs
    x2matgen(n, n, A[0], A[1]);
    x2matgen(n, n, B[0], B[1]);
    x2matgen(n, n, C[0], C[1]);
}

void post() {
    // only C is written by gemmt
    error = x2vecerr(n * n, C[0], C[1]);
}

#define ROUTINE XPREF(gemmt)

// LAPACK/BLAS helpers used by the reference implementation below
#define xlacpy XPREF(LAPACK(lacpy))
#define xgemm XPREF(BLAS(gemm))

extern void xlacpy(const char *, const int *, const int *, const datatype *, const int *, datatype *, const int *);
extern void xgemm(const char *, const char *, const int *, const int *, const int *, const datatype *, const datatype *, const int *, const datatype *, const int *, const datatype *, const datatype *, const int*);
+
+void XLAPACK(ROUTINE)(
+ const char *uplo, const char *transA, const char *transB,
+ const int *n, const int *k,
+ const datatype *alpha, const datatype *A, const int *ldA,
+ const datatype *B, const int *ldB,
+ const datatype *beta, datatype *C, const int *ldC
+) {
+ xlacpy(uplo, n, n, C, ldC, Ctmp, n);
+ xgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, Ctmp, n);
+ xlacpy(uplo, n, n, Ctmp, ldC, C, n);
+}
+
void tests() {
    A[0] = xmalloc(n * n);
    A[1] = xmalloc(n * n);
    B[0] = xmalloc(n * n);
    B[1] = xmalloc(n * n);
    C[0] = xmalloc(n * n);
    C[1] = xmalloc(n * n);
    Ctmp = xmalloc(n * n);

    // lower and upper triangle; alpha/beta sign combinations; transposed
    // operands; and a reduced inner dimension k = n2
    TEST("L", "N", "N", &n, &n, ONE, A[i], &n, B[i], &n, ONE, C[i], &n);
    TEST("L", "N", "N", &n, &n, ONE, A[i], &n, B[i], &n, MONE, C[i], &n);
    TEST("L", "N", "N", &n, &n, MONE, A[i], &n, B[i], &n, ONE, C[i], &n);
    TEST("L", "N", "T", &n, &n, ONE, A[i], &n, B[i], &n, ONE, C[i], &n);
    TEST("L", "T", "N", &n, &n, ONE, A[i], &n, B[i], &n, ONE, C[i], &n);
    TEST("L", "N", "N", &n, &n2, ONE, A[i], &n, B[i], &n, ONE, C[i], &n);
    TEST("U", "N", "N", &n, &n, ONE, A[i], &n, B[i], &n, ONE, C[i], &n);
    TEST("U", "N", "N", &n, &n, ONE, A[i], &n, B[i], &n, MONE, C[i], &n);
    TEST("U", "N", "N", &n, &n, MONE, A[i], &n, B[i], &n, ONE, C[i], &n);
    TEST("U", "N", "T", &n, &n, ONE, A[i], &n, B[i], &n, ONE, C[i], &n);
    TEST("U", "T", "N", &n, &n, ONE, A[i], &n, B[i], &n, ONE, C[i], &n);
    TEST("U", "N", "N", &n, &n2, ONE, A[i], &n, B[i], &n, ONE, C[i], &n);

    free(A[0]);
    free(A[1]);
    free(B[0]);
    free(B[1]);
    free(C[0]);
    free(C[1]);
    free(Ctmp);
}
diff --git a/relapack/test/xgetrf.c b/relapack/test/xgetrf.c
new file mode 100644
index 000000000..4484a24af
--- /dev/null
+++ b/relapack/test/xgetrf.c
@@ -0,0 +1,32 @@
#include "test.h"

// Driver for xGETRF (LU factorization with partial pivoting).
datatype *A[2];
int *ipiv[2], info;

void pre() {
    x2matgen(n, n, A[0], A[1]);
    // clear pivots so stale data cannot mask a mismatch
    memset(ipiv[0], 0, n * sizeof(int));
    memset(ipiv[1], 0, n * sizeof(int));
}

void post() {
    // compare LU factors and pivot vectors
    error = x2vecerr(n * n, A[0], A[1]) + i2vecerr(n, ipiv[0], ipiv[1]);
}

void tests() {
    A[0] = xmalloc(n * n);
    A[1] = xmalloc(n * n);
    ipiv[0] = imalloc(n);
    ipiv[1] = imalloc(n);

    #define ROUTINE XPREF(getrf)

    // square, wide, and tall cases
    TEST(&n, &n, A[i], &n, ipiv[i], &info);
    TEST(&n, &n2, A[i], &n, ipiv[i], &info);
    TEST(&n2, &n, A[i], &n, ipiv[i], &info);

    free(A[0]);
    free(A[1]);
    free(ipiv[0]);
    free(ipiv[1]);
}
diff --git a/relapack/test/xhegst.c b/relapack/test/xhegst.c
new file mode 100644
index 000000000..c318ef546
--- /dev/null
+++ b/relapack/test/xhegst.c
@@ -0,0 +1,32 @@
#include "test.h"

// Driver for xHEGST (reduction of a Hermitian-definite generalized
// eigenproblem to standard form).
datatype *A[2], *B[2];
int info;

void pre() {
    x2matgen(n, n, A[0], A[1]);
    x2matgen(n, n, B[0], B[1]);
}

void post() {
    // only A is overwritten by hegst
    error = x2vecerr(n * n, A[0], A[1]);
}

void tests() {
    A[0] = xmalloc(n * n);
    A[1] = xmalloc(n * n);
    B[0] = xmalloc(n * n);
    B[1] = xmalloc(n * n);

    #define ROUTINE XPREF(hegst)

    // itype 1 and 2, lower and upper triangle
    TEST(iONE, "L", &n, A[i], &n, B[i], &n, &info);
    TEST(iONE, "U", &n, A[i], &n, B[i], &n, &info);
    TEST(iTWO, "L", &n, A[i], &n, B[i], &n, &info);
    TEST(iTWO, "U", &n, A[i], &n, B[i], &n, &info);

    free(A[0]);
    free(A[1]);
    free(B[0]);
    free(B[1]);
}
diff --git a/relapack/test/xhetrf.c b/relapack/test/xhetrf.c
new file mode 100644
index 000000000..b5d54bdff
--- /dev/null
+++ b/relapack/test/xhetrf.c
@@ -0,0 +1,40 @@
#include "test.h"

// Driver for xHETRF / xHETRF_ROOK (Bunch-Kaufman factorization of a
// Hermitian indefinite matrix, without and with rook pivoting).
datatype *A[2], *Work;
int *ipiv[2], info;

void pre() {
    x2matgen(n, n, A[0], A[1]);
    // clear pivots so stale data cannot mask a mismatch
    memset(ipiv[0], 0, n * sizeof(int));
    memset(ipiv[1], 0, n * sizeof(int));
}

void post() {
    error = x2vecerr(n * n, A[0], A[1]) + i2vecerr(n, ipiv[0], ipiv[1]);
}

void tests() {
    const int lWork = n * n;  // shared workspace for both variants
    A[0] = xmalloc(n * n);
    A[1] = xmalloc(n * n);
    ipiv[0] = imalloc(n);
    ipiv[1] = imalloc(n);
    Work = xmalloc(lWork);

    #define ROUTINE XPREF(hetrf)

    TEST("L", &n, A[i], &n, ipiv[i], Work, &lWork, &info);
    TEST("U", &n, A[i], &n, ipiv[i], Work, &lWork, &info);

    #undef ROUTINE
    #define ROUTINE XPREF(hetrf_rook)

    TEST("L", &n, A[i], &n, ipiv[i], Work, &lWork, &info);
    TEST("U", &n, A[i], &n, ipiv[i], Work, &lWork, &info);

    free(A[0]);
    free(A[1]);
    free(ipiv[0]);
    free(ipiv[1]);
    free(Work);
}
diff --git a/relapack/test/xlauum.c b/relapack/test/xlauum.c
new file mode 100644
index 000000000..d2c42fa01
--- /dev/null
+++ b/relapack/test/xlauum.c
@@ -0,0 +1,25 @@
#include "test.h"

// Driver for xLAUUM (product U*U^H or L^H*L of a triangular factor).
datatype *A[2];
int info;

void pre() {
    x2matgen(n, n, A[0], A[1]);
}

void post() {
    error = x2vecerr(n * n, A[0], A[1]);
}

void tests() {
    A[0] = xmalloc(n * n);
    A[1] = xmalloc(n * n);

    #define ROUTINE XPREF(lauum)

    // lower and upper variants
    TEST("L", &n, A[i], &n, &info);
    TEST("U", &n, A[i], &n, &info);

    free(A[0]);
    free(A[1]);
}
diff --git a/relapack/test/xpbtrf.c b/relapack/test/xpbtrf.c
new file mode 100644
index 000000000..9a9babb6b
--- /dev/null
+++ b/relapack/test/xpbtrf.c
@@ -0,0 +1,40 @@
#include "test.h"

// Driver for xPBTRF (Cholesky factorization of a positive-definite band
// matrix).
datatype *A[2];
int info[2];
// NOTE(review): test.h also defines a global n; this duplicate tentative
// definition relies on common-symbol merging (-fcommon) — confirm it
// links under compilers that default to -fno-common.
int n;

void pre() {
    int i;  // shadows the global i from test.h
    x2matgen(n, n, A[0], A[1]);
    for (i = 0; i < n; i++) {
        // set diagonal
        A[0][x1 * (i + n * i)] =
        A[1][x1 * (i + n * i)] = (datatype) rand() / RAND_MAX;
        // set first row
        // (adding n presumably keeps the band matrix positive definite in
        // band storage — TODO confirm against the xPBTRF storage scheme)
        A[0][x1 * (n * i)] =
        A[1][x1 * (n * i)] = (datatype) rand() / RAND_MAX + n;
    }
}

void post() {
    error = x2vecerr(n * n, A[0], A[1]);
}

void tests() {
    A[0] = xmalloc(n * n);
    A[1] = xmalloc(n * n);

    #define ROUTINE XPREF(pbtrf)

    // small and large bandwidths
    const int
        kd1 = n / 4,
        kd2 = n * 3 / 4;
    TEST("L", &n, &kd1, A[i], &n, &info[i]);
    TEST("L", &n, &kd2, A[i], &n, &info[i]);
    // NOTE(review): the "U" calls offset the base pointer by -x1*kd so the
    // stored band lines up; computing a pointer before the buffer start is
    // formally undefined even if never dereferenced — confirm intent.
    TEST("U", &n, &kd1, A[i] - x1 * kd1, &n, &info[i]);
    TEST("U", &n, &kd2, A[i] - x1 * kd2, &n, &info[i]);

    free(A[0]);
    free(A[1]);
}
diff --git a/relapack/test/xpotrf.c b/relapack/test/xpotrf.c
new file mode 100644
index 000000000..5e04d426f
--- /dev/null
+++ b/relapack/test/xpotrf.c
@@ -0,0 +1,25 @@
#include "test.h"

// Driver for xPOTRF (Cholesky factorization).
datatype *A[2];
int info;

void pre() {
    // x2matgen's diagonal shift (+n) makes the implicitly symmetric matrix
    // diagonally dominant, hence positive definite
    x2matgen(n, n, A[0], A[1]);
}

void post() {
    error = x2vecerr(n * n, A[0], A[1]);
}

void tests() {
    A[0] = xmalloc(n * n);
    A[1] = xmalloc(n * n);

    #define ROUTINE XPREF(potrf)

    // lower and upper factorizations
    TEST("L", &n, A[i], &n, &info);
    TEST("U", &n, A[i], &n, &info);

    free(A[0]);
    free(A[1]);
}
diff --git a/relapack/test/xsygst.c b/relapack/test/xsygst.c
new file mode 100644
index 000000000..b473a5919
--- /dev/null
+++ b/relapack/test/xsygst.c
@@ -0,0 +1,32 @@
#include "test.h"

// Driver for xSYGST (reduction of a symmetric-definite generalized
// eigenproblem to standard form) — real analogue of xhegst.c.
datatype *A[2], *B[2];
int info;

void pre() {
    x2matgen(n, n, A[0], A[1]);
    x2matgen(n, n, B[0], B[1]);
}

void post() {
    // only A is overwritten by sygst
    error = x2vecerr(n * n, A[0], A[1]);
}

void tests() {
    A[0] = xmalloc(n * n);
    A[1] = xmalloc(n * n);
    B[0] = xmalloc(n * n);
    B[1] = xmalloc(n * n);

    #define ROUTINE XPREF(sygst)

    // itype 1 and 2, lower and upper triangle
    TEST(iONE, "L", &n, A[i], &n, B[i], &n, &info);
    TEST(iONE, "U", &n, A[i], &n, B[i], &n, &info);
    TEST(iTWO, "L", &n, A[i], &n, B[i], &n, &info);
    TEST(iTWO, "U", &n, A[i], &n, B[i], &n, &info);

    free(A[0]);
    free(A[1]);
    free(B[0]);
    free(B[1]);
}
diff --git a/relapack/test/xsytrf.c b/relapack/test/xsytrf.c
new file mode 100644
index 000000000..82d626f6f
--- /dev/null
+++ b/relapack/test/xsytrf.c
@@ -0,0 +1,40 @@
#include "test.h"

// Driver for xSYTRF / xSYTRF_ROOK (Bunch-Kaufman factorization of a
// symmetric indefinite matrix) — real analogue of xhetrf.c.
datatype *A[2], *Work;
int *ipiv[2], info;

void pre() {
    x2matgen(n, n, A[0], A[1]);
    // clear pivots so stale data cannot mask a mismatch
    memset(ipiv[0], 0, n * sizeof(int));
    memset(ipiv[1], 0, n * sizeof(int));
}

void post() {
    error = x2vecerr(n * n, A[0], A[1]) + i2vecerr(n, ipiv[0], ipiv[1]);
}

void tests() {
    const int lWork = n * n;  // shared workspace for both variants
    A[0] = xmalloc(n * n);
    A[1] = xmalloc(n * n);
    ipiv[0] = imalloc(n);
    ipiv[1] = imalloc(n);
    Work = xmalloc(lWork);

    #define ROUTINE XPREF(sytrf)

    TEST("L", &n, A[i], &n, ipiv[i], Work, &lWork, &info);
    TEST("U", &n, A[i], &n, ipiv[i], Work, &lWork, &info);

    #undef ROUTINE
    #define ROUTINE XPREF(sytrf_rook)

    TEST("L", &n, A[i], &n, ipiv[i], Work, &lWork, &info);
    TEST("U", &n, A[i], &n, ipiv[i], Work, &lWork, &info);

    free(A[0]);
    free(A[1]);
    free(ipiv[0]);
    free(ipiv[1]);
    free(Work);
}
diff --git a/relapack/test/xtgsyl.c b/relapack/test/xtgsyl.c
new file mode 100644
index 000000000..74db5005e
--- /dev/null
+++ b/relapack/test/xtgsyl.c
@@ -0,0 +1,94 @@
#include "test.h"

// Driver for xTGSYL (generalized Sylvester equation solver).
datatype *A[2], *B[2], *C[2], *D[2], *E[2], *F[2], *Work, scale[2], dif[2];
int *iWork, lWork, info;

// LAPACK scaling helper used in post() to reconcile differing scale factors
#define xlascl XPREF(LAPACK(lascl))
void xlascl(const char *, const int *, const int *, const datatype *, const
    datatype *, const int *, const int *, datatype *, const int *, int *);

// NOTE(review): xscal is declared but never used in this file, and scal is
// a BLAS (not LAPACK) routine, so the LAPACK() name wrapper looks
// suspicious — confirm before relying on it.
#define xscal XPREF(LAPACK(scal))
void xscal(const int *, const datatype *, datatype *, const int *);
+
+void pre() {
+ int i;
+
+ x2matgen(n, n, A[0], A[1]);
+ x2matgen(n, n, B[0], B[1]);
+ x2matgen(n, n, C[0], C[1]);
+ x2matgen(n, n, D[0], D[1]);
+ x2matgen(n, n, E[0], E[1]);
+ x2matgen(n, n, F[0], F[1]);
+
+ for (i = 0; i < n; i++) {
+ // set diagonal
+ A[0][x1 * (i + n * i)] =
+ A[1][x1 * (i + n * i)] = (datatype) rand() / RAND_MAX;
+ E[0][x1 * (i + n * i)] =
+ E[1][x1 * (i + n * i)] = (datatype) rand() / RAND_MAX;
+ // clear first subdiagonal
+ A[0][x1 * (i + 1 + n * i)] =
+ A[1][x1 * (i + 1 + n * i)] =
+ B[0][x1 * (i + 1 + n * i)] =
+ B[1][x1 * (i + 1 + n * i)] =
+ A[0][x1 * (i + 1 + n * i) + x1 - 1] =
+ A[1][x1 * (i + 1 + n * i) + x1 - 1] =
+ B[0][x1 * (i + 1 + n * i) + x1 - 1] =
+ B[1][x1 * (i + 1 + n * i) + x1 - 1] = 0;
+ }
+}
+
+
+void post() {
+ if (scale[0] != 1 || scale[0] != 1)
+ printf("scale[RELAPACK] = %12g\tscale[LAPACK] = %12g\n", scale[0], scale[1]);
+ if (scale[0]) {
+ xlascl("G", iZERO, iZERO, &scale[0], &scale[1], &n, &n, C[0], &n, &info);
+ xlascl("G", iZERO, iZERO, &scale[0], &scale[1], &n, &n, F[0], &n, &info);
+ }
+ error = x2vecerr(n * n, C[0], C[1]) + x2vecerr(n * n, F[0], F[1]);
+}
+
void tests() {
    lWork = 2 * n * n;  // workspace size (used for ijob > 0 Dif estimates)
    A[0] = xmalloc(n * n);
    A[1] = xmalloc(n * n);
    B[0] = xmalloc(n * n);
    B[1] = xmalloc(n * n);
    C[0] = xmalloc(n * n);
    C[1] = xmalloc(n * n);
    D[0] = xmalloc(n * n);
    D[1] = xmalloc(n * n);
    E[0] = xmalloc(n * n);
    E[1] = xmalloc(n * n);
    F[0] = xmalloc(n * n);
    F[1] = xmalloc(n * n);
    Work = xmalloc(lWork);
    iWork = imalloc(n + n + 2);

    #define ROUTINE XPREF(tgsyl)

    // no transpose: square and rectangular sizes, then all ijob variants
    TEST("N", iZERO, &n, &n, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info);
    TEST("N", iZERO, &n2, &n, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info);
    TEST("N", iZERO, &n, &n2, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info);
    TEST("N", iONE, &n, &n, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info);
    TEST("N", iTWO, &n, &n, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info);
    TEST("N", iTHREE, &n, &n, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info);
    TEST("N", iFOUR, &n, &n, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info);
    // (conjugate) transposed case
    TEST(xCTRANS, iZERO, &n, &n, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info);

    free(A[0]);
    free(A[1]);
    free(B[0]);
    free(B[1]);
    free(C[0]);
    free(C[1]);
    free(D[0]);
    free(D[1]);
    free(E[0]);
    free(E[1]);
    free(F[0]);
    free(F[1]);
    free(Work);
    free(iWork);
}
diff --git a/relapack/test/xtrsyl.c b/relapack/test/xtrsyl.c
new file mode 100644
index 000000000..358a89242
--- /dev/null
+++ b/relapack/test/xtrsyl.c
@@ -0,0 +1,65 @@
#include "test.h"

// Driver for xTRSYL ((quasi-)triangular Sylvester equation solver).
// NOTE(review): Work is declared but never allocated or used here — confirm.
datatype *A[2], *B[2], *C[2], *Work, scale[2];
int info;

// LAPACK scaling helper used in post() to reconcile differing scale factors
#define xlascl XPREF(LAPACK(lascl))
void xlascl(const char *, const int *, const int *, const datatype *, const
    datatype *, const int *, const int *, datatype *, const int *, int *);
+
+void pre() {
+ int i;
+
+ x2matgen(n, n, A[0], A[1]);
+ x2matgen(n, n, B[0], B[1]);
+ x2matgen(n, n, C[0], C[1]);
+
+ for (i = 0; i < n; i++) {
+ // set diagonal
+ A[0][x1 * (i + n * i)] =
+ A[1][x1 * (i + n * i)] = (datatype) rand() / RAND_MAX;
+ // clear first subdiagonal
+ A[0][x1 * (i + 1 + n * i)] =
+ A[1][x1 * (i + 1 + n * i)] =
+ B[0][x1 * (i + 1 + n * i)] =
+ B[1][x1 * (i + 1 + n * i)] =
+ A[0][x1 * (i + 1 + n * i) + x1 - 1] =
+ A[1][x1 * (i + 1 + n * i) + x1 - 1] =
+ B[0][x1 * (i + 1 + n * i) + x1 - 1] =
+ B[1][x1 * (i + 1 + n * i) + x1 - 1] = 0;
+ }
+}
+
+void post() {
+ if (scale[0] != 1 || scale[0] != 1)
+ printf("scale[RELAPACK] = %12g\tscale[LAPACK] = %12g\n", scale[0], scale[1]);
+ if (scale[0])
+ xlascl("G", iZERO, iZERO, &scale[0], &scale[1], &n, &n, C[0], &n, &info);
+ error = x2vecerr(n * n, C[0], C[1]);
+}
+
void tests() {
    A[0] = xmalloc(n * n);
    A[1] = xmalloc(n * n);
    B[0] = xmalloc(n * n);
    B[1] = xmalloc(n * n);
    C[0] = xmalloc(n * n);
    C[1] = xmalloc(n * n);

    #define ROUTINE XPREF(trsyl)

    // square and rectangular sizes, transposed operands, and isgn = -1
    TEST("N", "N", iONE, &n, &n, A[i], &n, B[i], &n, C[i], &n, &scale[i], &info);
    TEST("N", "N", iONE, &n2, &n, A[i], &n, B[i], &n, C[i], &n, &scale[i], &info);
    TEST("N", "N", iONE, &n, &n2, A[i], &n, B[i], &n, C[i], &n, &scale[i], &info);
    TEST("C", "N", iONE, &n, &n, A[i], &n, B[i], &n, C[i], &n, &scale[i], &info);
    TEST("N", "C", iONE, &n, &n, A[i], &n, B[i], &n, C[i], &n, &scale[i], &info);
    TEST("C", "C", iONE, &n, &n, A[i], &n, B[i], &n, C[i], &n, &scale[i], &info);
    TEST("N", "N", iMONE, &n, &n, A[i], &n, B[i], &n, C[i], &n, &scale[i], &info);

    free(A[0]);
    free(A[1]);
    free(B[0]);
    free(B[1]);
    free(C[0]);
    free(C[1]);
}
diff --git a/relapack/test/xtrtri.c b/relapack/test/xtrtri.c
new file mode 100644
index 000000000..106391bc8
--- /dev/null
+++ b/relapack/test/xtrtri.c
@@ -0,0 +1,25 @@
#include "test.h"

// Driver for xTRTRI (inversion of a triangular matrix).
datatype *A[2];
int info;

void pre() {
    x2matgen(n, n, A[0], A[1]);
}

void post() {
    error = x2vecerr(n * n, A[0], A[1]);
}

void tests() {
    A[0] = xmalloc(n * n);
    A[1] = xmalloc(n * n);

    #define ROUTINE XPREF(trtri)

    // lower and upper triangle, non-unit diagonal
    TEST("L", "N", &n, A[i], &n, &info);
    TEST("U", "N", &n, A[i], &n, &info);

    free(A[0]);
    free(A[1]);
}
diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt
index 9cf518e05..bd31ed9c6 100644
--- a/utest/CMakeLists.txt
+++ b/utest/CMakeLists.txt
@@ -21,6 +21,10 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
target_link_libraries(${OpenBLAS_utest_bin} m)
endif()
+if (${CMAKE_SYSTEM_NAME} STREQUAL "WindowsStore")
+set_target_properties( ${OpenBLAS_utest_bin} PROPERTIES COMPILE_DEFINITIONS "_CRT_SECURE_NO_WARNINGS")
+endif()
+
#Set output for utest
set_target_properties( ${OpenBLAS_utest_bin} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})