author     Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>   2017-07-15 10:40:42 +0200
committer  GitHub <noreply@github.com>                           2017-07-15 10:40:42 +0200
commit     7294fb1d9db59bc8e7b58bd4dd758003092f0886 (patch)
tree       b70627a4f172eb32fd4f7aed97172b4836be95dd
parent     88249ca5f793f9d18584e5388a88054651b9bb7b (diff)
parent     31e086d6a66658e7b04390e69884fe569e0a1e9d (diff)
Merge branch 'develop' into cgroups
169 files changed, 22840 insertions, 843 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c20a57eac..e6ae891b6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -236,7 +236,11 @@ install(TARGETS ${OpenBLAS_LIBNAME}
     DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h
     COMMAND ${GENCONFIG_BIN} ${CMAKE_CURRENT_SOURCE_DIR}/config.h ${CMAKE_CURRENT_SOURCE_DIR}/openblas_config_template.h > ${CMAKE_BINARY_DIR}/openblas_config.h
     )
-  ADD_CUSTOM_TARGET(genconfig DEPENDS openblas_config.h)
+
+  ADD_CUSTOM_TARGET(genconfig
+    ALL
+    DEPENDS openblas_config.h
+    )
   add_dependencies(genconfig ${OpenBLAS_LIBNAME})
   install (FILES ${CMAKE_BINARY_DIR}/openblas_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
@@ -244,6 +248,7 @@ install(TARGETS ${OpenBLAS_LIBNAME}
   message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
   ADD_CUSTOM_TARGET(genf77blas
+    ALL
     COMMAND ${AWK} 'BEGIN{print \"\#ifndef OPENBLAS_F77BLAS_H\" \; print \"\#define OPENBLAS_F77BLAS_H\" \; print \"\#include \\"openblas_config.h\\" \"}; NF {print}; END{print \"\#endif\"}' ${CMAKE_CURRENT_SOURCE_DIR}/common_interface.h > ${CMAKE_BINARY_DIR}/f77blas.h
     DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h
     )
@@ -255,11 +260,11 @@ if(NOT NO_CBLAS)
   message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
   ADD_CUSTOM_TARGET(gencblas
+    ALL
     COMMAND ${SED} 's/common/openblas_config/g' ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h > "${CMAKE_BINARY_DIR}/cblas.tmp"
     COMMAND cp "${CMAKE_BINARY_DIR}/cblas.tmp" "${CMAKE_BINARY_DIR}/cblas.h"
     DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h
     )
-  add_dependencies(gencblas ${OpenBLAS_LIBNAME})
   install (FILES ${CMAKE_BINARY_DIR}/cblas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
diff --git a/Makefile b/Makefile
--- a/Makefile
+++ b/Makefile
@@ -16,14 +16,19 @@ ifneq ($(NO_LAPACK), 1)
 SUBDIRS += lapack
 endif

+RELA =
+ifeq ($(BUILD_RELAPACK), 1)
+RELA = re_lapack
+endif
+
 LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))

 SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench

-.PHONY : all libs netlib test ctest shared install
-.NOTPARALLEL : all libs prof lapack-test install blas-test
+.PHONY : all libs netlib $(RELA) test ctest shared install
+.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test

-all :: libs netlib tests shared
+all :: libs netlib $(RELA) tests shared
	@echo
	@echo " OpenBLAS build complete. ($(LIB_COMPONENTS))"
	@echo
@@ -215,6 +220,14 @@ ifndef NO_LAPACKE
 endif
 endif

+ifeq ($(NO_LAPACK), 1)
+re_lapack :
+
+else
+re_lapack :
+	@$(MAKE) -C relapack
+endif
+
 prof_lapack : lapack_prebuild
	@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
@@ -326,11 +339,7 @@ endif
	@touch $(NETLIB_LAPACK_DIR)/make.inc
	@$(MAKE) -C $(NETLIB_LAPACK_DIR) clean
	@rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h
+	@$(MAKE) -C relapack clean
	@rm -f *.grd Makefile.conf_last config_last.h
	@(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt)
	@echo Done.
-
-# Makefile debugging trick:
-# call print-VARIABLE to see the runtime value of any variable
-print-%:
-	@echo '$*=$($*)'
diff --git a/Makefile.arm b/Makefile.arm
index c189b0c47..eedd39b73 100644
--- a/Makefile.arm
+++ b/Makefile.arm
@@ -1,5 +1,4 @@
-#ifeq logical or
-ifeq ($(CORE), $(filter $(CORE),CORTEXA9 CORTEXA15))
+ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15))
 ifeq ($(OSNAME), Android)
 CCOMMON_OPT += -mfpu=neon -march=armv7-a
 FCOMMON_OPT += -mfpu=neon -march=armv7-a
@@ -9,28 +8,12 @@ FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
 endif
 endif

-ifeq ($(CORE), ARMV7)
-ifeq ($(OSNAME), Android)
-ifeq ($(ARM_SOFTFP_ABI), 1)
-CCOMMON_OPT += -mfpu=neon -march=armv7-a
-FCOMMON_OPT += -mfpu=neon -march=armv7-a
-else
-CCOMMON_OPT += -mfpu=neon -march=armv7-a -Wl,--no-warn-mismatch
-FCOMMON_OPT += -mfpu=neon -march=armv7-a -Wl,--no-warn-mismatch
-endif
-else
-CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
-FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
-endif
-endif
-
 ifeq ($(CORE), ARMV6)
 CCOMMON_OPT += -mfpu=vfp -march=armv6
 FCOMMON_OPT += -mfpu=vfp -march=armv6
 endif

-
 ifeq ($(CORE), ARMV5)
-CCOMMON_OPT += -marm -march=armv5
-FCOMMON_OPT += -marm -march=armv5
+CCOMMON_OPT += -march=armv5
+FCOMMON_OPT += -march=armv5
 endif
diff --git a/Makefile.arm64 b/Makefile.arm64
index 7e9df2f4b..d19e796a5 100644
--- a/Makefile.arm64
+++ b/Makefile.arm64
@@ -20,6 +20,6 @@ FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
 endif

 ifeq ($(CORE), THUNDERX2T99)
-CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
-FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
+CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
+FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
 endif
diff --git a/Makefile.rule b/Makefile.rule
index b6c22f798..8d8aecdc9 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -83,6 +83,9 @@ VERSION = 0.2.20.dev
 # Build LAPACK Deprecated functions since LAPACK 3.6.0
 BUILD_LAPACK_DEPRECATED = 1

+# Build RecursiveLAPACK on top of LAPACK
+# BUILD_RELAPACK = 1
+
 # If you want to use legacy threaded Level 3 implementation.
 # USE_SIMPLE_THREADED_LEVEL3 = 1

@@ -97,7 +100,7 @@ BUILD_LAPACK_DEPRECATED = 1
 NO_WARMUP = 1

 # If you want to disable CPU/Memory affinity on Linux.
-NO_AFFINITY = 1
+#NO_AFFINITY = 1

 # if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
 # BIGNUMA = 1
diff --git a/Makefile.system b/Makefile.system
index 29d3efd53..bd361a1a2 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -242,6 +242,10 @@ EXTRALIB += -lm
 NO_EXPRECISION = 1
 endif

+ifeq ($(OSNAME), Android)
+EXTRALIB += -lm
+endif
+
 ifeq ($(OSNAME), AIX)
 EXTRALIB += -lm
 endif
@@ -486,12 +490,18 @@ BINARY_DEFINED = 1
 CCOMMON_OPT += -marm
 FCOMMON_OPT += -marm

+# If softfp abi is mentioned on the command line, force it.
 ifeq ($(ARM_SOFTFP_ABI), 1)
-CCOMMON_OPT += -mfloat-abi=softfp -DARM_SOFTFP_ABI
-FCOMMON_OPT += -mfloat-abi=softfp -DARM_SOFTFP_ABI
+CCOMMON_OPT += -mfloat-abi=softfp
+FCOMMON_OPT += -mfloat-abi=softfp
+endif
+
+ifeq ($(OSNAME), Android)
+ifeq ($(ARM_SOFTFP_ABI), 1)
+EXTRALIB += -lm
 else
-CCOMMON_OPT += -mfloat-abi=hard
-FCOMMON_OPT += -mfloat-abi=hard
+EXTRALIB += -Wl,-lm_hard
+endif
 endif

 endif
@@ -1119,6 +1129,9 @@ LIB_COMPONENTS += LAPACK
 ifneq ($(NO_LAPACKE), 1)
 LIB_COMPONENTS += LAPACKE
 endif
+ifeq ($(BUILD_RELAPACK), 1)
+LIB_COMPONENTS += ReLAPACK
+endif
 endif

 ifeq ($(ONLY_CBLAS), 1)
diff --git a/cmake/c_check.cmake b/cmake/c_check.cmake
index 115bdaf4e..56ae612ea 100644
--- a/cmake/c_check.cmake
+++ b/cmake/c_check.cmake
@@ -91,3 +91,8 @@ file(WRITE ${TARGET_CONF}
   "#define __${BINARY}BIT__\t1\n"
   "#define FUNDERSCORE\t${FU}\n")

+if (${HOST_OS} STREQUAL "WINDOWSSTORE")
+  file(APPEND ${TARGET_CONF}
+    "#define OS_WINNT\t1\n")
+endif ()
+
diff --git a/cmake/os.cmake b/cmake/os.cmake
index f5a75027c..e9df68d7f 100644
--- a/cmake/os.cmake
+++ b/cmake/os.cmake
@@ -77,7 +77,7 @@ if (CYGWIN)
   set(NO_EXPRECISION 1)
 endif ()

-if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix")
+if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Android")
   if (SMP)
     set(EXTRALIB "${EXTRALIB} -lpthread")
   endif ()
diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake
index 6a21c0bcc..a7f98bfb8 100644
--- a/cmake/prebuild.cmake
+++ b/cmake/prebuild.cmake
@@ -72,20 +72,26 @@ if (MSVC)
   set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC)
 endif()

+if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
+  # disable WindowsStore strict CRT checks
+  set(GETARCH_FLAGS ${GETARCH_FLAGS} -D_CRT_SECURE_NO_WARNINGS)
+endif ()
+
 set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build")
 set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}")
 file(MAKE_DIRECTORY ${GETARCH_DIR})
-try_compile(GETARCH_RESULT ${GETARCH_DIR}
-  SOURCES ${GETARCH_SRC}
-  COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR}
-  OUTPUT_VARIABLE GETARCH_LOG
-  COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
-)
-
-if (NOT ${GETARCH_RESULT})
-  MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}")
+if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
+  try_compile(GETARCH_RESULT ${GETARCH_DIR}
+    SOURCES ${GETARCH_SRC}
+    COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR}
+    OUTPUT_VARIABLE GETARCH_LOG
+    COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
+  )
+
+  if (NOT ${GETARCH_RESULT})
+    MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}")
+  endif ()
 endif ()
-
 message(STATUS "Running getarch")

 # use the cmake binary w/ the -E param to run a shell command in a cross-platform way
@@ -101,15 +107,17 @@ ParseGetArchVars(${GETARCH_MAKE_OUT})
 set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build")
 set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}")
 file(MAKE_DIRECTORY ${GETARCH2_DIR})
-try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
-  SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c
-  COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR}
-  OUTPUT_VARIABLE GETARCH2_LOG
-  COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
-)
-
-if (NOT ${GETARCH2_RESULT})
-  MESSAGE(FATAL_ERROR "Compiling getarch_2nd failed ${GETARCH2_LOG}")
+if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
+  try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
+    SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c
+    COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR}
+    OUTPUT_VARIABLE GETARCH2_LOG
+    COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
+  )
+
+  if (NOT ${GETARCH2_RESULT})
+    MESSAGE(FATAL_ERROR "Compiling getarch_2nd failed ${GETARCH2_LOG}")
+  endif ()
 endif ()

 # use the cmake binary w/ the -E param to run a shell command in a cross-platform way
@@ -126,13 +134,15 @@ set(GEN_CONFIG_H_BIN "gen_config_h${CMAKE_EXECUTABLE_SUFFIX}")
 set(GEN_CONFIG_H_FLAGS "-DVERSION=\"${OpenBLAS_VERSION}\"")
 file(MAKE_DIRECTORY ${GEN_CONFIG_H_DIR})

-try_compile(GEN_CONFIG_H_RESULT ${GEN_CONFIG_H_DIR}
-  SOURCES ${PROJECT_SOURCE_DIR}/gen_config_h.c
-  COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GEN_CONFIG_H_FLAGS} -I${PROJECT_SOURCE_DIR}
-  OUTPUT_VARIABLE GEN_CONFIG_H_LOG
-  COPY_FILE ${PROJECT_BINARY_DIR}/${GEN_CONFIG_H_BIN}
-)
-
-if (NOT ${GEN_CONFIG_H_RESULT})
-  MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}")
-endif ()
+if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
+  try_compile(GEN_CONFIG_H_RESULT ${GEN_CONFIG_H_DIR}
+    SOURCES ${PROJECT_SOURCE_DIR}/gen_config_h.c
+    COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GEN_CONFIG_H_FLAGS} -I${PROJECT_SOURCE_DIR}
+    OUTPUT_VARIABLE GEN_CONFIG_H_LOG
+    COPY_FILE ${PROJECT_BINARY_DIR}/${GEN_CONFIG_H_BIN}
+  )
+
+  if (NOT ${GEN_CONFIG_H_RESULT})
+    MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}")
+  endif ()
+endif ()
\ No newline at end of file
diff --git a/common.h b/common.h
--- a/common.h
+++ b/common.h
@@ -425,6 +425,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246
 #endif

 #ifndef ASSEMBLER
+#ifdef OS_WINDOWSSTORE
+typedef char env_var_t[MAX_PATH];
+#define readenv(p, n) 0
+#else
 #ifdef OS_WINDOWS
 typedef char env_var_t[MAX_PATH];
 #define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p))
@@ -432,6 +436,7 @@ typedef char env_var_t[MAX_PATH];
 typedef char* env_var_t;
 #define readenv(p, n) ((p)=getenv(n))
 #endif
+#endif

 #if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS)
 #ifdef _POSIX_MONOTONIC_CLOCK
@@ -654,7 +659,11 @@ static __inline void blas_unlock(volatile BLASULONG *address){
   *address = 0;
 }

-
+#ifdef OS_WINDOWSSTORE
+static __inline int readenv_atoi(char *env) {
+  return 0;
+}
+#else
 #ifdef OS_WINDOWS
 static __inline int readenv_atoi(char *env) {
   env_var_t p;
@@ -669,7 +678,7 @@ static __inline int readenv_atoi(char *env) {
   return(0);
 }
 #endif
-
+#endif

 #if !defined(XDOUBLE) || !defined(QUAD_PRECISION)
diff --git a/common_arm.h b/common_arm.h
index a17acb448..27fa76b76 100644
--- a/common_arm.h
+++ b/common_arm.h
@@ -111,11 +111,6 @@ REALNAME:

 #define PROFCODE

-#ifdef __ARM_PCS
-//-mfloat-abi=softfp
-#define SOFT_FLOAT_ABI
-#endif
-
 #endif
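Note on the common.h hunk above: UWP (Windows Store) apps cannot read the process environment, so readenv() and readenv_atoi() are stubbed to always report "not set". A minimal C sketch of the resulting three-way dispatch follows; the constant 260 stands in for MAX_PATH, and the env_as_int() helper is hypothetical, shown only to illustrate the call pattern:

    /* Sketch: environment lookup under the three OS variants.
     * OS_WINDOWSSTORE / OS_WINDOWS are assumed to come from the
     * generated OpenBLAS config (see cmake/c_check.cmake above). */
    #include <stdlib.h>

    #ifdef OS_WINDOWSSTORE
    typedef char env_var_t[260];
    #define readenv(p, n) 0                 /* UWP: no environment access */
    #elif defined(OS_WINDOWS)
    #include <windows.h>
    typedef char env_var_t[MAX_PATH];
    #define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p))
    #else
    typedef char *env_var_t;
    #define readenv(p, n) ((p) = getenv(n)) /* POSIX: pointer into environ */
    #endif

    /* hypothetical helper: on Windows Store this always returns 0 */
    static int env_as_int(const char *name) {
      env_var_t p;
      return readenv(p, name) ? atoi((char *)p) : 0;
    }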
diff --git a/driver/level2/gbmv_thread.c b/driver/level2/gbmv_thread.c
index ef9d58d76..e86b565f8 100644
--- a/driver/level2/gbmv_thread.c
+++ b/driver/level2/gbmv_thread.c
@@ -177,7 +177,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT
   blas_arg_t args;
   blas_queue_t queue[MAX_CPU_NUMBER];
-  BLASLONG range_m[MAX_CPU_NUMBER];
+  BLASLONG range_m[MAX_CPU_NUMBER + 1];
   BLASLONG range_n[MAX_CPU_NUMBER + 1];

   BLASLONG width, i, num_cpu;
diff --git a/driver/level2/sbmv_thread.c b/driver/level2/sbmv_thread.c
index a0377d638..5718c0ec9 100644
--- a/driver/level2/sbmv_thread.c
+++ b/driver/level2/sbmv_thread.c
@@ -177,7 +177,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
 #endif

   blas_arg_t args;
-  blas_queue_t queue[MAX_CPU_NUMBER];
+  blas_queue_t queue[MAX_CPU_NUMBER + 1];
   BLASLONG range_m[MAX_CPU_NUMBER + 1];
   BLASLONG range_n[MAX_CPU_NUMBER];
diff --git a/driver/level2/spmv_thread.c b/driver/level2/spmv_thread.c
index f8ae3cdcd..035300841 100644
--- a/driver/level2/spmv_thread.c
+++ b/driver/level2/spmv_thread.c
@@ -182,7 +182,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y,
   blas_arg_t args;
   blas_queue_t queue[MAX_CPU_NUMBER];
   BLASLONG range_m[MAX_CPU_NUMBER + 1];
-  BLASLONG range_n[MAX_CPU_NUMBER];
+  BLASLONG range_n[MAX_CPU_NUMBER + 1];

   BLASLONG width, i, num_cpu;
diff --git a/driver/level2/tbmv_thread.c b/driver/level2/tbmv_thread.c
index bbb1c50eb..226a922e9 100644
--- a/driver/level2/tbmv_thread.c
+++ b/driver/level2/tbmv_thread.c
@@ -221,7 +221,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc
   blas_arg_t args;
   blas_queue_t queue[MAX_CPU_NUMBER];
   BLASLONG range_m[MAX_CPU_NUMBER + 1];
-  BLASLONG range_n[MAX_CPU_NUMBER];
+  BLASLONG range_n[MAX_CPU_NUMBER + 1];

   BLASLONG width, i, num_cpu;
diff --git a/driver/level2/tpmv_thread.c b/driver/level2/tpmv_thread.c
index 47dc1daf9..c91b52775 100644
--- a/driver/level2/tpmv_thread.c
+++ b/driver/level2/tpmv_thread.c
@@ -243,7 +243,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr
   blas_arg_t args;
   blas_queue_t queue[MAX_CPU_NUMBER];
   BLASLONG range_m[MAX_CPU_NUMBER + 1];
-  BLASLONG range_n[MAX_CPU_NUMBER];
+  BLASLONG range_n[MAX_CPU_NUMBER + 1];

   BLASLONG width, i, num_cpu;
diff --git a/driver/level2/trmv_thread.c b/driver/level2/trmv_thread.c
index 42edb83cb..0a155366c 100644
--- a/driver/level2/trmv_thread.c
+++ b/driver/level2/trmv_thread.c
@@ -281,7 +281,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
   blas_arg_t args;
   blas_queue_t queue[MAX_CPU_NUMBER];
   BLASLONG range_m[MAX_CPU_NUMBER + 1];
-  BLASLONG range_n[MAX_CPU_NUMBER];
+  BLASLONG range_n[MAX_CPU_NUMBER + 1];

   BLASLONG width, i, num_cpu;
diff --git a/driver/level3/syrk_thread.c b/driver/level3/syrk_thread.c
index 94274be72..5f40853dc 100644
--- a/driver/level3/syrk_thread.c
+++ b/driver/level3/syrk_thread.c
@@ -109,7 +109,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
     if (nthreads - num_cpu > 1) {

       di = (double)i;
-      width = ((BLASLONG)( sqrt(di * di + dnum) - di) + mask) & ~mask;
+      width = (BLASLONG)(( sqrt(di * di + dnum) - di + mask)/(mask+1)) * (mask+1);

       if ((width <= 0) || (width > n_to - i)) width = n_to - i;
@@ -149,7 +149,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
     if (nthreads - num_cpu > 1) {

       di = (double)(arg -> n - i);
-      width = ((BLASLONG)(-sqrt(di * di + dnum) + di) + mask) & ~mask;
+      width = ((BLASLONG)((-sqrt(di * di + dnum) + di) + mask)/(mask+1)) * (mask+1);

       if ((width <= 0) || (width > n_to - i)) width = n_to - i;
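Note on the syrk_thread.c hunks above: `(x + mask) & ~mask` only rounds x up to a multiple of mask+1 when mask+1 is a power of two, while `((x + mask) / (mask+1)) * (mask+1)` works for any blocking factor. A small self-contained comparison, with illustrative values rather than OpenBLAS code:

    /* Round-up comparison for mask = 2, i.e. a blocking factor of 3. */
    #include <stdio.h>

    int main(void) {
      long mask = 2;                                   /* mask + 1 == 3 */
      for (long x = 1; x <= 6; x++) {
        long old_w = (x + mask) & ~mask;               /* bit trick: pow2 only */
        long new_w = ((x + mask) / (mask + 1)) * (mask + 1);  /* general */
        printf("x=%ld  old=%ld  new=%ld\n", x, old_w, new_w);
      }
      return 0;   /* e.g. x=4: old gives 4, new gives the correct 6 */
    }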
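The queue/range array enlargements in the level-2 thread drivers above follow the same fence-post rule: splitting a dimension across k threads stores k+1 partition boundaries, so thread t works on [range[t], range[t+1]). A minimal sketch of that indexing, with a simplified splitting rule that is not the one the drivers use:

    #include <stdio.h>

    #define MAX_CPU_NUMBER 4

    int main(void) {
      long range[MAX_CPU_NUMBER + 1];   /* k partitions need k+1 boundaries */
      long n = 100, num_cpu = MAX_CPU_NUMBER;

      for (long t = 0; t <= num_cpu; t++)
        range[t] = n * t / num_cpu;     /* writes index num_cpu itself */

      for (long t = 0; t < num_cpu; t++)
        printf("thread %ld: rows [%ld, %ld)\n", t, range[t], range[t + 1]);
      return 0;
    }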
diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt
index 489d40c76..8e0be1e0e 100644
--- a/driver/others/CMakeLists.txt
+++ b/driver/others/CMakeLists.txt
@@ -12,6 +12,8 @@ if (SMP)
     set(BLAS_SERVER blas_server_omp.c)
   elseif (${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
     set(BLAS_SERVER blas_server_win32.c)
+  elseif (${CMAKE_SYSTEM_NAME} STREQUAL "WindowsStore")
+    set(BLAS_SERVER blas_server_win32.c)
   endif ()

   if (NOT DEFINED BLAS_SERVER)
diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c
index 081bdd7d4..cde8ca793 100644
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@@ -443,8 +443,11 @@ int BLASFUNC(blas_thread_shutdown)(void){
     SetEvent(pool.killed);

     for(i = 0; i < blas_num_threads - 1; i++){
-      WaitForSingleObject(blas_threads[i], 5);  //INFINITE);
-      TerminateThread(blas_threads[i],0);
+      WaitForSingleObject(blas_threads[i], 5);  //INFINITE);
+#ifndef OS_WINDOWSSTORE
+// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
+      TerminateThread(blas_threads[i],0);
+#endif
     }

     blas_server_avail = 0;
diff --git a/driver/others/init.c b/driver/others/init.c
index 4093776db..1f07a24ac 100644
--- a/driver/others/init.c
+++ b/driver/others/init.c
@@ -825,10 +825,11 @@ void gotoblas_affinity_init(void) {

   common -> shmid = pshmid;

-  if (common -> magic != SH_MAGIC) {
+  if (common -> magic != SH_MAGIC)
     cpu_set_t *cpusetp;
     int nums;
     int ret;
+
 #ifdef DEBUG
     fprintf(stderr, "Shared Memory Initialization.\n");
 #endif
@@ -883,7 +884,7 @@ void gotoblas_affinity_init(void) {
     if (common -> num_nodes > 1) numa_mapping();

     common -> final_num_procs = 0;
-    for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1; //Make the max cpu number.
+    for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1; //Make the max cpu number.

     for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0;
diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6
index 16bde105b..960dae67b 100644
--- a/kernel/arm/KERNEL.ARMV6
+++ b/kernel/arm/KERNEL.ARMV6
@@ -1,7 +1,5 @@
+include $(KERNELDIR)/KERNEL.ARMV5

-
-
-###############################################################################
 SAMAXKERNEL = iamax_vfp.S
 DAMAXKERNEL = iamax_vfp.S
 CAMAXKERNEL = iamax_vfp.S
@@ -44,10 +42,10 @@ DAXPYKERNEL = axpy_vfp.S
 CAXPYKERNEL = axpy_vfp.S
 ZAXPYKERNEL = axpy_vfp.S

-SCOPYKERNEL = copy.c
-DCOPYKERNEL = copy.c
-CCOPYKERNEL = zcopy.c
-ZCOPYKERNEL = zcopy.c
+SROTKERNEL = rot_vfp.S
+DROTKERNEL = rot_vfp.S
+CROTKERNEL = rot_vfp.S
+ZROTKERNEL = rot_vfp.S

 SDOTKERNEL = sdot_vfp.S
 DDOTKERNEL = ddot_vfp.S
@@ -59,16 +57,6 @@ DNRM2KERNEL = nrm2_vfp.S
 CNRM2KERNEL = nrm2_vfp.S
 ZNRM2KERNEL = nrm2_vfp.S

-SROTKERNEL = rot_vfp.S
-DROTKERNEL = rot_vfp.S
-CROTKERNEL = rot_vfp.S
-ZROTKERNEL = rot_vfp.S
-
-SSCALKERNEL = scal.c
-DSCALKERNEL = scal.c
-CSCALKERNEL = zscal.c
-ZSCALKERNEL = zscal.c
-
 SSWAPKERNEL = swap_vfp.S
 DSWAPKERNEL = swap_vfp.S
 CSWAPKERNEL = swap_vfp.S
@@ -84,26 +72,25 @@ DGEMVTKERNEL = gemv_t_vfp.S
 CGEMVTKERNEL = cgemv_t_vfp.S
 ZGEMVTKERNEL = zgemv_t_vfp.S

-STRMMKERNEL = strmm_kernel_4x2_vfp.S
-DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S
-CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S
-ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S
-
 SGEMMKERNEL = sgemm_kernel_4x2_vfp.S
+ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
 SGEMMINCOPY = sgemm_ncopy_4_vfp.S
 SGEMMITCOPY = sgemm_tcopy_4_vfp.S
 SGEMMINCOPYOBJ = sgemm_incopy.o
 SGEMMITCOPYOBJ = sgemm_itcopy.o
+endif
 SGEMMONCOPY = sgemm_ncopy_2_vfp.S
-SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
-SGEMMONCOPYOBJ = sgemm_oncopy.o
-SGEMMOTCOPYOBJ = sgemm_otcopy.o
+SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
+SGEMMONCOPYOBJ = sgemm_oncopy.o
+SGEMMOTCOPYOBJ = sgemm_otcopy.o

 DGEMMKERNEL = dgemm_kernel_4x2_vfp.S
+ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
 DGEMMINCOPY = dgemm_ncopy_4_vfp.S
 DGEMMITCOPY = dgemm_tcopy_4_vfp.S
 DGEMMINCOPYOBJ = dgemm_incopy.o
 DGEMMITCOPYOBJ = dgemm_itcopy.o
+endif
 DGEMMONCOPY = dgemm_ncopy_2_vfp.S
 DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
 DGEMMONCOPYOBJ = dgemm_oncopy.o
@@ -121,26 +108,8 @@ ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S
 ZGEMMONCOPYOBJ = zgemm_oncopy.o
 ZGEMMOTCOPYOBJ = zgemm_otcopy.o

-STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-
-
+STRMMKERNEL = strmm_kernel_4x2_vfp.S
+DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S
+CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S
+ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S
diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7
index d5cd94fbd..5e0b4cfb8 100644
--- a/kernel/arm/KERNEL.ARMV7
+++ b/kernel/arm/KERNEL.ARMV7
@@ -1,91 +1,12 @@
-
-#################################################################################
-SAMAXKERNEL = iamax_vfp.S
-DAMAXKERNEL = iamax_vfp.S
-CAMAXKERNEL = iamax_vfp.S
-ZAMAXKERNEL = iamax_vfp.S
-
-SAMINKERNEL = iamax_vfp.S
-DAMINKERNEL = iamax_vfp.S
-CAMINKERNEL = iamax_vfp.S
-ZAMINKERNEL = iamax_vfp.S
-
-SMAXKERNEL = iamax_vfp.S
-DMAXKERNEL = iamax_vfp.S
-
-SMINKERNEL = iamax_vfp.S
-DMINKERNEL = iamax_vfp.S
-
-ISAMAXKERNEL = iamax_vfp.S
-IDAMAXKERNEL = iamax_vfp.S
-ICAMAXKERNEL = iamax_vfp.S
-IZAMAXKERNEL = iamax_vfp.S
-
-ISAMINKERNEL = iamax_vfp.S
-IDAMINKERNEL = iamax_vfp.S
-ICAMINKERNEL = iamax_vfp.S
-IZAMINKERNEL = iamax_vfp.S
-
-ISMAXKERNEL = iamax_vfp.S
-IDMAXKERNEL = iamax_vfp.S
-
-ISMINKERNEL = iamax_vfp.S
-IDMINKERNEL = iamax_vfp.S
-
-SSWAPKERNEL = swap_vfp.S
-DSWAPKERNEL = swap_vfp.S
-CSWAPKERNEL = swap_vfp.S
-ZSWAPKERNEL = swap_vfp.S
-
-SASUMKERNEL = asum_vfp.S
-DASUMKERNEL = asum_vfp.S
-CASUMKERNEL = asum_vfp.S
-ZASUMKERNEL = asum_vfp.S
-
-SAXPYKERNEL = axpy_vfp.S
-DAXPYKERNEL = axpy_vfp.S
-CAXPYKERNEL = axpy_vfp.S
-ZAXPYKERNEL = axpy_vfp.S
-
-SCOPYKERNEL = copy.c
-DCOPYKERNEL = copy.c
-CCOPYKERNEL = zcopy.c
-ZCOPYKERNEL = zcopy.c
-
-SDOTKERNEL = sdot_vfp.S
-DDOTKERNEL = ddot_vfp.S
-CDOTKERNEL = cdot_vfp.S
-ZDOTKERNEL = zdot_vfp.S
+include $(KERNELDIR)/KERNEL.ARMV6

 SNRM2KERNEL = nrm2_vfpv3.S
 DNRM2KERNEL = nrm2_vfpv3.S
 CNRM2KERNEL = nrm2_vfpv3.S
 ZNRM2KERNEL = nrm2_vfpv3.S

-SROTKERNEL = rot_vfp.S
-DROTKERNEL = rot_vfp.S
-CROTKERNEL = rot_vfp.S
-ZROTKERNEL = rot_vfp.S
-
-SSCALKERNEL = scal.c
-DSCALKERNEL = scal.c
-CSCALKERNEL = zscal.c
-ZSCALKERNEL = zscal.c
-
 SGEMVNKERNEL = gemv_n_vfpv3.S
 DGEMVNKERNEL = gemv_n_vfpv3.S
-CGEMVNKERNEL = cgemv_n_vfp.S
-ZGEMVNKERNEL = zgemv_n_vfp.S
-
-SGEMVTKERNEL = gemv_t_vfp.S
-DGEMVTKERNEL = gemv_t_vfp.S
-CGEMVTKERNEL = cgemv_t_vfp.S
-ZGEMVTKERNEL = zgemv_t_vfp.S
-
-STRMMKERNEL = strmm_kernel_4x4_vfpv3.S
-DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S
-CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S
-ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S

 SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S
 SGEMMONCOPY = sgemm_ncopy_4_vfp.S
@@ -100,35 +21,10 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o
 DGEMMOTCOPYOBJ = dgemm_otcopy.o

 CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S
-CGEMMONCOPY = cgemm_ncopy_2_vfp.S
-CGEMMOTCOPY = cgemm_tcopy_2_vfp.S
-CGEMMONCOPYOBJ = cgemm_oncopy.o
-CGEMMOTCOPYOBJ = cgemm_otcopy.o
-
 ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S
-ZGEMMONCOPY = zgemm_ncopy_2_vfp.S
-ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S
-ZGEMMONCOPYOBJ = zgemm_oncopy.o
-ZGEMMOTCOPYOBJ = zgemm_otcopy.o
-
-STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+STRMMKERNEL = strmm_kernel_4x4_vfpv3.S
+DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S
+CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S
+ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S
diff --git a/kernel/arm/asum_vfp.S b/kernel/arm/asum_vfp.S
index fe6242a5b..5b08e5028 100644
--- a/kernel/arm/asum_vfp.S
+++ b/kernel/arm/asum_vfp.S
@@ -475,6 +475,14 @@ asum_kernel_L999:
	vadd.f32 s0 , s0, s1   // set return value
 #endif

+#if !defined(__ARM_PCS_VFP)
+#if !defined(DOUBLE)
+	vmov	r0, s0
+#else
+	vmov	r0, r1, d0
+#endif
+#endif
+
	bx	lr

	EPILOGUE
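The asum hunk above is the first of many soft-float ABI fixes in this merge: when `__ARM_PCS_VFP` is not defined (i.e. `-mfloat-abi=softfp`), scalar FP results must be returned in r0 (float) or the r0:r1 pair (double) rather than in s0/d0, which is what the added `vmov` epilogues do. A C sketch of the function whose return value those epilogues marshal; this is reference semantics only, not the kernel's loop structure:

    /* sasum reference: the compiler places the return value in s0 under
     * -mfloat-abi=hard and in r0 under -mfloat-abi=softfp, so a hand-written
     * asm kernel must do the equivalent move itself. */
    float sasum_ref(long n, const float *x, long incx) {
      float sum = 0.0f;
      for (long i = 0; i < n; i++) {
        float v = x[i * incx];
        sum += (v < 0.0f) ? -v : v;
      }
      return sum;
    }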
diff --git a/kernel/arm/axpy_vfp.S b/kernel/arm/axpy_vfp.S
index 8e5334f62..37515f399 100644
--- a/kernel/arm/axpy_vfp.S
+++ b/kernel/arm/axpy_vfp.S
@@ -38,18 +38,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #define STACKSIZE 256

-#ifndef ARM_SOFTFP_ABI
-//hard
-#define OLD_INC_X	[fp, #0 ]
-#define OLD_Y		[fp, #4 ]
-#define OLD_INC_Y	[fp, #8 ]
-#else
+#if !defined(__ARM_PCS_VFP)
+
+#if !defined(COMPLEX)
+
+#if !defined(DOUBLE)
+#define OLD_ALPHA	r3
 #define OLD_X		[fp, #0 ]
 #define OLD_INC_X	[fp, #4 ]
 #define OLD_Y		[fp, #8 ]
 #define OLD_INC_Y	[fp, #12 ]
+#else
+#define OLD_ALPHA	[fp, #0]
+#define OLD_X		[fp, #8 ]
+#define OLD_INC_X	[fp, #12 ]
+#define OLD_Y		[fp, #16 ]
+#define OLD_INC_Y	[fp, #20 ]
 #endif
-
+
+#else //COMPLEX
+
+#if !defined(DOUBLE)
+#define OLD_ALPHAR	r3
+#define OLD_ALPHAI	[fp, #0 ]
+#define OLD_X		[fp, #4 ]
+#define OLD_INC_X	[fp, #8 ]
+#define OLD_Y		[fp, #12 ]
+#define OLD_INC_Y	[fp, #16 ]
+#else
+#define OLD_ALPHAR	[fp, #0]
+#define OLD_ALPHAI	[fp, #8]
+#define OLD_X		[fp, #16 ]
+#define OLD_INC_X	[fp, #20 ]
+#define OLD_Y		[fp, #24 ]
+#define OLD_INC_Y	[fp, #28 ]
+#endif
+
+#endif //!defined(COMPLEX)
+
+#else //__ARM_PCS_VFP
+
+#define OLD_INC_X	[fp, #0 ]
+#define OLD_Y		[fp, #4 ]
+#define OLD_INC_Y	[fp, #8 ]
+
+#endif //!defined(__ARM_PCS_VFP)
+
 #define N	r0
 #define Y	r1
 #define INC_X	r2
@@ -71,14 +105,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(DOUBLE)

 #define FMAC_R1 fmacd
-#define FMAC_R2 fnmacd
+#define FMAC_R2 vmls.f64
 #define FMAC_I1 fmacd
 #define FMAC_I2 fmacd

 #else

 #define FMAC_R1 fmacs
-#define FMAC_R2 fnmacs
+#define FMAC_R2 vmls.f32
 #define FMAC_I1 fmacs
 #define FMAC_I2 fmacs

@@ -90,14 +124,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #define FMAC_R1 fmacd
 #define FMAC_R2 fmacd
-#define FMAC_I1 fnmacd
+#define FMAC_I1 vmls.f64
 #define FMAC_I2 fmacd

 #else

 #define FMAC_R1 fmacs
 #define FMAC_R2 fmacs
-#define FMAC_I1 fnmacs
+#define FMAC_I1 vmls.f32
 #define FMAC_I2 fmacs

 #endif

@@ -370,13 +404,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	add	fp, sp, #8
	sub	sp, sp, #STACKSIZE		// reserve stack

-#ifdef ARM_SOFTFP_ABI
-#ifndef DOUBLE
-	vmov	s0, r3			//move alpha to s0
+#if !defined(__ARM_PCS_VFP)
+#if !defined(COMPLEX)
+#if !defined(DOUBLE)
+	vmov	s0, OLD_ALPHA
	ldr	X, OLD_X
+#else
+	vldr	d0, OLD_ALPHA
+	ldr	X, OLD_X
+#endif
+#else //COMPLEX
+#if !defined(DOUBLE)
+	vmov	s0, OLD_ALPHAR
+	vldr	s1, OLD_ALPHAI
+	ldr	X, OLD_X
+#else
+	vldr	d0, OLD_ALPHAR
+	vldr	d1, OLD_ALPHAI
+	ldr	X, OLD_X
+#endif
 #endif
 #endif
-
+
	ldr	INC_X , OLD_INC_X
	ldr	Y, OLD_Y
	ldr	INC_Y , OLD_INC_Y
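Throughout these kernels the pre-UAL mnemonics fnmacs/fnmacd (multiply-subtract, d = d - n*m) are rewritten as the unified-syntax vmls.f32/vmls.f64; clang's integrated assembler only accepts the UAL spellings. A C model of the accumulation forms the axpy macro table selects, shown as a sketch of the NN/NT/TN/TT case:

    /* VFP multiply-accumulate family, modelled in C:
     *   fmacs  / vmla.f32  : d = d + n * m
     *   fnmacs / vmls.f32  : d = d - n * m   (the rewritten instruction)
     *   fnmuls / vnmul.f32 : d = -(n * m)
     * One complex axpy step, y += alpha * x: */
    static inline void caxpy_step(float ar, float ai, float xr, float xi,
                                  float *yr, float *yi) {
      *yr += ar * xr;   /* FMAC_R1: fmacs  (vmla.f32) */
      *yr -= ai * xi;   /* FMAC_R2: fnmacs (vmls.f32) */
      *yi += ar * xi;   /* FMAC_I1: fmacs */
      *yi += ai * xr;   /* FMAC_I2: fmacs */
    }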
diff --git a/kernel/arm/cdot_vfp.S b/kernel/arm/cdot_vfp.S
index 0497b6d83..e5a6e4d35 100644
--- a/kernel/arm/cdot_vfp.S
+++ b/kernel/arm/cdot_vfp.S
@@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define N	r0
 #define X	r1
 #define INC_X	r2
-#define OLD_Y	r3
-

 /******************************************************
 * [fp, #-128] - [fp, #-64] is reserved
@@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * registers
 *******************************************************/

-#define OLD_INC_Y	[fp, #4 ]
+#if !defined(__ARM_PCS_VFP)
+#define OLD_RETURN_ADDR	r0
+#define OLD_N		r1
+#define OLD_X		r2
+#define OLD_INC_X	r3
+#define OLD_Y		[fp, #0 ]
+#define OLD_INC_Y	[fp, #4 ]
+#define RETURN_ADDR	r8
+#else
+#define OLD_Y		r3
+#define OLD_INC_Y	[fp, #0 ]
+#endif

 #define I	r5
 #define Y	r6
@@ -179,7 +188,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	.align 5
	push	{r4 - r9, fp}
-	add	fp, sp, #24
+	add	fp, sp, #28
	sub	sp, sp, #STACKSIZE		// reserve stack

	sub	r4, fp, #128
@@ -191,8 +200,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	vmov	s2, s0
	vmov	s3, s0

+#if !defined(__ARM_PCS_VFP)
+	mov	RETURN_ADDR, OLD_RETURN_ADDR
+	mov	N, OLD_N
+	mov	X, OLD_X
+	mov	INC_X, OLD_INC_X
+	ldr	Y, OLD_Y
+	ldr	INC_Y, OLD_INC_Y
+#else
	mov	Y, OLD_Y
	ldr	INC_Y, OLD_INC_Y
+#endif

	cmp	N, #0
	ble	cdot_kernel_L999
@@ -265,7 +283,6 @@ cdot_kernel_S10:

 cdot_kernel_L999:

-
	sub	r3, fp, #128
	vldm	r3, { s8 - s15}			// restore floating point registers

@@ -276,8 +293,11 @@ cdot_kernel_L999:
	vadd.f32 s0 , s0, s2
	vsub.f32 s1 , s1, s3
 #endif
+#if !defined(__ARM_PCS_VFP)
+	vstm	RETURN_ADDR, {s0 - s1}
+#endif

-	sub	sp, fp, #24
+	sub	sp, fp, #28
	pop	{r4 - r9, fp}
	bx	lr
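Why cdot grows a hidden first argument under softfp: the base AAPCS returns a composite larger than 4 bytes (here an 8-byte single-precision complex) through a caller-allocated buffer whose address arrives in r0, shifting every visible parameter one slot right. That is exactly the OLD_RETURN_ADDR/OLD_N/OLD_X/OLD_INC_X remapping above, and the `vstm RETURN_ADDR, {s0 - s1}` store in the epilogue. Reference semantics in C, as a sketch (unconjugated dot product only):

    typedef struct { float real, imag; } complex_float;

    complex_float cdotu_ref(long n, const float *x, long incx,
                            const float *y, long incy) {
      complex_float r = { 0.0f, 0.0f };  /* softfp: written via the r0 pointer */
      for (long i = 0; i < n; i++) {
        float xr = x[2*i*incx], xi = x[2*i*incx + 1];
        float yr = y[2*i*incy], yi = y[2*i*incy + 1];
        r.real += xr * yr - xi * yi;
        r.imag += xr * yi + xi * yr;
      }
      return r;   /* the asm kernel does vstm RETURN_ADDR, {s0 - s1} */
    }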
diff --git a/kernel/arm/cgemm_kernel_2x2_vfp.S b/kernel/arm/cgemm_kernel_2x2_vfp.S
index f0517cb47..71bc50efd 100644
--- a/kernel/arm/cgemm_kernel_2x2_vfp.S
+++ b/kernel/arm/cgemm_kernel_2x2_vfp.S
@@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ALPHA_I	[fp, #-272]
 #define ALPHA_R	[fp, #-280]

+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHAR_SOFTFP	r3
+#define OLD_ALPHAI_SOFTFP	[fp, #4]
+#define OLD_A_SOFTFP	[fp, #8 ]
+#define B	[fp, #12 ]
+#define C	[fp, #16 ]
+#define OLD_LDC	[fp, #20 ]
+#else
 #define B	[fp, #4 ]
 #define C	[fp, #8 ]
 #define OLD_LDC	[fp, #12 ]
+#endif

 #define I	r0
 #define J	r1
@@ -94,42 +103,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)

-	#define	KMAC_R	fnmacs
+	#define	KMAC_R	vmls.f32
	#define	KMAC_I	fmacs

	#define	FMAC_R1	fmacs
-	#define	FMAC_R2	fnmacs
+	#define	FMAC_R2	vmls.f32
	#define	FMAC_I1	fmacs
	#define	FMAC_I2	fmacs

 #elif defined(CN) || defined(CT)

	#define	KMAC_R	fmacs
-	#define	KMAC_I	fnmacs
+	#define	KMAC_I	vmls.f32

	#define	FMAC_R1	fmacs
-	#define	FMAC_R2	fnmacs
+	#define	FMAC_R2	vmls.f32
	#define	FMAC_I1	fmacs
	#define	FMAC_I2	fmacs

 #elif defined(NC) || defined(TC)

	#define	KMAC_R	fmacs
-	#define	KMAC_I	fnmacs
+	#define	KMAC_I	vmls.f32

	#define	FMAC_R1	fmacs
	#define	FMAC_R2	fmacs
-	#define	FMAC_I1	fnmacs
+	#define	FMAC_I1	vmls.f32
	#define	FMAC_I2	fmacs

 #else

-	#define	KMAC_R	fnmacs
+	#define	KMAC_R	vmls.f32
	#define	KMAC_I	fmacs

	#define	FMAC_R1	fmacs
	#define	FMAC_R2	fmacs
-	#define	FMAC_I1	fnmacs
+	#define	FMAC_I1	vmls.f32
	#define	FMAC_I2	fmacs

 #endif

@@ -816,6 +825,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	add	fp, sp, #24
	sub	sp, sp, #STACKSIZE		// reserve stack

+#if !defined(__ARM_PCS_VFP)
+	vmov	OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
+	vldr	OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
+	ldr	OLD_A, OLD_A_SOFTFP
+#endif
	str	OLD_M, M
	str	OLD_N, N
	str	OLD_K, K
diff --git a/kernel/arm/cgemm_kernel_2x2_vfpv3.S b/kernel/arm/cgemm_kernel_2x2_vfpv3.S
index cf132a184..9d473ad78 100644
--- a/kernel/arm/cgemm_kernel_2x2_vfpv3.S
+++ b/kernel/arm/cgemm_kernel_2x2_vfpv3.S
@@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ALPHA_I	[fp, #-272]
 #define ALPHA_R	[fp, #-280]

+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHAR_SOFTFP	r3
+#define OLD_ALPHAI_SOFTFP	[fp, #4]
+#define OLD_A_SOFTFP	[fp, #8 ]
+#define B	[fp, #12 ]
+#define C	[fp, #16 ]
+#define OLD_LDC	[fp, #20 ]
+#else
 #define B	[fp, #4 ]
 #define C	[fp, #8 ]
 #define OLD_LDC	[fp, #12 ]
+#endif

 #define I	r0
 #define J	r1
@@ -106,10 +115,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	#define	FADD_R	fsubs
	#define	FADD_I	fadds

-	#define	FMAC_R1	fnmacs
-	#define	FMAC_R2	fnmacs
+	#define	FMAC_R1	vmls.f32
+	#define	FMAC_R2	vmls.f32
	#define	FMAC_I1	fmacs
-	#define	FMAC_I2	fnmacs
+	#define	FMAC_I2	vmls.f32

 #elif defined(CN) || defined(CT)

@@ -118,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	#define	FMAC_R1	fmacs
	#define	FMAC_R2	fmacs
-	#define	FMAC_I1	fnmacs
+	#define	FMAC_I1	vmls.f32
	#define	FMAC_I2	fmacs

 #elif defined(NC) || defined(TC)

@@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	#define	FADD_I	fsubs

	#define	FMAC_R1	fmacs
-	#define	FMAC_R2	fnmacs
+	#define	FMAC_R2	vmls.f32
	#define	FMAC_I1	fmacs
	#define	FMAC_I2	fmacs

@@ -136,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	#define	FADD_R	fsubs
	#define	FADD_I	fadds

-	#define	FMAC_R1	fnmacs
+	#define	FMAC_R1	vmls.f32
	#define	FMAC_R2	fmacs
-	#define	FMAC_I1	fnmacs
-	#define	FMAC_I2	fnmacs
+	#define	FMAC_I1	vmls.f32
+	#define	FMAC_I2	vmls.f32

 #endif

@@ -873,6 +882,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	add	fp, sp, #24
	sub	sp, sp, #STACKSIZE		// reserve stack

+#if !defined(__ARM_PCS_VFP)
+	vmov	OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
+	vldr	OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
+	ldr	OLD_A, OLD_A_SOFTFP
+#endif
	str	OLD_M, M
	str	OLD_N, N
	str	OLD_K, K
diff --git a/kernel/arm/cgemv_n_vfp.S b/kernel/arm/cgemv_n_vfp.S
index 5d2748644..62ee33bb9 100644
--- a/kernel/arm/cgemv_n_vfp.S
+++ b/kernel/arm/cgemv_n_vfp.S
@@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #define STACKSIZE 256

-#define OLD_LDA		[fp, #0 ]
-#define X		[fp, #4 ]
-#define OLD_INC_X	[fp, #8 ]
-#define Y		[fp, #12 ]
-#define OLD_INC_Y	[fp, #16 ]
+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHAR	r3
+#define OLD_ALPHAI	[fp, #0 ]
+#define OLD_A_SOFTFP	[fp, #4 ]
+#define OLD_LDA		[fp, #8 ]
+#define X		[fp, #12 ]
+#define OLD_INC_X	[fp, #16 ]
+#define Y		[fp, #20 ]
+#define OLD_INC_Y	[fp, #24 ]
+#else
+#define OLD_LDA		[fp, #0 ]
+#define X		[fp, #4 ]
+#define OLD_INC_X	[fp, #8 ]
+#define Y		[fp, #12 ]
+#define OLD_INC_Y	[fp, #16 ]
+#endif
+
 #define OLD_A	r3
 #define OLD_M	r0
@@ -78,42 +90,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(CONJ) && !defined(XCONJ)

-	#define	KMAC_R	fnmacs
+	#define	KMAC_R	vmls.f32
	#define	KMAC_I	fmacs

	#define	FMAC_R1	fmacs
-	#define	FMAC_R2	fnmacs
+	#define	FMAC_R2	vmls.f32
	#define	FMAC_I1	fmacs
	#define	FMAC_I2	fmacs

 #elif defined(CONJ) && !defined(XCONJ)

	#define	KMAC_R	fmacs
-	#define	KMAC_I	fnmacs
+	#define	KMAC_I	vmls.f32

	#define	FMAC_R1	fmacs
-	#define	FMAC_R2	fnmacs
+	#define	FMAC_R2	vmls.f32
	#define	FMAC_I1	fmacs
	#define	FMAC_I2	fmacs

 #elif !defined(CONJ) && defined(XCONJ)

	#define	KMAC_R	fmacs
-	#define	KMAC_I	fnmacs
+	#define	KMAC_I	vmls.f32

	#define	FMAC_R1	fmacs
	#define	FMAC_R2	fmacs
-	#define	FMAC_I1	fnmacs
+	#define	FMAC_I1	vmls.f32
	#define	FMAC_I2	fmacs

 #else

-	#define	KMAC_R	fnmacs
+	#define	KMAC_R	vmls.f32
	#define	KMAC_I	fmacs

	#define	FMAC_R1	fmacs
	#define	FMAC_R2	fmacs
-	#define	FMAC_I1	fnmacs
+	#define	FMAC_I1	vmls.f32
	#define	FMAC_I2	fmacs

 #endif

@@ -462,6 +474,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	cmp	N, #0
	ble	cgemvn_kernel_L999

+#if !defined(__ARM_PCS_VFP)
+	vmov	s0, OLD_ALPHAR
+	vldr	s1, OLD_ALPHAI
+	ldr	OLD_A, OLD_A_SOFTFP
+#endif
+
	str	OLD_A, A
	str	OLD_M, M

	vstr	s0 , ALPHA_R
diff --git a/kernel/arm/cgemv_t_vfp.S b/kernel/arm/cgemv_t_vfp.S
index 76c8a8f18..c07b6d6f8 100644
--- a/kernel/arm/cgemv_t_vfp.S
+++ b/kernel/arm/cgemv_t_vfp.S
@@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #define STACKSIZE 256

-#define OLD_LDA		[fp, #0 ]
-#define X		[fp, #4 ]
-#define OLD_INC_X	[fp, #8 ]
-#define Y		[fp, #12 ]
-#define OLD_INC_Y	[fp, #16 ]
+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHAR	r3
+#define OLD_ALPHAI	[fp, #0 ]
+#define OLD_A_SOFTFP	[fp, #4 ]
+#define OLD_LDA		[fp, #8 ]
+#define X		[fp, #12 ]
+#define OLD_INC_X	[fp, #16 ]
+#define Y		[fp, #20 ]
+#define OLD_INC_Y	[fp, #24 ]
+#else
+#define OLD_LDA		[fp, #0 ]
+#define X		[fp, #4 ]
+#define OLD_INC_X	[fp, #8 ]
+#define Y		[fp, #12 ]
+#define OLD_INC_Y	[fp, #16 ]
+#endif
+
 #define OLD_A	r3
 #define OLD_N	r1
@@ -76,42 +88,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #if !defined(CONJ) && !defined(XCONJ)

-	#define	KMAC_R	fnmacs
+	#define	KMAC_R	vmls.f32
	#define	KMAC_I	fmacs

	#define	FMAC_R1	fmacs
-	#define	FMAC_R2	fnmacs
+	#define	FMAC_R2	vmls.f32
	#define	FMAC_I1	fmacs
	#define	FMAC_I2	fmacs

 #elif defined(CONJ) && !defined(XCONJ)

	#define	KMAC_R	fmacs
-	#define	KMAC_I	fnmacs
+	#define	KMAC_I	vmls.f32

	#define	FMAC_R1	fmacs
-	#define	FMAC_R2	fnmacs
+	#define	FMAC_R2	vmls.f32
	#define	FMAC_I1	fmacs
	#define	FMAC_I2	fmacs

 #elif !defined(CONJ) && defined(XCONJ)

	#define	KMAC_R	fmacs
-	#define	KMAC_I	fnmacs
+	#define	KMAC_I	vmls.f32

	#define	FMAC_R1	fmacs
	#define	FMAC_R2	fmacs
-	#define	FMAC_I1	fnmacs
+	#define	FMAC_I1	vmls.f32
	#define	FMAC_I2	fmacs

 #else

-	#define	KMAC_R	fnmacs
+	#define	KMAC_R	vmls.f32
	#define	KMAC_I	fmacs

	#define	FMAC_R1	fmacs
	#define	FMAC_R2	fmacs
-	#define	FMAC_I1	fnmacs
+	#define	FMAC_I1	vmls.f32
	#define	FMAC_I2	fmacs

 #endif

@@ -359,6 +371,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	cmp	OLD_N, #0
	ble	cgemvt_kernel_L999

+#if !defined(__ARM_PCS_VFP)
+	vmov	s0, OLD_ALPHAR
+	vldr	s1, OLD_ALPHAI
+	ldr	OLD_A, OLD_A_SOFTFP
+#endif
+
	str	OLD_A, A
	str	OLD_N, N
diff --git a/kernel/arm/ctrmm_kernel_2x2_vfp.S b/kernel/arm/ctrmm_kernel_2x2_vfp.S
index 8cb7ede9d..aae890ea9 100644
--- a/kernel/arm/ctrmm_kernel_2x2_vfp.S
+++ b/kernel/arm/ctrmm_kernel_2x2_vfp.S
@@ -67,10 +67,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ALPHA_I	[fp, #-272]
 #define ALPHA_R	[fp, #-280]

+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHAR_SOFTFP	r3
+#define OLD_ALPHAI_SOFTFP	[fp, #4]
+#define OLD_A_SOFTFP	[fp, #8 ]
+#define B	[fp, #12 ]
+#define C	[fp, #16 ]
+#define OLD_LDC	[fp, #20 ]
+#define OFFSET	[fp, #24 ]
+#else
 #define B	[fp, #4 ]
 #define C	[fp, #8 ]
 #define OLD_LDC	[fp, #12 ]
 #define OFFSET	[fp, #16 ]
+#endif

 #define I	r0
 #define J	r1
@@ -98,42 +108,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)

-	#define	KMAC_R	fnmacs
+	#define	KMAC_R	vmls.f32
	#define	KMAC_I	fmacs

	#define	FMAC_R1	fmacs
-	#define	FMAC_R2	fnmacs
+	#define	FMAC_R2	vmls.f32
	#define	FMAC_I1	fmacs
	#define	FMAC_I2	fmacs

 #elif defined(CN) || defined(CT)

	#define	KMAC_R	fmacs
-	#define	KMAC_I	fnmacs
+	#define	KMAC_I	vmls.f32

	#define	FMAC_R1	fmacs
-	#define	FMAC_R2	fnmacs
+	#define	FMAC_R2	vmls.f32
	#define	FMAC_I1	fmacs
	#define	FMAC_I2	fmacs

 #elif defined(NC) || defined(TC)

	#define	KMAC_R	fmacs
-	#define	KMAC_I	fnmacs
+	#define	KMAC_I	vmls.f32

	#define	FMAC_R1	fmacs
	#define	FMAC_R2	fmacs
-	#define	FMAC_I1	fnmacs
+	#define	FMAC_I1	vmls.f32
	#define	FMAC_I2	fmacs

 #else

-	#define	KMAC_R	fnmacs
+	#define	KMAC_R	vmls.f32
	#define	KMAC_I	fmacs

	#define	FMAC_R1	fmacs
	#define	FMAC_R2	fmacs
-	#define	FMAC_I1	fnmacs
+	#define	FMAC_I1	vmls.f32
	#define	FMAC_I2	fmacs

 #endif

@@ -826,6 +836,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	add	fp, sp, #24
	sub	sp, sp, #STACKSIZE		// reserve stack

+#if !defined(__ARM_PCS_VFP)
+	vmov	OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
+	vldr	OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
+	ldr	OLD_A, OLD_A_SOFTFP
+#endif
	str	OLD_M, M
	str	OLD_N, N
	str	OLD_K, K
diff --git a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S
index 97bd88c69..79e7ed07f 100644
--- a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S
+++ b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S
@@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ALPHA_I	[fp, #-272]
 #define ALPHA_R	[fp, #-280]

+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHAR_SOFTFP	r3
+#define OLD_ALPHAI_SOFTFP	[fp, #4]
+#define OLD_A_SOFTFP	[fp, #8 ]
+#define B	[fp, #12 ]
+#define C	[fp, #16 ]
+#define OLD_LDC	[fp, #20 ]
+#define OFFSET	[fp, #24 ]
+#else
 #define B	[fp, #4 ]
 #define C	[fp, #8 ]
 #define OLD_LDC	[fp, #12 ]
 #define OFFSET	[fp, #16 ]
+#endif

 #define I	r0
 #define J	r1
@@ -93,10 +103,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	#define	FADD_R	fsubs
	#define	FADD_I	fadds

-	#define	FMAC_R1	fnmuls
-	#define	FMAC_R2	fnmacs
+	#define	FMAC_R1	vnmul.f32
+	#define	FMAC_R2	vmls.f32
	#define	FMAC_I1	fmuls
-	#define	FMAC_I2	fnmacs
+	#define	FMAC_I2	vmls.f32

 #elif defined(CN) || defined(CT)

@@ -105,7 +115,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	#define	FMAC_R1	fmuls
	#define	FMAC_R2	fmacs
-	#define	FMAC_I1	fnmuls
+	#define	FMAC_I1	vnmul.f32
	#define	FMAC_I2	fmacs

 #elif defined(NC) || defined(TC)

@@ -114,7 +124,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	#define	FADD_I	fsubs

	#define	FMAC_R1	fmuls
-	#define	FMAC_R2	fnmacs
+	#define	FMAC_R2	vmls.f32
	#define	FMAC_I1	fmuls
	#define	FMAC_I2	fmacs

@@ -123,10 +133,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	#define	FADD_R	fsubs
	#define	FADD_I	fadds

-	#define	FMAC_R1	fnmuls
+	#define	FMAC_R1	vnmul.f32
	#define	FMAC_R2	fmacs
-	#define	FMAC_I1	fnmuls
-	#define	FMAC_I2	fnmacs
+	#define	FMAC_I1	vnmul.f32
+	#define	FMAC_I2	vmls.f32

 #endif

@@ -846,6 +856,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	add	fp, sp, #24
	sub	sp, sp, #STACKSIZE		// reserve stack

+#if !defined(__ARM_PCS_VFP)
+	vmov	OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
+	vldr	OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
+	ldr	OLD_A, OLD_A_SOFTFP
+#endif
	str	OLD_M, M
	str	OLD_N, N
	str	OLD_K, K
diff --git a/kernel/arm/ddot_vfp.S b/kernel/arm/ddot_vfp.S
index f28acbae3..fb294d8b4 100644
--- a/kernel/arm/ddot_vfp.S
+++ b/kernel/arm/ddot_vfp.S
@@ -246,6 +246,9 @@ ddot_kernel_L999:
	vldm	r3, { d8 - d15}			// restore floating point registers

	vadd.f64  d0 , d0, d1			// set return value
+#if !defined(__ARM_PCS_VFP)
+	vmov	r0, r1, d0
+#endif
	sub	sp, fp, #24
	pop	{r4 - r9, fp}
	bx	lr
diff --git a/kernel/arm/dgemm_kernel_4x2_vfp.S b/kernel/arm/dgemm_kernel_4x2_vfp.S
index 183269d1b..001a6050c 100644
--- a/kernel/arm/dgemm_kernel_4x2_vfp.S
+++ b/kernel/arm/dgemm_kernel_4x2_vfp.S
@@ -62,10 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #define ALPHA	[fp, #-280]

-
+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHA_SOFTFP	[fp, #4]
+#define OLD_A_SOFTFP	[fp, #12 ]
+#define B	[fp, #16 ]
+#define C	[fp, #20 ]
+#define OLD_LDC	[fp, #24 ]
+#else
 #define B	[fp, #4 ]
 #define C	[fp, #8 ]
 #define OLD_LDC	[fp, #12 ]
+#endif

 #define I	r0
 #define J	r1
@@ -429,6 +436,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	add	fp, sp, #24
	sub	sp, sp, #STACKSIZE		// reserve stack

+#if !defined(__ARM_PCS_VFP)
+	vldr	OLD_ALPHA, OLD_ALPHA_SOFTFP
+	ldr	OLD_A, OLD_A_SOFTFP
+#endif
	str	OLD_M, M
	str	OLD_N, N
	str	OLD_K, K
diff --git a/kernel/arm/dgemm_kernel_4x4_vfpv3.S b/kernel/arm/dgemm_kernel_4x4_vfpv3.S
index b14052e06..1744b54d8 100644
--- a/kernel/arm/dgemm_kernel_4x4_vfpv3.S
+++ b/kernel/arm/dgemm_kernel_4x4_vfpv3.S
@@ -79,9 +79,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #define ALPHA	[fp, #-280]

+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHA_SOFTFP	[fp, #4]
+#define OLD_A_SOFTFP	[fp, #12 ]
+#define B	[fp, #16 ]
+#define C	[fp, #20 ]
+#define OLD_LDC	[fp, #24 ]
+#else
 #define B	[fp, #4 ]
 #define C	[fp, #8 ]
 #define OLD_LDC	[fp, #12 ]
+#endif

 #define I	r0
 #define J	r1
@@ -878,6 +886,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	add	fp, sp, #24
	sub	sp, sp, #STACKSIZE		// reserve stack

+#if !defined(__ARM_PCS_VFP)
+	vldr	OLD_ALPHA, OLD_ALPHA_SOFTFP
+	ldr	OLD_A, OLD_A_SOFTFP
+#endif
	str	OLD_M, M
	str	OLD_N, N
	str	OLD_K, K
diff --git a/kernel/arm/dtrmm_kernel_4x2_vfp.S b/kernel/arm/dtrmm_kernel_4x2_vfp.S
index c578d2b1e..3d6fbf8e9 100644
--- a/kernel/arm/dtrmm_kernel_4x2_vfp.S
+++ b/kernel/arm/dtrmm_kernel_4x2_vfp.S
@@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #define ALPHA	[fp, #-276 ]

+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHA_SOFTFP	[fp, #4]
+#define OLD_A_SOFTFP	[fp, #12 ]
+#define B	[fp, #16 ]
+#define OLD_C	[fp, #20 ]
+#define OLD_LDC	[fp, #24 ]
+#define OFFSET	[fp, #28 ]
+#else
 #define B	[fp, #4 ]
 #define OLD_C	[fp, #8 ]
 #define OLD_LDC	[fp, #12 ]
 #define OFFSET	[fp, #16 ]
+#endif

 #define I	r0
 #define J	r1
@@ -404,6 +413,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	add	fp, sp, #24
	sub	sp, sp, #STACKSIZE		// reserve stack

+#if !defined(__ARM_PCS_VFP)
+	vldr	OLD_ALPHA, OLD_ALPHA_SOFTFP
+	ldr	OLD_A, OLD_A_SOFTFP
+#endif
	str	OLD_M, M
	str	OLD_N, N
	str	OLD_K, K
diff --git a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S
index c7e455f16..c0c6a1677 100644
--- a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S
+++ b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S
@@ -66,10 +66,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #define ALPHA	[fp, #-276 ]

+#if !defined(__ARM_PCS_VFP)
+#define OLD_ALPHA_SOFTFP	[fp, #4]
+#define OLD_A_SOFTFP	[fp, #12 ]
+#define B	[fp, #16 ]
+#define OLD_C	[fp, #20 ]
+#define OLD_LDC	[fp, #24 ]
+#define OFFSET	[fp, #28 ]
+#else
 #define B	[fp, #4 ]
 #define OLD_C	[fp, #8 ]
 #define OLD_LDC	[fp, #12 ]
 #define OFFSET	[fp, #16 ]
+#endif

 #define I	r0
 #define J	r1
@@ -846,6 +855,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	add	fp, sp, #24
	sub	sp, sp, #STACKSIZE		// reserve stack

+#if !defined(__ARM_PCS_VFP)
+	vldr	OLD_ALPHA, OLD_ALPHA_SOFTFP
+	ldr	OLD_A, OLD_A_SOFTFP
+#endif
	str	OLD_M, M
	str	OLD_N, N
	str	OLD_K, K
diff --git a/kernel/arm/gemv_n_vfp.S b/kernel/arm/gemv_n_vfp.S
index 385370b7f..7c154d741 100644
--- a/kernel/arm/gemv_n_vfp.S
+++ b/kernel/arm/gemv_n_vfp.S
@@ -38,11 +38,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #define STACKSIZE 256

-#define OLD_LDA		[fp, #0 ]
-#define X		[fp, #4 ]
-#define OLD_INC_X	[fp, #8 ]
-#define Y		[fp, #12 ]
-#define OLD_INC_Y	[fp, #16 ]
+#if !defined(__ARM_PCS_VFP)
+
+#if !defined(DOUBLE)
+#define OLD_ALPHA	r3
+#define OLD_A_SOFTFP	[fp, #0 ]
+#define OLD_LDA		[fp, #4 ]
+#define X		[fp, #8 ]
+#define OLD_INC_X	[fp, #12 ]
+#define Y		[fp, #16 ]
+#define OLD_INC_Y	[fp, #20 ]
+#else
+#define OLD_ALPHA	[fp, #0 ]
+#define OLD_A_SOFTFP	[fp, #8 ]
+#define OLD_LDA		[fp, #12]
+#define X		[fp, #16]
+#define OLD_INC_X	[fp, #20]
+#define Y		[fp, #24]
+#define OLD_INC_Y	[fp, #28]
+#endif
+
+#else
+
+#define OLD_LDA		[fp, #0 ]
+#define X		[fp, #4 ]
+#define OLD_INC_X	[fp, #8 ]
+#define Y		[fp, #12 ]
+#define OLD_INC_Y	[fp, #16 ]
+
+#endif
+
 #define OLD_A	r3
 #define OLD_M	r0
@@ -508,6 +533,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	cmp	N, #0
	ble	gemvn_kernel_L999

+#if !defined(__ARM_PCS_VFP)
+#if !defined(DOUBLE)
+	vmov	s0, OLD_ALPHA
+#else
+	vldr	d0, OLD_ALPHA
+#endif
+	ldr	OLD_A, OLD_A_SOFTFP
+#endif
+
	str	OLD_A, A
	str	OLD_M, M
diff --git a/kernel/arm/gemv_n_vfpv3.S b/kernel/arm/gemv_n_vfpv3.S
index 93bf23e49..54f958b7b 100644
--- a/kernel/arm/gemv_n_vfpv3.S
+++ b/kernel/arm/gemv_n_vfpv3.S
@@ -38,25 +38,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define STACKSIZE 256

-#ifndef ARM_SOFTFP_ABI
-//hard
-#define OLD_LDA		[fp, #0 ]
-#define X		[fp, #4 ]
-#define OLD_INC_X	[fp, #8 ]
-#define Y		[fp, #12 ]
-#define OLD_INC_Y	[fp, #16 ]
-#define OLD_A		r3
-#else
-#define OLD_A_SOFTFP	[fp, #0 ]
-#define OLD_LDA		[fp, #4 ]
-#define X		[fp, #8 ]
-#define OLD_INC_X	[fp, #12 ]
-#define Y		[fp, #16 ]
-#define OLD_INC_Y	[fp, #20 ]
+#if !defined(__ARM_PCS_VFP)
+
+#if !defined(DOUBLE)
 #define OLD_ALPHA	r3
-#define OLD_A		r3
+#define OLD_A_SOFTFP	[fp, #0 ]
+#define OLD_LDA		[fp, #4 ]
+#define X		[fp, #8 ]
+#define OLD_INC_X	[fp, #12 ]
+#define Y		[fp, #16 ]
+#define OLD_INC_Y	[fp, #20 ]
+#else
+#define OLD_ALPHA	[fp, #0 ]
+#define OLD_A_SOFTFP	[fp, #8 ]
+#define OLD_LDA		[fp, #12]
+#define X		[fp, #16]
+#define OLD_INC_X	[fp, #20]
+#define Y		[fp, #24]
+#define OLD_INC_Y	[fp, #28]
+#endif
+
+#else
+
+#define OLD_LDA		[fp, #0 ]
+#define X		[fp, #4 ]
+#define OLD_INC_X	[fp, #8 ]
+#define Y		[fp, #12 ]
+#define OLD_INC_Y	[fp, #16 ]
+
 #endif

+#define OLD_A	r3
 #define OLD_M	r0

 #define AO1	r0
@@ -565,18 +577,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	cmp	N, #0
	ble	gemvn_kernel_L999

-#ifndef DOUBLE
-#ifdef ARM_SOFTFP_ABI
-
-	vmov	s0, OLD_ALPHA
-	ldr	OLD_A, OLD_A_SOFTFP
+#if !defined(__ARM_PCS_VFP)
+#if !defined(DOUBLE)
+	vmov	s0, OLD_ALPHA
+#else
+	vldr	d0, OLD_ALPHA
 #endif
+	ldr	OLD_A, OLD_A_SOFTFP
 #endif

	str	OLD_A, A
	str	OLD_M, M
-
-
+
	ldr	INC_X , OLD_INC_X
	ldr	INC_Y , OLD_INC_Y
diff --git a/kernel/arm/gemv_t_vfp.S b/kernel/arm/gemv_t_vfp.S
index 816be54ff..9559d1829 100644
--- a/kernel/arm/gemv_t_vfp.S
+++ b/kernel/arm/gemv_t_vfp.S
@@ -38,25 +38,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #define STACKSIZE 256

-#ifndef ARM_SOFTFP_ABI
-//hard abi
-#define OLD_LDA		[fp, #0 ]
-#define X		[fp, #4 ]
-#define OLD_INC_X	[fp, #8 ]
-#define Y		[fp, #12 ]
-#define OLD_INC_Y	[fp, #16 ]
-#define OLD_A		r3
-#else
-#define OLD_A_SOFTFP	[fp, #0 ]
-#define OLD_LDA		[fp, #4 ]
-#define X		[fp, #8 ]
-#define OLD_INC_X	[fp, #12 ]
-#define Y		[fp, #16 ]
-#define OLD_INC_Y	[fp, #20 ]
+#if !defined(__ARM_PCS_VFP)
+
+#if !defined(DOUBLE)
 #define OLD_ALPHA	r3
-#define OLD_A		r3
+#define OLD_A_SOFTFP	[fp, #0 ]
+#define OLD_LDA		[fp, #4 ]
+#define X		[fp, #8 ]
+#define OLD_INC_X	[fp, #12 ]
+#define Y		[fp, #16 ]
+#define OLD_INC_Y	[fp, #20 ]
+#else
+#define OLD_ALPHA	[fp, #0 ]
+#define OLD_A_SOFTFP	[fp, #8 ]
+#define OLD_LDA		[fp, #12]
+#define X		[fp, #16]
+#define OLD_INC_X	[fp, #20]
+#define Y		[fp, #24]
+#define OLD_INC_Y	[fp, #28]
 #endif

+#else
+
+#define OLD_LDA		[fp, #0 ]
+#define X		[fp, #4 ]
+#define OLD_INC_X	[fp, #8 ]
+#define Y		[fp, #12 ]
+#define OLD_INC_Y	[fp, #16 ]
+
+#endif
+
+#define OLD_A	r3
 #define OLD_N	r1
 #define M	r0
@@ -518,11 +530,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	cmp	OLD_N, #0
	ble	gemvt_kernel_L999

-#ifndef DOUBLE
-#ifdef ARM_SOFTFP_ABI
-	vmov	s0, OLD_ALPHA
-	ldr	OLD_A, OLD_A_SOFTFP
+#if !defined(__ARM_PCS_VFP)
+#if !defined(DOUBLE)
+	vmov	s0, OLD_ALPHA
+#else
+	vldr	d0, OLD_ALPHA
 #endif
+	ldr	OLD_A, OLD_A_SOFTFP
 #endif

	str	OLD_A, A
diff --git a/kernel/arm/gemv_t_vfpv3.S b/kernel/arm/gemv_t_vfpv3.S
index 7ae5799bc..b1d3dadf1 100644
--- a/kernel/arm/gemv_t_vfpv3.S
+++ b/kernel/arm/gemv_t_vfpv3.S
@@ -38,11 +38,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #define STACKSIZE 256

-#define OLD_LDA		[fp, #0 ]
-#define X		[fp, #4 ]
-#define OLD_INC_X	[fp, #8 ]
-#define Y		[fp, #12 ]
-#define OLD_INC_Y	[fp, #16 ]
+#if !defined(__ARM_PCS_VFP)
+
+#if !defined(DOUBLE)
+#define OLD_ALPHA	r3
+#define OLD_A_SOFTFP	[fp, #0 ]
+#define OLD_LDA		[fp, #4 ]
+#define X		[fp, #8 ]
+#define OLD_INC_X	[fp, #12 ]
+#define Y		[fp, #16 ]
+#define OLD_INC_Y	[fp, #20 ]
+#else
+#define OLD_ALPHA	[fp, #0 ]
+#define OLD_A_SOFTFP	[fp, #8 ]
+#define OLD_LDA		[fp, #12]
+#define X		[fp, #16]
+#define OLD_INC_X	[fp, #20]
+#define Y		[fp, #24]
+#define OLD_INC_Y	[fp, #28]
+#endif
+
+#else
+
+#define OLD_LDA		[fp, #0 ]
+#define X		[fp, #4 ]
+#define OLD_INC_X	[fp, #8 ]
+#define Y		[fp, #12 ]
+#define OLD_INC_Y	[fp, #16 ]
+
+#endif
+
 #define OLD_A	r3
 #define OLD_N	r1
@@ -476,6 +501,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	cmp	OLD_N, #0
	ble	gemvt_kernel_L999

+#if !defined(__ARM_PCS_VFP)
+#if !defined(DOUBLE)
+	vmov	s0, OLD_ALPHA
+#else
+	vldr	d0, OLD_ALPHA
+#endif
+	ldr	OLD_A, OLD_A_SOFTFP
+#endif
+
	str	OLD_A, A
	str	OLD_N, N
diff --git a/kernel/arm/nrm2_vfp.S b/kernel/arm/nrm2_vfp.S
index b3bd28152..16ac5a632 100644
--- a/kernel/arm/nrm2_vfp.S
+++ b/kernel/arm/nrm2_vfp.S
@@ -574,6 +574,13 @@ nrm2_kernel_L999:
	vsqrt.f32 s1, s1
	vmul.f32 s0, s0, s1
 #endif
+#if !defined(__ARM_PCS_VFP)
+#if !defined(DOUBLE)
+	vmov	r0, s0
+#else
+	vmov	r0, r1, d0
+#endif
+#endif

	bx	lr
diff --git a/kernel/arm/nrm2_vfpv3.S b/kernel/arm/nrm2_vfpv3.S
index 7af966895..84977901d 100644
--- a/kernel/arm/nrm2_vfpv3.S
+++ b/kernel/arm/nrm2_vfpv3.S
@@ -503,8 +503,13 @@ nrm2_kernel_L999:
 #else
	vsqrt.f32 s1, s1
	vmul.f32 s0, s0, s1
-#ifdef ARM_SOFTFP_ABI
-	vmov	r0, s0
+#endif
+
+#if !defined(__ARM_PCS_VFP)
+#if defined(DOUBLE)
+	vmov	r0, r1, d0
+#else
+	vmov	r0, s0
 #endif
 #endif
diff --git a/kernel/arm/rot_vfp.S b/kernel/arm/rot_vfp.S
index d053423b6..25f563690 100644
--- a/kernel/arm/rot_vfp.S
+++ b/kernel/arm/rot_vfp.S
@@ -40,6 +40,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #define OLD_INC_Y	[fp, #0 ]

+#if !defined(__ARM_PCS_VFP)
+#if !defined(DOUBLE)
+#define OLD_C	[fp, #4]
+#define OLD_S	[fp, #8]
+#else
+#define OLD_C	[fp, #8]
+#define OLD_S	[fp, #16]
+#endif
+#endif

 #define N	r0
 #define X	r1
@@ -73,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	vmul.f64	d2 , d0, d4
	fmacd		d2 , d1, d5
	vmul.f64	d3 , d0, d5
-	fnmacd		d3 , d1, d4
+	vmls.f64	d3 , d1, d4
	fstmiad	X!, { d2 }
	fstmiad	Y!, { d3 }
@@ -82,7 +91,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	vmul.f64	d2 , d0, d4
	fmacd		d2 , d1, d5
	vmul.f64	d3 , d0, d5
-	fnmacd		d3 , d1, d4
+	vmls.f64	d3 , d1, d4
	fstmiad	X!, { d2 }
	fstmiad	Y!, { d3 }
@@ -91,7 +100,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	vmul.f64	d2 , d0, d4
	fmacd		d2 , d1, d5
	vmul.f64	d3 , d0, d5
-	fnmacd		d3 , d1, d4
+	vmls.f64	d3 , d1, d4
	fstmiad	X!, { d2 }
	fstmiad	Y!, { d3 }
@@ -100,7 +109,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	vmul.f64	d2 , d0, d4
	fmacd		d2 , d1, d5
	vmul.f64	d3 , d0, d5
-	fnmacd		d3 , d1, d4
+	vmls.f64	d3 , d1, d4
	fstmiad	X!, { d2 }
	fstmiad	Y!, { d3 }
@@ -114,7 +123,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	vmul.f64	d2 , d0, d4
	fmacd		d2 , d1, d5
	vmul.f64	d3 , d0, d5
-	fnmacd		d3 , d1, d4
+	vmls.f64	d3 , d1, d4
	fstmiad	X!, { d2 }
	fstmiad	Y!, { d3 }
@@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	vmul.f64	d2 , d0, d4
	fmacd		d2 , d1, d5
	vmul.f64	d3 , d0, d5
-	fnmacd		d3 , d1, d4
+	vmls.f64	d3 , d1, d4
	fstmiad	X, { d2 }
	fstmiad	Y, { d3 }
@@ -145,7 +154,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	vmul.f32	s2 , s0, s4
	fmacs		s2 , s1, s5
	vmul.f32	s3 , s0, s5
-	fnmacs		s3 , s1, s4
+	vmls.f32	s3 , s1, s4
	fstmias	X!, { s2 }
	fstmias	Y!, { s3 }
@@ -154,7 +163,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	vmul.f32	s2 , s0, s4
	fmacs		s2 , s1, s5
	vmul.f32	s3 , s0, s5
-	fnmacs		s3 , s1, s4
+	vmls.f32	s3 , s1, s4
	fstmias	X!, { s2 }
	fstmias	Y!, { s3 }
@@ -163,7 +172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	vmul.f32	s2 , s0, s4
	fmacs		s2 , s1, s5
	vmul.f32	s3 , s0, s5
-	fnmacs		s3 , s1, s4
+	vmls.f32	s3 , s1, s4
	fstmias	X!, { s2 }
	fstmias	Y!, { s3 }
@@ -172,7 +181,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	vmul.f32	s2 , s0, s4
	fmacs		s2 , s1, s5
	vmul.f32	s3 , s0, s5
-	fnmacs		s3 , s1, s4
+	vmls.f32	s3 , s1, s4
	fstmias	X!, { s2 }
	fstmias	Y!, { s3 }
@@ -186,7 +195,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	vmul.f32	s2 , s0, s4
	fmacs		s2 , s1, s5
	vmul.f32	s3 , s0, s5
-	fnmacs		s3 , s1, s4
+	vmls.f32	s3 , s1, s4
	fstmias	X!, { s2 }
	fstmias	Y!, { s3 }
@@ -199,7 +208,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	vmul.f32	s2 , s0, s4
	fmacs		s2 , s1, s5
	vmul.f32	s3 , s0, s5
-	fnmacs		s3 , s1, s4
+	vmls.f32	s3 , s1, s4
	fstmias	X, { s2 }
	fstmias	Y, { s3 }
@@ -226,13 +235,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	vmul.f64	d2 , d0, d4
	fmacd		d2 , d1, d6
	vmul.f64	d3 , d0, d6
-	fnmacd		d3 , d1, d4
+	vmls.f64	d3 , d1, d4
	fstmiad	X!, { d2 }
	fstmiad	Y!, { d3 }
	vmul.f64	d2 , d0, d5
	fmacd		d2 , d1, d7
	vmul.f64	d3 , d0, d7
-	fnmacd		d3 , d1, d5
+	vmls.f64	d3 , d1, d5
	fstmiad	X!, { d2 }
	fstmiad	Y!, { d3 }
@@ -241,13 +250,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	vmul.f64	d2 , d0, d4
	fmacd		d2 , d1, d6
	vmul.f64	d3 , d0, d6
-	fnmacd		d3 , d1, d4
+	vmls.f64	d3 , d1, d4
	fstmiad	X!, { d2 }
	fstmiad	Y!, { d3 }
	vmul.f64	d2 , d0, d5
	fmacd		d2 , d1, d7
	vmul.f64	d3 , d0, d7
-	fnmacd		d3 , d1, d5
+	vmls.f64	d3 , d1, d5
	fstmiad	X!, { d2 }
	fstmiad	Y!, { d3 }
@@ -259,13 +268,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	vmul.f64	d2 , d0, d4
	fmacd		d2 , d1, d6
	vmul.f64	d3 , d0, d6
-	fnmacd		d3 , d1, d4
+	vmls.f64	d3 , d1, d4
	fstmiad	X!, { d2 }
	fstmiad	Y!, { d3 }
	vmul.f64	d2 , d0, d5
	fmacd		d2 , d1, d7
	vmul.f64	d3 , d0, d7
-	fnmacd		d3 , d1, d5
+	vmls.f64	d3 , d1, d5
	fstmiad	X!, { d2 }
	fstmiad	Y!, { d3 }
@@ -274,13 +283,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	vmul.f64	d2 , d0, d4
	fmacd		d2 , d1, d6
	vmul.f64	d3 , d0, d6
-	fnmacd		d3 , d1, d4
+	vmls.f64	d3 , d1, d4
	fstmiad	X!, { d2 }
	fstmiad	Y!, { d3 }
	vmul.f64	d2 , d0, d5
	fmacd		d2 , d1, d7
	vmul.f64	d3 , d0, d7
-	fnmacd		d3 , d1, d5
+	vmls.f64	d3 , d1, d5
	fstmiad	X!, { d2 }
	fstmiad	Y!, { d3 }
@@ -294,13 +303,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	vmul.f64	d2 , d0, d4
	fmacd		d2 , d1, d6
	vmul.f64	d3 , d0, d6
-	fnmacd		d3 , d1, d4
+	vmls.f64	d3 , d1, d4
	fstmiad	X!, { d2 }
	fstmiad	Y!, { d3 }
	vmul.f64	d2 , d0, d5
	fmacd		d2 , d1, d7
	vmul.f64	d3 , d0, d7
-	fnmacd		d3 , d1, d5
+	vmls.f64	d3 , d1, d5
	fstmiad	X!, { d2 }
	fstmiad	Y!, { d3 }
@@ -314,13 +323,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	vmul.f64	d2 , d0, d4
	fmacd		d2 , d1, d6
	vmul.f64	d3 , d0, d6
-	fnmacd		d3 , d1, d4
+	vmls.f64	d3 , d1, d4
	vstr	d2 , [ X, #0 ]
	vstr	d3 , [ Y, #0 ]
	vmul.f64	d2 , d0, d5
	fmacd		d2 , d1, d7
	vmul.f64	d3 , d0, d7
-	fnmacd		d3 , d1, d5
+	vmls.f64	d3 , d1, d5
	vstr	d2 , [ X, #8 ]
	vstr	d3 , [ Y, #8 ]
@@ -343,13 +352,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	vmul.f32	s2 , s0, s4
	fmacs		s2 , s1, s6
	vmul.f32	s3 , s0, s6
-	fnmacs		s3 , s1, s4
+	vmls.f32	s3 , s1, s4
	fstmias	X!, { s2 }
	fstmias	Y!, { s3 }
	vmul.f32	s2 , s0, s5
	fmacs		s2 , s1, s7
	vmul.f32	s3 , s0, s7
-	fnmacs		s3 , s1, s5
+	vmls.f32	s3 , s1, s5
	fstmias	X!, { s2 }
	fstmias	Y!, { s3 }
@@ -358,13 +367,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	vmul.f32	s2 , s0, s4
	fmacs		s2 , s1, s6
	vmul.f32	s3 , s0, s6
-	fnmacs		s3 , s1, s4
+	vmls.f32	s3 , s1, s4
	fstmias	X!, { s2 }
	fstmias	Y!, { s3 }
	vmul.f32	s2 , s0, s5
	fmacs		s2 , s1, s7
	vmul.f32	s3 , s0, s7
-	fnmacs		s3 , s1, s5
+	vmls.f32	s3 , s1, s5
	fstmias	X!, { s2 }
	fstmias	Y!, { s3 }
@@ -376,13 +385,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	vmul.f32	s2 , s0, s4
	fmacs		s2 , s1, s6
	vmul.f32	s3 , s0, s6
-	fnmacs		s3 , s1, s4
+	vmls.f32	s3 , s1, s4
	fstmias	X!, { s2 }
	fstmias	Y!, { s3 }
	vmul.f32	s2 , s0, s5
	fmacs		s2 , s1, s7
	vmul.f32	s3 , s0, s7
-	fnmacs		s3 , s1, s5
+	vmls.f32	s3 , s1, s5
	fstmias	X!, { s2 }
	fstmias	Y!, { s3 }
@@ -391,13 +400,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	vmul.f32	s2 , s0, s4
	fmacs		s2 , s1, s6
	vmul.f32	s3 , s0, s6
-	fnmacs		s3 , s1, s4
+	vmls.f32	s3 , s1, s4
	fstmias	X!, { s2 }
	fstmias	Y!, { s3 }
	vmul.f32	s2 , s0, s5
	fmacs		s2 , s1, s7
	vmul.f32	s3 , s0, s7
-	fnmacs		s3 , s1, s5
+	vmls.f32	s3 , s1, s5
	fstmias	X!, { s2 }
	fstmias	Y!, { s3 }
@@ -411,13 +420,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	vmul.f32	s2 , s0, s4
	fmacs		s2 , s1, s6
	vmul.f32	s3 , s0, s6
-	fnmacs		s3 , s1, s4
+	vmls.f32	s3 , s1, s4
	fstmias	X!, { s2 }
	fstmias	Y!, { s3 }
	vmul.f32	s2 , s0, s5
	fmacs		s2 , s1, s7
	vmul.f32	s3 , s0, s7
-	fnmacs		s3 , s1, s5
+	vmls.f32	s3 , s1, s5
	fstmias	X!, { s2 }
	fstmias	Y!, { s3 }
@@ -431,13 +440,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	vmul.f32	s2 , s0, s4
	fmacs		s2 , s1, s6
	vmul.f32	s3 , s0, s6
-	fnmacs		s3 , s1, s4
+	vmls.f32	s3 , s1, s4
	vstr	s2 , [ X, #0 ]
	vstr	s3 , [ Y, #0 ]
	vmul.f32	s2 , s0, s5
	fmacs		s2 , s1, s7
	vmul.f32	s3 , s0, s7
-	fnmacs		s3 , s1, s5
+	vmls.f32	s3 , s1, s5
	vstr	s2 , [ X, #4 ]
	vstr	s3 , [ Y, #4 ]
@@ -462,7 +471,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	add	fp, sp, #8

	ldr	INC_Y , OLD_INC_Y
-
+#if !defined(__ARM_PCS_VFP)
+#if !defined(DOUBLE)
+	vldr	s0, OLD_C
+	vldr	s1, OLD_S
+#else
+	vldr	d0, OLD_C
+	vldr	d1, OLD_S
+#endif
+#endif
	cmp	N, #0
	ble	rot_kernel_L999
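The rot kernel hunks above all rewrite the same four-instruction plane-rotation step. Its reference semantics in C, as a sketch: x'_i = c*x_i + s*y_i and y'_i = c*y_i - s*x_i, where the vmls (formerly fnmacd/fnmacs) supplies the "- s*x_i" term:

    /* Givens rotation applied to two vectors; unit stride for brevity. */
    void drot_ref(long n, double *x, double *y, double c, double s) {
      for (long i = 0; i < n; i++) {
        double xt = c * x[i] + s * y[i];  /* vmul + fmacd (vmla.f64) */
        double yt = c * y[i] - s * x[i];  /* vmul + vmls.f64          */
        x[i] = xt;
        y[i] = yt;
      }
    }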
vmul.f64 d2 , d0, d4 fmacd d2 , d1, d6 vmul.f64 d3 , d0, d6 - fnmacd d3 , d1, d4 + vmls.f64 d3 , d1, d4 vstr d2 , [ X, #0 ] vstr d3 , [ Y, #0 ] vmul.f64 d2 , d0, d5 fmacd d2 , d1, d7 vmul.f64 d3 , d0, d7 - fnmacd d3 , d1, d5 + vmls.f64 d3 , d1, d5 vstr d2 , [ X, #8 ] vstr d3 , [ Y, #8 ] @@ -343,13 +352,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 + vmls.f32 s3 , s1, s5 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -358,13 +367,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 + vmls.f32 s3 , s1, s5 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -376,13 +385,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 + vmls.f32 s3 , s1, s5 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -391,13 +400,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 + vmls.f32 s3 , s1, s5 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -411,13 +420,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 fstmias X!, { s2 } fstmias Y!, { s3 } vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 + vmls.f32 s3 , s1, s5 fstmias X!, { s2 } fstmias Y!, { s3 } @@ -431,13 +440,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmul.f32 s2 , s0, s4 fmacs s2 , s1, s6 vmul.f32 s3 , s0, s6 - fnmacs s3 , s1, s4 + vmls.f32 s3 , s1, s4 vstr s2 , [ X, #0 ] vstr s3 , [ Y, #0 ] vmul.f32 s2 , s0, s5 fmacs s2 , s1, s7 vmul.f32 s3 , s0, s7 - fnmacs s3 , s1, s5 + vmls.f32 s3 , s1, s5 vstr s2 , [ X, #4 ] vstr s3 , [ Y, #4 ] @@ -462,7 +471,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #8 ldr INC_Y , OLD_INC_Y - +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vldr s0, OLD_C + vldr s1, OLD_S +#else + vldr d0, OLD_C + vldr d1, OLD_S +#endif +#endif cmp N, #0 ble rot_kernel_L999 diff --git a/kernel/arm/scal_vfp.S b/kernel/arm/scal_vfp.S index a8939c3a2..cc3e3b98d 100644 --- a/kernel/arm/scal_vfp.S +++ b/kernel/arm/scal_vfp.S @@ -138,14 +138,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmiad X, { d4 - d5 } vmul.f64 d2, d0, d4 - fnmacd d2, d1, d5 + vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 fstmiad X!, { d2 - d3 } fldmiad X, { d4 - d5 } vmul.f64 d2, d0, d4 - fnmacd d2, d1, d5 + vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 fstmiad X!, { d2 - d3 } @@ -154,14 +154,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fldmiad X, { d4 - d5 } vmul.f64 d2, d0, d4 - fnmacd d2, d1, d5 + vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 fstmiad X!, { d2 - d3 } fldmiad X, { d4 - d5 } vmul.f64 d2, d0, d4 - fnmacd d2, d1, d5 + vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 fstmiad X!, { d2 - d3 } @@ -173,7 +173,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmiad X, { d4 - d5 } vmul.f64 d2, d0, d4 - fnmacd d2, d1, d5 + vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 fstmiad X!, { d2 - d3 } @@ -184,7 +184,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmiad X, { d4 - d5 } vmul.f64 d2, d0, d4 - fnmacd d2, d1, d5 + vmls.f64 d2, d1, d5 vmul.f64 d3, d0, d5 fmacd d3, d1, d4 fstmiad X, { d2 - d3 } @@ -201,28 +201,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmias X, { s4 - s5 } vmul.f32 s2, s0, s4 - fnmacs s2, s1, s5 + vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 fstmias X!, { s2 - s3 } fldmias X, { s4 - s5 } vmul.f32 s2, s0, s4 - fnmacs s2, s1, s5 + vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 fstmias X!, { s2 - s3 } fldmias X, { s4 - s5 } vmul.f32 s2, s0, s4 - fnmacs s2, s1, s5 + vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 fstmias X!, { s2 - s3 } fldmias X, { s4 - s5 } vmul.f32 s2, s0, s4 - fnmacs s2, s1, s5 + vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 fstmias X!, { s2 - s3 } @@ -234,7 +234,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmias X, { s4 - s5 } vmul.f32 s2, s0, s4 - fnmacs s2, s1, s5 + vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 fstmias X!, { s2 - s3 } @@ -245,7 +245,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldmias X, { s4 - s5 } vmul.f32 s2, s0, s4 - fnmacs s2, s1, s5 + vmls.f32 s2, s1, s5 vmul.f32 s3, s0, s5 fmacs s3, s1, s4 fstmias X, { s2 - s3 } diff --git a/kernel/arm/sdot_vfp.S b/kernel/arm/sdot_vfp.S index f3abdc197..5f4f424bf 100644 --- a/kernel/arm/sdot_vfp.S +++ b/kernel/arm/sdot_vfp.S @@ -329,20 +329,19 @@ sdot_kernel_L999: vldm r3, { s8 - s15} // restore floating point registers #if defined(DSDOT) - vadd.f64 d0 , d0, d1 // set return value - -#ifdef ARM_SOFTFP_ABI - vmov r0, r1, d0 +#else + vadd.f32 s0 , s0, s1 // set return value #endif +#if !defined(__ARM_PCS_VFP) +#if defined(DSDOT) + vmov r0, r1, d0 #else - - vadd.f32 s0 , s0, s1 // set return value -#ifdef ARM_SOFTFP_ABI vmov r0, s0 #endif #endif + sub sp, fp, #24 pop {r4 - r9, fp} bx lr diff --git a/kernel/arm/sgemm_kernel_4x2_vfp.S b/kernel/arm/sgemm_kernel_4x2_vfp.S index e8b44b742..1f21e5a1f 100644 --- a/kernel/arm/sgemm_kernel_4x2_vfp.S +++ b/kernel/arm/sgemm_kernel_4x2_vfp.S @@ -62,9 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP r3 +#define OLD_A_SOFTFP [fp, #4 ] +#define B [fp, #8 ] +#define C [fp, #12 ] +#define OLD_LDC [fp, #16 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -416,6 +424,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/sgemm_kernel_4x4_vfpv3.S b/kernel/arm/sgemm_kernel_4x4_vfpv3.S index 86198ac90..6491d3571 100644 --- a/kernel/arm/sgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/sgemm_kernel_4x4_vfpv3.S @@ -58,14 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OLD_M r0 #define OLD_N r1 #define OLD_K r2 - -#ifdef ARM_SOFTFP_ABI -#define OLD_ALPHA r3 -//#define OLD_A -#else //hard #define OLD_A r3 #define OLD_ALPHA s0 -#endif /****************************************************** * [fp, #-128] - [fp, #-64] is reserved @@ -77,10 +71,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define M [fp, #-256 ] #define N [fp, #-260 ] #define K [fp, #-264 ] - -#ifndef ARM_SOFTFP_ABI #define A [fp, #-268 ] -#endif #define FP_ZERO [fp, #-240] #define FP_ZERO_0 [fp, #-240] @@ -88,17 +79,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-280] -#ifdef ARM_SOFTFP_ABI -#define A [fp, #4 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP r3 +#define OLD_A_SOFTFP [fp, #4 ] #define B [fp, #8 ] #define C [fp, #12 ] #define OLD_LDC [fp, #16 ] -#else //hard +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #endif - + #define I r0 #define J r1 #define L r2 @@ -867,16 +859,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_M, M str OLD_N, N str OLD_K, K - -#ifdef ARM_SOFTFP_ABI - str OLD_ALPHA, ALPHA -#else //hard str OLD_A, A vstr OLD_ALPHA, ALPHA -#endif + sub r3, fp, #128 vstm r3, { s8 - s31} // store floating point registers diff --git a/kernel/arm/strmm_kernel_4x2_vfp.S b/kernel/arm/strmm_kernel_4x2_vfp.S index 8f97644ec..635b1dd13 100644 --- a/kernel/arm/strmm_kernel_4x2_vfp.S +++ b/kernel/arm/strmm_kernel_4x2_vfp.S @@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-276 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP r3 +#define OLD_A_SOFTFP [fp, #4 ] +#define B [fp, #8 ] +#define OLD_C [fp, #12 ] +#define OLD_LDC [fp, #16 ] +#define OFFSET [fp, #20 ] +#else #define B [fp, #4 ] #define OLD_C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -395,6 +404,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/strmm_kernel_4x4_vfpv3.S b/kernel/arm/strmm_kernel_4x4_vfpv3.S index 0dd03ac85..e24d24eba 100644 --- a/kernel/arm/strmm_kernel_4x4_vfpv3.S +++ b/kernel/arm/strmm_kernel_4x4_vfpv3.S @@ -64,10 +64,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ALPHA [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP r3 +#define OLD_A_SOFTFP [fp, #4 ] +#define B [fp, #8 ] +#define C [fp, #12 ] +#define OLD_LDC [fp, #16 ] +#define OFFSET [fp, #20 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -782,6 +791,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/swap_vfp.S b/kernel/arm/swap_vfp.S index 352875188..76661da79 100644 --- a/kernel/arm/swap_vfp.S +++ b/kernel/arm/swap_vfp.S @@ -38,9 +38,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 +#if !defined(__ARM_PCS_VFP) + +#if !defined(COMPLEX) + +#if !defined(DOUBLE) +#define OLD_X [fp, #0 ] +#define OLD_INC_X [fp, #4 ] +#define OLD_Y [fp, #8 ] +#define OLD_INC_Y [fp, #12 ] +#else +#define OLD_X [fp, #8 ] +#define OLD_INC_X [fp, #12] +#define OLD_Y [fp, #16] +#define OLD_INC_Y [fp, #20] +#endif + +#else //COMPLEX + +#if !defined(DOUBLE) +#define OLD_X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define OLD_Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#else +#define OLD_X [fp, #16] +#define OLD_INC_X [fp, #20] +#define OLD_Y [fp, #24] +#define OLD_INC_Y [fp, #28] +#endif + +#endif // !defined(__ARM_PCS_VFP) + +#else #define OLD_INC_X [fp, #0 ] #define OLD_Y [fp, #4 ] #define OLD_INC_Y [fp, #8 ] +#endif #define N r0 @@ -229,6 +263,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. push {r4 , fp} add fp, sp, #8 +#if !defined(__ARM_PCS_VFP) + ldr X, OLD_X +#endif ldr INC_X , OLD_INC_X ldr Y, OLD_Y ldr INC_Y , OLD_INC_Y diff --git a/kernel/arm/zdot_vfp.S b/kernel/arm/zdot_vfp.S index 936ce9f60..43f2c0c0b 100644 --- a/kernel/arm/zdot_vfp.S +++ b/kernel/arm/zdot_vfp.S @@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r0 #define X r1 #define INC_X r2 -#define OLD_Y r3 - /****************************************************** * [fp, #-128] - [fp, #-64] is reserved @@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * registers *******************************************************/ -#define OLD_INC_Y [fp, #4 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_RETURN_ADDR r0 +#define OLD_N r1 +#define OLD_X r2 +#define OLD_INC_X r3 +#define OLD_Y [fp, #0 ] +#define OLD_INC_Y [fp, #4 ] +#define RETURN_ADDR r8 +#else +#define OLD_Y r3 +#define OLD_INC_Y [fp, #0 ] +#endif #define I r5 #define Y r6 @@ -181,7 +190,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 5 push {r4 - r9, fp} - add fp, sp, #24 + add fp, sp, #28 sub sp, sp, #STACKSIZE // reserve stack sub r4, fp, #128 @@ -194,9 +203,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vcvt.f64.f32 d2, s0 vcvt.f64.f32 d3, s0 +#if !defined(__ARM_PCS_VFP) + mov RETURN_ADDR, OLD_RETURN_ADDR + mov N, OLD_N + mov X, OLD_X + mov INC_X, OLD_INC_X + ldr Y, OLD_Y + ldr INC_Y, OLD_INC_Y +#else mov Y, OLD_Y ldr INC_Y, OLD_INC_Y - +#endif cmp N, #0 ble zdot_kernel_L999 @@ -280,8 +297,11 @@ zdot_kernel_L999: vadd.f64 d0 , d0, d2 vsub.f64 d1 , d1, d3 #endif +#if !defined(__ARM_PCS_VFP) + vstm RETURN_ADDR, {d0 - d1} +#endif - sub sp, fp, #24 + sub sp, fp, #28 pop {r4 - r9, fp} bx lr diff --git a/kernel/arm/zgemm_kernel_2x2_vfp.S b/kernel/arm/zgemm_kernel_2x2_vfp.S index 46507c4d2..53d18b07b 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfp.S +++ b/kernel/arm/zgemm_kernel_2x2_vfp.S @@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP [fp, #4] +#define OLD_ALPHAI_SOFTFP [fp, #12] +#define OLD_A_SOFTFP [fp, #20 ] +#define B [fp, #24 ] +#define C [fp, #28 ] +#define OLD_LDC [fp, #32 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -87,42 +96,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(CN) || defined(CT) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(NC) || defined(TC) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #else - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #endif @@ -863,6 +872,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/zgemm_kernel_2x2_vfpv3.S b/kernel/arm/zgemm_kernel_2x2_vfpv3.S index 5a99f792f..a9d4eddeb 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/zgemm_kernel_2x2_vfpv3.S @@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP [fp, #4] +#define OLD_ALPHAI_SOFTFP [fp, #12] +#define OLD_A_SOFTFP [fp, #20 ] +#define B [fp, #24 ] +#define C [fp, #28 ] +#define OLD_LDC [fp, #32 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -106,10 +115,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FADD_R fsubd #define FADD_I faddd - #define FMAC_R1 fnmacd - #define FMAC_R2 fnmacd + #define FMAC_R1 vmls.f64 + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd - #define FMAC_I2 fnmacd + #define FMAC_I2 vmls.f64 #elif defined(CN) || defined(CT) @@ -118,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #elif defined(NC) || defined(TC) @@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_I fsubd #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd @@ -136,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_R fsubd #define FADD_I faddd - #define FMAC_R1 fnmacd + #define FMAC_R1 vmls.f64 #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd - #define FMAC_I2 fnmacd + #define FMAC_I1 vmls.f64 + #define FMAC_I2 vmls.f64 #endif @@ -909,6 +918,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/zgemv_n_vfp.S b/kernel/arm/zgemv_n_vfp.S index da9a91043..3e3a1bc07 100644 --- a/kernel/arm/zgemv_n_vfp.S +++ b/kernel/arm/zgemv_n_vfp.S @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR [fp, #0 ] +#define OLD_ALPHAI [fp, #8 ] +#define OLD_A_SOFTFP [fp, #16] +#define OLD_LDA [fp, #20] +#define X [fp, #24] +#define OLD_INC_X [fp, #28] +#define Y [fp, #32] +#define OLD_INC_Y [fp, #36] +#else +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#endif + #define OLD_A r3 #define OLD_M r0 @@ -79,42 +91,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(CONJ) && !defined(XCONJ) - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(CONJ) && !defined(XCONJ) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif !defined(CONJ) && defined(XCONJ) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #else - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #endif @@ -465,6 +477,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
cmp N, #0 ble zgemvn_kernel_L999 +#if !defined(__ARM_PCS_VFP) + vldr d0, OLD_ALPHAR + vldr d1, OLD_ALPHAI + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_A, A str OLD_M, M vstr d0 , ALPHA_R diff --git a/kernel/arm/zgemv_t_vfp.S b/kernel/arm/zgemv_t_vfp.S index 211fa0701..2193083af 100644 --- a/kernel/arm/zgemv_t_vfp.S +++ b/kernel/arm/zgemv_t_vfp.S @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR [fp, #0 ] +#define OLD_ALPHAI [fp, #8 ] +#define OLD_A_SOFTFP [fp, #16] +#define OLD_LDA [fp, #20] +#define X [fp, #24] +#define OLD_INC_X [fp, #28] +#define Y [fp, #32] +#define OLD_INC_Y [fp, #36] +#else +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#endif + #define OLD_A r3 #define OLD_N r1 @@ -77,42 +89,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(CONJ) && !defined(XCONJ) - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(CONJ) && !defined(XCONJ) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif !defined(CONJ) && defined(XCONJ) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #else - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #endif @@ -360,6 +372,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp OLD_N, #0 ble zgemvt_kernel_L999 +#if !defined(__ARM_PCS_VFP) + vldr d0, OLD_ALPHAR + vldr d1, OLD_ALPHAI + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_A, A str OLD_N, N diff --git a/kernel/arm/ztrmm_kernel_2x2_vfp.S b/kernel/arm/ztrmm_kernel_2x2_vfp.S index dc80b17b8..cb6bc050e 100644 --- a/kernel/arm/ztrmm_kernel_2x2_vfp.S +++ b/kernel/arm/ztrmm_kernel_2x2_vfp.S @@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP [fp, #4] +#define OLD_ALPHAI_SOFTFP [fp, #12] +#define OLD_A_SOFTFP [fp, #20 ] +#define B [fp, #24 ] +#define C [fp, #28 ] +#define OLD_LDC [fp, #32 ] +#define OFFSET [fp, #36 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -96,42 +106,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(CN) || defined(CT) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmacd #define FMAC_I2 fmacd #elif defined(NC) || defined(TC) #define KMAC_R fmacd - #define KMAC_I fnmacd + #define KMAC_I vmls.f64 #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #else - #define KMAC_R fnmacd + #define KMAC_R vmls.f64 #define KMAC_I fmacd #define FMAC_R1 fmacd #define FMAC_R2 fmacd - #define FMAC_I1 fnmacd + #define FMAC_I1 vmls.f64 #define FMAC_I2 fmacd #endif @@ -882,6 +892,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S index 5a808ccbc..3e6962f06 100644 --- a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S +++ b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S @@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP [fp, #4] +#define OLD_ALPHAI_SOFTFP [fp, #12] +#define OLD_A_SOFTFP [fp, #20 ] +#define B [fp, #24 ] +#define C [fp, #28 ] +#define OLD_LDC [fp, #32 ] +#define OFFSET [fp, #36 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -93,10 +103,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_R fsubd #define FADD_I faddd - #define FMAC_R1 fnmuld - #define FMAC_R2 fnmacd + #define FMAC_R1 vnmul.f64 + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmuld - #define FMAC_I2 fnmacd + #define FMAC_I2 vmls.f64 #elif defined(CN) || defined(CT) @@ -105,7 +115,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FMAC_R1 fmuld #define FMAC_R2 fmacd - #define FMAC_I1 fnmuld + #define FMAC_I1 vnmul.f64 #define FMAC_I2 fmacd #elif defined(NC) || defined(TC) @@ -114,7 +124,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_I fsubd #define FMAC_R1 fmuld - #define FMAC_R2 fnmacd + #define FMAC_R2 vmls.f64 #define FMAC_I1 fmuld #define FMAC_I2 fmacd @@ -123,10 +133,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FADD_R fsubd #define FADD_I faddd - #define FMAC_R1 fnmuld + #define FMAC_R1 vnmul.f64 #define FMAC_R2 fmacd - #define FMAC_I1 fnmuld - #define FMAC_I2 fnmacd + #define FMAC_I1 vnmul.f64 + #define FMAC_I2 vmls.f64 #endif @@ -883,6 +893,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/power/casum_microk_power8.c b/kernel/power/casum_microk_power8.c index 93ba50660..7d12c9885 100644 --- a/kernel/power/casum_microk_power8.c +++ b/kernel/power/casum_microk_power8.c @@ -56,14 +56,14 @@ static float casum_kernel_16 (long n, float *x) "xxlxor 38, 38, 38 \n\t" "xxlxor 39, 39, 39 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %8, %2 \n\t" - "lxvw4x 42, %9, %2 \n\t" - "lxvw4x 43, %10, %2 \n\t" - "lxvw4x 44, %11, %2 \n\t" - "lxvw4x 45, %12, %2 \n\t" - "lxvw4x 46, %13, %2 \n\t" - "lxvw4x 47, %14, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %8, %2 \n\t" + "lxvd2x 42, %9, %2 \n\t" + "lxvd2x 43, %10, %2 \n\t" + "lxvd2x 44, %11, %2 \n\t" + "lxvd2x 45, %12, %2 \n\t" + "lxvd2x 46, %13, %2 \n\t" + "lxvd2x 47, %14, %2 \n\t" "addi %2, %2, 128 \n\t" @@ -78,26 +78,26 @@ static float casum_kernel_16 (long n, float *x) "xvabssp 50, 42 \n\t" "xvabssp 51, 43 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %8, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %8, %2 \n\t" "xvabssp %x3, 44 \n\t" "xvabssp %x4, 45 \n\t" - "lxvw4x 42, %9, %2 \n\t" - "lxvw4x 43, %10, %2 \n\t" + "lxvd2x 42, %9, %2 \n\t" + "lxvd2x 43, %10, %2 \n\t" "xvabssp %x5, 46 \n\t" "xvabssp %x6, 47 \n\t" - "lxvw4x 44, %11, %2 \n\t" - "lxvw4x 45, %12, %2 \n\t" + "lxvd2x 44, %11, %2 \n\t" + "lxvd2x 45, %12, %2 \n\t" "xvaddsp 32, 32, 48 \n\t" "xvaddsp 33, 33, 49 \n\t" - "lxvw4x 46, %13, %2 \n\t" - "lxvw4x 47, %14, %2 \n\t" + "lxvd2x 46, %13, %2 \n\t" + "lxvd2x 47, %14, %2 \n\t" "xvaddsp 34, 34, 50 \n\t" "xvaddsp 35, 35, 51 \n\t" diff --git a/kernel/power/ccopy_microk_power8.c b/kernel/power/ccopy_microk_power8.c index b2b1bead1..613c4d286 100644 --- a/kernel/power/ccopy_microk_power8.c +++ b/kernel/power/ccopy_microk_power8.c @@ -39,25 +39,25 @@ static void ccopy_kernel_32 (long n, float *x, float *y) { __asm__ ( - "lxvw4x 32, 0, %2 \n\t" - "lxvw4x 33, %5, %2 \n\t" - "lxvw4x 34, %6, %2 \n\t" - "lxvw4x 35, %7, %2 \n\t" - "lxvw4x 36, %8, %2 \n\t" - "lxvw4x 37, %9, %2 \n\t" - "lxvw4x 38, %10, %2 \n\t" - "lxvw4x 39, %11, %2 \n\t" + "lxvd2x 32, 0, %2 \n\t" + "lxvd2x 33, %5, %2 \n\t" + "lxvd2x 34, %6, %2 \n\t" + "lxvd2x 35, %7, %2 \n\t" + "lxvd2x 36, %8, %2 \n\t" + "lxvd2x 37, %9, %2 \n\t" + "lxvd2x 38, %10, %2 \n\t" + "lxvd2x 39, %11, %2 \n\t" "addi %2, %2, 128 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %5, %2 \n\t" - "lxvw4x 42, %6, %2 \n\t" - "lxvw4x 43, %7, %2 \n\t" - "lxvw4x 44, %8, %2 \n\t" - "lxvw4x 45, %9, %2 \n\t" - "lxvw4x 46, %10, %2 \n\t" - "lxvw4x 47, %11, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" "addi %2, %2, 128 \n\t" @@ -67,42 +67,42 @@ static void ccopy_kernel_32 (long n, float *x, float *y) ".p2align 5 \n" "1: \n\t" - "stxvw4x 32, 0, %3 \n\t" - "stxvw4x 33, %5, %3 \n\t" - "lxvw4x 32, 0, %2 \n\t" - "lxvw4x 33, %5, %2 \n\t" - "stxvw4x 34, %6, %3 \n\t" - "stxvw4x 35, %7, %3 \n\t" - "lxvw4x 34, %6, %2 \n\t" - "lxvw4x 35, %7, %2 \n\t" - "stxvw4x 36, %8, %3 \n\t" - "stxvw4x 37, %9, %3 \n\t" - "lxvw4x 36, %8, %2 \n\t" - "lxvw4x 37, %9, %2 \n\t" - "stxvw4x 38, %10, %3 \n\t" - "stxvw4x 39, %11, %3 \n\t" - "lxvw4x 38, %10, %2 \n\t" - "lxvw4x 39, %11, %2 \n\t" + "stxvd2x 32, 0, %3 \n\t" + "stxvd2x 
33, %5, %3 \n\t" + "lxvd2x 32, 0, %2 \n\t" + "lxvd2x 33, %5, %2 \n\t" + "stxvd2x 34, %6, %3 \n\t" + "stxvd2x 35, %7, %3 \n\t" + "lxvd2x 34, %6, %2 \n\t" + "lxvd2x 35, %7, %2 \n\t" + "stxvd2x 36, %8, %3 \n\t" + "stxvd2x 37, %9, %3 \n\t" + "lxvd2x 36, %8, %2 \n\t" + "lxvd2x 37, %9, %2 \n\t" + "stxvd2x 38, %10, %3 \n\t" + "stxvd2x 39, %11, %3 \n\t" + "lxvd2x 38, %10, %2 \n\t" + "lxvd2x 39, %11, %2 \n\t" "addi %3, %3, 128 \n\t" "addi %2, %2, 128 \n\t" - "stxvw4x 40, 0, %3 \n\t" - "stxvw4x 41, %5, %3 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %5, %2 \n\t" - "stxvw4x 42, %6, %3 \n\t" - "stxvw4x 43, %7, %3 \n\t" - "lxvw4x 42, %6, %2 \n\t" - "lxvw4x 43, %7, %2 \n\t" - "stxvw4x 44, %8, %3 \n\t" - "stxvw4x 45, %9, %3 \n\t" - "lxvw4x 44, %8, %2 \n\t" - "lxvw4x 45, %9, %2 \n\t" - "stxvw4x 46, %10, %3 \n\t" - "stxvw4x 47, %11, %3 \n\t" - "lxvw4x 46, %10, %2 \n\t" - "lxvw4x 47, %11, %2 \n\t" + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" "addi %3, %3, 128 \n\t" "addi %2, %2, 128 \n\t" @@ -112,25 +112,25 @@ static void ccopy_kernel_32 (long n, float *x, float *y) "2: \n\t" - "stxvw4x 32, 0, %3 \n\t" - "stxvw4x 33, %5, %3 \n\t" - "stxvw4x 34, %6, %3 \n\t" - "stxvw4x 35, %7, %3 \n\t" - "stxvw4x 36, %8, %3 \n\t" - "stxvw4x 37, %9, %3 \n\t" - "stxvw4x 38, %10, %3 \n\t" - "stxvw4x 39, %11, %3 \n\t" + "stxvd2x 32, 0, %3 \n\t" + "stxvd2x 33, %5, %3 \n\t" + "stxvd2x 34, %6, %3 \n\t" + "stxvd2x 35, %7, %3 \n\t" + "stxvd2x 36, %8, %3 \n\t" + "stxvd2x 37, %9, %3 \n\t" + "stxvd2x 38, %10, %3 \n\t" + "stxvd2x 39, %11, %3 \n\t" "addi %3, %3, 128 \n\t" - "stxvw4x 40, 0, %3 \n\t" - "stxvw4x 41, %5, %3 \n\t" - "stxvw4x 42, %6, %3 \n\t" - "stxvw4x 43, %7, %3 \n\t" - "stxvw4x 44, %8, %3 \n\t" - "stxvw4x 45, %9, %3 \n\t" - "stxvw4x 46, %10, %3 \n\t" - "stxvw4x 47, %11, %3 \n" + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n" "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : diff --git a/kernel/power/cswap_microk_power8.c b/kernel/power/cswap_microk_power8.c index 1dd03dc88..8d7d0c0b9 100644 --- a/kernel/power/cswap_microk_power8.c +++ b/kernel/power/cswap_microk_power8.c @@ -42,91 +42,91 @@ static void cswap_kernel_32 (long n, float *x, float *y) ".p2align 5 \n" "1: \n\t" - "lxvw4x 32, 0, %4 \n\t" - "lxvw4x 33, %5, %4 \n\t" - "lxvw4x 34, %6, %4 \n\t" - "lxvw4x 35, %7, %4 \n\t" - "lxvw4x 36, %8, %4 \n\t" - "lxvw4x 37, %9, %4 \n\t" - "lxvw4x 38, %10, %4 \n\t" - "lxvw4x 39, %11, %4 \n\t" + "lxvd2x 32, 0, %4 \n\t" + "lxvd2x 33, %5, %4 \n\t" + "lxvd2x 34, %6, %4 \n\t" + "lxvd2x 35, %7, %4 \n\t" + "lxvd2x 36, %8, %4 \n\t" + "lxvd2x 37, %9, %4 \n\t" + "lxvd2x 38, %10, %4 \n\t" + "lxvd2x 39, %11, %4 \n\t" "addi %4, %4, 128 \n\t" - "lxvw4x 40, 0, %4 \n\t" - "lxvw4x 41, %5, %4 \n\t" - "lxvw4x 42, %6, %4 \n\t" - "lxvw4x 43, %7, %4 \n\t" - "lxvw4x 44, %8, %4 \n\t" - "lxvw4x 45, %9, %4 \n\t" - "lxvw4x 46, %10, %4 \n\t" - "lxvw4x 47, %11, %4 \n\t" + "lxvd2x 40, 0, %4 \n\t" + "lxvd2x 41, %5, %4 \n\t" + "lxvd2x 42, %6, %4 \n\t" + "lxvd2x 43, %7, %4 
\n\t" + "lxvd2x 44, %8, %4 \n\t" + "lxvd2x 45, %9, %4 \n\t" + "lxvd2x 46, %10, %4 \n\t" + "lxvd2x 47, %11, %4 \n\t" "addi %4, %4, -128 \n\t" - "lxvw4x 48, 0, %3 \n\t" - "lxvw4x 49, %5, %3 \n\t" - "lxvw4x 50, %6, %3 \n\t" - "lxvw4x 51, %7, %3 \n\t" - "lxvw4x 0, %8, %3 \n\t" - "lxvw4x 1, %9, %3 \n\t" - "lxvw4x 2, %10, %3 \n\t" - "lxvw4x 3, %11, %3 \n\t" + "lxvd2x 48, 0, %3 \n\t" + "lxvd2x 49, %5, %3 \n\t" + "lxvd2x 50, %6, %3 \n\t" + "lxvd2x 51, %7, %3 \n\t" + "lxvd2x 0, %8, %3 \n\t" + "lxvd2x 1, %9, %3 \n\t" + "lxvd2x 2, %10, %3 \n\t" + "lxvd2x 3, %11, %3 \n\t" "addi %3, %3, 128 \n\t" - "lxvw4x 4, 0, %3 \n\t" - "lxvw4x 5, %5, %3 \n\t" - "lxvw4x 6, %6, %3 \n\t" - "lxvw4x 7, %7, %3 \n\t" - "lxvw4x 8, %8, %3 \n\t" - "lxvw4x 9, %9, %3 \n\t" - "lxvw4x 10, %10, %3 \n\t" - "lxvw4x 11, %11, %3 \n\t" + "lxvd2x 4, 0, %3 \n\t" + "lxvd2x 5, %5, %3 \n\t" + "lxvd2x 6, %6, %3 \n\t" + "lxvd2x 7, %7, %3 \n\t" + "lxvd2x 8, %8, %3 \n\t" + "lxvd2x 9, %9, %3 \n\t" + "lxvd2x 10, %10, %3 \n\t" + "lxvd2x 11, %11, %3 \n\t" "addi %3, %3, -128 \n\t" - "stxvw4x 32, 0, %3 \n\t" - "stxvw4x 33, %5, %3 \n\t" - "stxvw4x 34, %6, %3 \n\t" - "stxvw4x 35, %7, %3 \n\t" - "stxvw4x 36, %8, %3 \n\t" - "stxvw4x 37, %9, %3 \n\t" - "stxvw4x 38, %10, %3 \n\t" - "stxvw4x 39, %11, %3 \n\t" + "stxvd2x 32, 0, %3 \n\t" + "stxvd2x 33, %5, %3 \n\t" + "stxvd2x 34, %6, %3 \n\t" + "stxvd2x 35, %7, %3 \n\t" + "stxvd2x 36, %8, %3 \n\t" + "stxvd2x 37, %9, %3 \n\t" + "stxvd2x 38, %10, %3 \n\t" + "stxvd2x 39, %11, %3 \n\t" "addi %3, %3, 128 \n\t" - "stxvw4x 40, 0, %3 \n\t" - "stxvw4x 41, %5, %3 \n\t" - "stxvw4x 42, %6, %3 \n\t" - "stxvw4x 43, %7, %3 \n\t" - "stxvw4x 44, %8, %3 \n\t" - "stxvw4x 45, %9, %3 \n\t" - "stxvw4x 46, %10, %3 \n\t" - "stxvw4x 47, %11, %3 \n\t" + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n\t" "addi %3, %3, 128 \n\t" - "stxvw4x 48, 0, %4 \n\t" - "stxvw4x 49, %5, %4 \n\t" - "stxvw4x 50, %6, %4 \n\t" - "stxvw4x 51, %7, %4 \n\t" - "stxvw4x 0, %8, %4 \n\t" - "stxvw4x 1, %9, %4 \n\t" - "stxvw4x 2, %10, %4 \n\t" - "stxvw4x 3, %11, %4 \n\t" + "stxvd2x 48, 0, %4 \n\t" + "stxvd2x 49, %5, %4 \n\t" + "stxvd2x 50, %6, %4 \n\t" + "stxvd2x 51, %7, %4 \n\t" + "stxvd2x 0, %8, %4 \n\t" + "stxvd2x 1, %9, %4 \n\t" + "stxvd2x 2, %10, %4 \n\t" + "stxvd2x 3, %11, %4 \n\t" "addi %4, %4, 128 \n\t" - "stxvw4x 4, 0, %4 \n\t" - "stxvw4x 5, %5, %4 \n\t" - "stxvw4x 6, %6, %4 \n\t" - "stxvw4x 7, %7, %4 \n\t" - "stxvw4x 8, %8, %4 \n\t" - "stxvw4x 9, %9, %4 \n\t" - "stxvw4x 10, %10, %4 \n\t" - "stxvw4x 11, %11, %4 \n\t" + "stxvd2x 4, 0, %4 \n\t" + "stxvd2x 5, %5, %4 \n\t" + "stxvd2x 6, %6, %4 \n\t" + "stxvd2x 7, %7, %4 \n\t" + "stxvd2x 8, %8, %4 \n\t" + "stxvd2x 9, %9, %4 \n\t" + "stxvd2x 10, %10, %4 \n\t" + "stxvd2x 11, %11, %4 \n\t" "addi %4, %4, 128 \n\t" diff --git a/kernel/power/sasum_microk_power8.c b/kernel/power/sasum_microk_power8.c index 08a766f80..4bb515de8 100644 --- a/kernel/power/sasum_microk_power8.c +++ b/kernel/power/sasum_microk_power8.c @@ -56,14 +56,14 @@ static float sasum_kernel_32 (long n, float *x) "xxlxor 38, 38, 38 \n\t" "xxlxor 39, 39, 39 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %8, %2 \n\t" - "lxvw4x 42, %9, %2 \n\t" - "lxvw4x 43, %10, %2 \n\t" - "lxvw4x 44, %11, %2 \n\t" - "lxvw4x 45, %12, %2 \n\t" - "lxvw4x 46, %13, %2 \n\t" - "lxvw4x 47, %14, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %8, %2 \n\t" + "lxvd2x 42, %9, %2 \n\t" + "lxvd2x 43, %10, %2 
\n\t" + "lxvd2x 44, %11, %2 \n\t" + "lxvd2x 45, %12, %2 \n\t" + "lxvd2x 46, %13, %2 \n\t" + "lxvd2x 47, %14, %2 \n\t" "addi %2, %2, 128 \n\t" @@ -78,26 +78,26 @@ static float sasum_kernel_32 (long n, float *x) "xvabssp 50, 42 \n\t" "xvabssp 51, 43 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %8, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %8, %2 \n\t" "xvabssp %x3, 44 \n\t" "xvabssp %x4, 45 \n\t" - "lxvw4x 42, %9, %2 \n\t" - "lxvw4x 43, %10, %2 \n\t" + "lxvd2x 42, %9, %2 \n\t" + "lxvd2x 43, %10, %2 \n\t" "xvabssp %x5, 46 \n\t" "xvabssp %x6, 47 \n\t" - "lxvw4x 44, %11, %2 \n\t" - "lxvw4x 45, %12, %2 \n\t" + "lxvd2x 44, %11, %2 \n\t" + "lxvd2x 45, %12, %2 \n\t" "xvaddsp 32, 32, 48 \n\t" "xvaddsp 33, 33, 49 \n\t" - "lxvw4x 46, %13, %2 \n\t" - "lxvw4x 47, %14, %2 \n\t" + "lxvd2x 46, %13, %2 \n\t" + "lxvd2x 47, %14, %2 \n\t" "xvaddsp 34, 34, 50 \n\t" "xvaddsp 35, 35, 51 \n\t" diff --git a/kernel/power/scopy_microk_power8.c b/kernel/power/scopy_microk_power8.c index 444a6d4d5..7a54d5e1e 100644 --- a/kernel/power/scopy_microk_power8.c +++ b/kernel/power/scopy_microk_power8.c @@ -39,14 +39,14 @@ static void scopy_kernel_32 (long n, float *x, float *y) { __asm__ ( - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %5, %2 \n\t" - "lxvw4x 42, %6, %2 \n\t" - "lxvw4x 43, %7, %2 \n\t" - "lxvw4x 44, %8, %2 \n\t" - "lxvw4x 45, %9, %2 \n\t" - "lxvw4x 46, %10, %2 \n\t" - "lxvw4x 47, %11, %2 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" "addi %2, %2, 128 \n\t" @@ -56,22 +56,22 @@ static void scopy_kernel_32 (long n, float *x, float *y) ".p2align 5 \n" "1: \n\t" - "stxvw4x 40, 0, %3 \n\t" - "stxvw4x 41, %5, %3 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 41, %5, %2 \n\t" - "stxvw4x 42, %6, %3 \n\t" - "stxvw4x 43, %7, %3 \n\t" - "lxvw4x 42, %6, %2 \n\t" - "lxvw4x 43, %7, %2 \n\t" - "stxvw4x 44, %8, %3 \n\t" - "stxvw4x 45, %9, %3 \n\t" - "lxvw4x 44, %8, %2 \n\t" - "lxvw4x 45, %9, %2 \n\t" - "stxvw4x 46, %10, %3 \n\t" - "stxvw4x 47, %11, %3 \n\t" - "lxvw4x 46, %10, %2 \n\t" - "lxvw4x 47, %11, %2 \n\t" + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" "addi %3, %3, 128 \n\t" "addi %2, %2, 128 \n\t" @@ -81,14 +81,14 @@ static void scopy_kernel_32 (long n, float *x, float *y) "2: \n\t" - "stxvw4x 40, 0, %3 \n\t" - "stxvw4x 41, %5, %3 \n\t" - "stxvw4x 42, %6, %3 \n\t" - "stxvw4x 43, %7, %3 \n\t" - "stxvw4x 44, %8, %3 \n\t" - "stxvw4x 45, %9, %3 \n\t" - "stxvw4x 46, %10, %3 \n\t" - "stxvw4x 47, %11, %3 \n" + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n" "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : diff --git a/kernel/power/sdot_microk_power8.c b/kernel/power/sdot_microk_power8.c index 7f7ccfac3..bfe100c8b 100644 --- a/kernel/power/sdot_microk_power8.c +++ b/kernel/power/sdot_microk_power8.c @@ -57,22 +57,22 @@ static float sdot_kernel_16 (long n, float *x, float 
*y) "xxlxor 38, 38, 38 \n\t" "xxlxor 39, 39, 39 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 48, 0, %3 \n\t" - "lxvw4x 41, %10, %2 \n\t" - "lxvw4x 49, %10, %3 \n\t" - "lxvw4x 42, %11, %2 \n\t" - "lxvw4x 50, %11, %3 \n\t" - "lxvw4x 43, %12, %2 \n\t" - "lxvw4x 51, %12, %3 \n\t" - "lxvw4x 44, %13, %2 \n\t" - "lxvw4x %x4, %13, %3 \n\t" - "lxvw4x 45, %14, %2 \n\t" - "lxvw4x %x5, %14, %3 \n\t" - "lxvw4x 46, %15, %2 \n\t" - "lxvw4x %x6, %15, %3 \n\t" - "lxvw4x 47, %16, %2 \n\t" - "lxvw4x %x7, %16, %3 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 48, 0, %3 \n\t" + "lxvd2x 41, %10, %2 \n\t" + "lxvd2x 49, %10, %3 \n\t" + "lxvd2x 42, %11, %2 \n\t" + "lxvd2x 50, %11, %3 \n\t" + "lxvd2x 43, %12, %2 \n\t" + "lxvd2x 51, %12, %3 \n\t" + "lxvd2x 44, %13, %2 \n\t" + "lxvd2x %x4, %13, %3 \n\t" + "lxvd2x 45, %14, %2 \n\t" + "lxvd2x %x5, %14, %3 \n\t" + "lxvd2x 46, %15, %2 \n\t" + "lxvd2x %x6, %15, %3 \n\t" + "lxvd2x 47, %16, %2 \n\t" + "lxvd2x %x7, %16, %3 \n\t" "addi %2, %2, 128 \n\t" "addi %3, %3, 128 \n\t" @@ -84,29 +84,29 @@ static float sdot_kernel_16 (long n, float *x, float *y) "1: \n\t" "xvmaddasp 32, 40, 48 \n\t" - "lxvw4x 40, 0, %2 \n\t" - "lxvw4x 48, 0, %3 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 48, 0, %3 \n\t" "xvmaddasp 33, 41, 49 \n\t" - "lxvw4x 41, %10, %2 \n\t" - "lxvw4x 49, %10, %3 \n\t" + "lxvd2x 41, %10, %2 \n\t" + "lxvd2x 49, %10, %3 \n\t" "xvmaddasp 34, 42, 50 \n\t" - "lxvw4x 42, %11, %2 \n\t" - "lxvw4x 50, %11, %3 \n\t" + "lxvd2x 42, %11, %2 \n\t" + "lxvd2x 50, %11, %3 \n\t" "xvmaddasp 35, 43, 51 \n\t" - "lxvw4x 43, %12, %2 \n\t" - "lxvw4x 51, %12, %3 \n\t" + "lxvd2x 43, %12, %2 \n\t" + "lxvd2x 51, %12, %3 \n\t" "xvmaddasp 36, 44, %x4 \n\t" - "lxvw4x 44, %13, %2 \n\t" - "lxvw4x %x4, %13, %3 \n\t" + "lxvd2x 44, %13, %2 \n\t" + "lxvd2x %x4, %13, %3 \n\t" "xvmaddasp 37, 45, %x5 \n\t" - "lxvw4x 45, %14, %2 \n\t" - "lxvw4x %x5, %14, %3 \n\t" + "lxvd2x 45, %14, %2 \n\t" + "lxvd2x %x5, %14, %3 \n\t" "xvmaddasp 38, 46, %x6 \n\t" - "lxvw4x 46, %15, %2 \n\t" - "lxvw4x %x6, %15, %3 \n\t" + "lxvd2x 46, %15, %2 \n\t" + "lxvd2x %x6, %15, %3 \n\t" "xvmaddasp 39, 47, %x7 \n\t" - "lxvw4x 47, %16, %2 \n\t" - "lxvw4x %x7, %16, %3 \n\t" + "lxvd2x 47, %16, %2 \n\t" + "lxvd2x %x7, %16, %3 \n\t" "addi %2, %2, 128 \n\t" "addi %3, %3, 128 \n\t" diff --git a/kernel/power/srot_microk_power8.c b/kernel/power/srot_microk_power8.c index 0a18c16e0..6eecb60a1 100644 --- a/kernel/power/srot_microk_power8.c +++ b/kernel/power/srot_microk_power8.c @@ -57,15 +57,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) "xscvdpspn 37, %x14 \n\t" // load s to all words "xxspltw 37, 37, 0 \n\t" - "lxvw4x 32, 0, %3 \n\t" // load x - "lxvw4x 33, %15, %3 \n\t" - "lxvw4x 34, %16, %3 \n\t" - "lxvw4x 35, %17, %3 \n\t" + "lxvd2x 32, 0, %3 \n\t" // load x + "lxvd2x 33, %15, %3 \n\t" + "lxvd2x 34, %16, %3 \n\t" + "lxvd2x 35, %17, %3 \n\t" - "lxvw4x 48, 0, %4 \n\t" // load y - "lxvw4x 49, %15, %4 \n\t" - "lxvw4x 50, %16, %4 \n\t" - "lxvw4x 51, %17, %4 \n\t" + "lxvd2x 48, 0, %4 \n\t" // load y + "lxvd2x 49, %15, %4 \n\t" + "lxvd2x 50, %16, %4 \n\t" + "lxvd2x 51, %17, %4 \n\t" "addi %3, %3, 64 \n\t" "addi %4, %4, 64 \n\t" @@ -89,26 +89,26 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) "xvmulsp 44, 32, 37 \n\t" // s * x "xvmulsp 45, 33, 37 \n\t" - "lxvw4x 32, 0, %3 \n\t" // load x - "lxvw4x 33, %15, %3 \n\t" + "lxvd2x 32, 0, %3 \n\t" // load x + "lxvd2x 33, %15, %3 \n\t" "xvmulsp 46, 34, 37 \n\t" "xvmulsp 47, 35, 37 \n\t" - "lxvw4x 34, %16, %3 \n\t" - "lxvw4x 35, %17, %3 \n\t" + "lxvd2x 34, 
%16, %3 \n\t" + "lxvd2x 35, %17, %3 \n\t" "xvmulsp %x9, 48, 37 \n\t" // s * y "xvmulsp %x10, 49, 37 \n\t" - "lxvw4x 48, 0, %4 \n\t" // load y - "lxvw4x 49, %15, %4 \n\t" + "lxvd2x 48, 0, %4 \n\t" // load y + "lxvd2x 49, %15, %4 \n\t" "xvmulsp %x11, 50, 37 \n\t" "xvmulsp %x12, 51, 37 \n\t" - "lxvw4x 50, %16, %4 \n\t" - "lxvw4x 51, %17, %4 \n\t" + "lxvd2x 50, %16, %4 \n\t" + "lxvd2x 51, %17, %4 \n\t" "xvaddsp 40, 40, %x9 \n\t" // c * x + s * y "xvaddsp 41, 41, %x10 \n\t" // c * x + s * y @@ -124,15 +124,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) "xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x "xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x - "stxvw4x 40, 0, %3 \n\t" // store x - "stxvw4x 41, %15, %3 \n\t" - "stxvw4x 42, %16, %3 \n\t" - "stxvw4x 43, %17, %3 \n\t" + "stxvd2x 40, 0, %3 \n\t" // store x + "stxvd2x 41, %15, %3 \n\t" + "stxvd2x 42, %16, %3 \n\t" + "stxvd2x 43, %17, %3 \n\t" - "stxvw4x %x5, 0, %4 \n\t" // store y - "stxvw4x %x6, %15, %4 \n\t" - "stxvw4x %x7, %16, %4 \n\t" - "stxvw4x %x8, %17, %4 \n\t" + "stxvd2x %x5, 0, %4 \n\t" // store y + "stxvd2x %x6, %15, %4 \n\t" + "stxvd2x %x7, %16, %4 \n\t" + "stxvd2x %x8, %17, %4 \n\t" "addi %3, %3, 128 \n\t" "addi %4, %4, 128 \n\t" @@ -175,15 +175,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) "xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x "xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x - "stxvw4x 40, 0, %3 \n\t" // store x - "stxvw4x 41, %15, %3 \n\t" - "stxvw4x 42, %16, %3 \n\t" - "stxvw4x 43, %17, %3 \n\t" + "stxvd2x 40, 0, %3 \n\t" // store x + "stxvd2x 41, %15, %3 \n\t" + "stxvd2x 42, %16, %3 \n\t" + "stxvd2x 43, %17, %3 \n\t" - "stxvw4x %x5, 0, %4 \n\t" // store y - "stxvw4x %x6, %15, %4 \n\t" - "stxvw4x %x7, %16, %4 \n\t" - "stxvw4x %x8, %17, %4 \n" + "stxvd2x %x5, 0, %4 \n\t" // store y + "stxvd2x %x6, %15, %4 \n\t" + "stxvd2x %x7, %16, %4 \n\t" + "stxvd2x %x8, %17, %4 \n" "#n=%2 x=%0=%3 y=%1=%4 c=%13 s=%14 o16=%15 o32=%16 o48=%17\n" "#t0=%x5 t1=%x6 t2=%x7 t3=%x8 t4=%x9 t5=%x10 t6=%x11 t7=%x12" diff --git a/kernel/power/sscal_microk_power8.c b/kernel/power/sscal_microk_power8.c index 49862a329..058ff3399 100644 --- a/kernel/power/sscal_microk_power8.c +++ b/kernel/power/sscal_microk_power8.c @@ -44,14 +44,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha) "xscvdpspn %x3, %x3 \n\t" "xxspltw %x3, %x3, 0 \n\t" - "lxvw4x 32, 0, %2 \n\t" - "lxvw4x 33, %4, %2 \n\t" - "lxvw4x 34, %5, %2 \n\t" - "lxvw4x 35, %6, %2 \n\t" - "lxvw4x 36, %7, %2 \n\t" - "lxvw4x 37, %8, %2 \n\t" - "lxvw4x 38, %9, %2 \n\t" - "lxvw4x 39, %10, %2 \n\t" + "lxvd2x 32, 0, %2 \n\t" + "lxvd2x 33, %4, %2 \n\t" + "lxvd2x 34, %5, %2 \n\t" + "lxvd2x 35, %6, %2 \n\t" + "lxvd2x 36, %7, %2 \n\t" + "lxvd2x 37, %8, %2 \n\t" + "lxvd2x 38, %9, %2 \n\t" + "lxvd2x 39, %10, %2 \n\t" "addi %2, %2, 128 \n\t" @@ -63,31 +63,31 @@ static void sscal_kernel_16 (long n, float *x, float alpha) "xvmulsp 40, 32, %x3 \n\t" "xvmulsp 41, 33, %x3 \n\t" - "lxvw4x 32, 0, %2 \n\t" - "lxvw4x 33, %4, %2 \n\t" + "lxvd2x 32, 0, %2 \n\t" + "lxvd2x 33, %4, %2 \n\t" "xvmulsp 42, 34, %x3 \n\t" "xvmulsp 43, 35, %x3 \n\t" - "lxvw4x 34, %5, %2 \n\t" - "lxvw4x 35, %6, %2 \n\t" + "lxvd2x 34, %5, %2 \n\t" + "lxvd2x 35, %6, %2 \n\t" "xvmulsp 44, 36, %x3 \n\t" "xvmulsp 45, 37, %x3 \n\t" - "lxvw4x 36, %7, %2 \n\t" - "lxvw4x 37, %8, %2 \n\t" + "lxvd2x 36, %7, %2 \n\t" + "lxvd2x 37, %8, %2 \n\t" "xvmulsp 46, 38, %x3 \n\t" "xvmulsp 47, 39, %x3 \n\t" - "lxvw4x 38, %9, %2 \n\t" - "lxvw4x 39, %10, %2 \n\t" + "lxvd2x 38, %9, %2 \n\t" + "lxvd2x 39, 
%10, %2 \n\t" "addi %2, %2, -128 \n\t" - "stxvw4x 40, 0, %2 \n\t" - "stxvw4x 41, %4, %2 \n\t" - "stxvw4x 42, %5, %2 \n\t" - "stxvw4x 43, %6, %2 \n\t" - "stxvw4x 44, %7, %2 \n\t" - "stxvw4x 45, %8, %2 \n\t" - "stxvw4x 46, %9, %2 \n\t" - "stxvw4x 47, %10, %2 \n\t" + "stxvd2x 40, 0, %2 \n\t" + "stxvd2x 41, %4, %2 \n\t" + "stxvd2x 42, %5, %2 \n\t" + "stxvd2x 43, %6, %2 \n\t" + "stxvd2x 44, %7, %2 \n\t" + "stxvd2x 45, %8, %2 \n\t" + "stxvd2x 46, %9, %2 \n\t" + "stxvd2x 47, %10, %2 \n\t" "addi %2, %2, 256 \n\t" @@ -108,14 +108,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha) "xvmulsp 46, 38, %x3 \n\t" "xvmulsp 47, 39, %x3 \n\t" - "stxvw4x 40, 0, %2 \n\t" - "stxvw4x 41, %4, %2 \n\t" - "stxvw4x 42, %5, %2 \n\t" - "stxvw4x 43, %6, %2 \n\t" - "stxvw4x 44, %7, %2 \n\t" - "stxvw4x 45, %8, %2 \n\t" - "stxvw4x 46, %9, %2 \n\t" - "stxvw4x 47, %10, %2 \n" + "stxvd2x 40, 0, %2 \n\t" + "stxvd2x 41, %4, %2 \n\t" + "stxvd2x 42, %5, %2 \n\t" + "stxvd2x 43, %6, %2 \n\t" + "stxvd2x 44, %7, %2 \n\t" + "stxvd2x 45, %8, %2 \n\t" + "stxvd2x 46, %9, %2 \n\t" + "stxvd2x 47, %10, %2 \n" "#n=%1 alpha=%3 x=%0=%2 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10" : @@ -150,14 +150,14 @@ static void sscal_kernel_16_zero (long n, float *x) ".p2align 5 \n" "1: \n\t" - "stxvw4x %x3, 0, %2 \n\t" - "stxvw4x %x3, %4, %2 \n\t" - "stxvw4x %x3, %5, %2 \n\t" - "stxvw4x %x3, %6, %2 \n\t" - "stxvw4x %x3, %7, %2 \n\t" - "stxvw4x %x3, %8, %2 \n\t" - "stxvw4x %x3, %9, %2 \n\t" - "stxvw4x %x3, %10, %2 \n\t" + "stxvd2x %x3, 0, %2 \n\t" + "stxvd2x %x3, %4, %2 \n\t" + "stxvd2x %x3, %5, %2 \n\t" + "stxvd2x %x3, %6, %2 \n\t" + "stxvd2x %x3, %7, %2 \n\t" + "stxvd2x %x3, %8, %2 \n\t" + "stxvd2x %x3, %9, %2 \n\t" + "stxvd2x %x3, %10, %2 \n\t" "addi %2, %2, 128 \n\t" diff --git a/kernel/power/sswap_microk_power8.c b/kernel/power/sswap_microk_power8.c index d44f16765..cfefdd6ef 100644 --- a/kernel/power/sswap_microk_power8.c +++ b/kernel/power/sswap_microk_power8.c @@ -42,43 +42,43 @@ static void sswap_kernel_32 (long n, float *x, float *y) ".p2align 5 \n" "1: \n\t" - "lxvw4x 32, 0, %4 \n\t" - "lxvw4x 33, %5, %4 \n\t" - "lxvw4x 34, %6, %4 \n\t" - "lxvw4x 35, %7, %4 \n\t" - "lxvw4x 36, %8, %4 \n\t" - "lxvw4x 37, %9, %4 \n\t" - "lxvw4x 38, %10, %4 \n\t" - "lxvw4x 39, %11, %4 \n\t" + "lxvd2x 32, 0, %4 \n\t" + "lxvd2x 33, %5, %4 \n\t" + "lxvd2x 34, %6, %4 \n\t" + "lxvd2x 35, %7, %4 \n\t" + "lxvd2x 36, %8, %4 \n\t" + "lxvd2x 37, %9, %4 \n\t" + "lxvd2x 38, %10, %4 \n\t" + "lxvd2x 39, %11, %4 \n\t" - "lxvw4x 40, 0, %3 \n\t" - "lxvw4x 41, %5, %3 \n\t" - "lxvw4x 42, %6, %3 \n\t" - "lxvw4x 43, %7, %3 \n\t" - "lxvw4x 44, %8, %3 \n\t" - "lxvw4x 45, %9, %3 \n\t" - "lxvw4x 46, %10, %3 \n\t" - "lxvw4x 47, %11, %3 \n\t" + "lxvd2x 40, 0, %3 \n\t" + "lxvd2x 41, %5, %3 \n\t" + "lxvd2x 42, %6, %3 \n\t" + "lxvd2x 43, %7, %3 \n\t" + "lxvd2x 44, %8, %3 \n\t" + "lxvd2x 45, %9, %3 \n\t" + "lxvd2x 46, %10, %3 \n\t" + "lxvd2x 47, %11, %3 \n\t" - "stxvw4x 32, 0, %3 \n\t" - "stxvw4x 33, %5, %3 \n\t" - "stxvw4x 34, %6, %3 \n\t" - "stxvw4x 35, %7, %3 \n\t" - "stxvw4x 36, %8, %3 \n\t" - "stxvw4x 37, %9, %3 \n\t" - "stxvw4x 38, %10, %3 \n\t" - "stxvw4x 39, %11, %3 \n\t" + "stxvd2x 32, 0, %3 \n\t" + "stxvd2x 33, %5, %3 \n\t" + "stxvd2x 34, %6, %3 \n\t" + "stxvd2x 35, %7, %3 \n\t" + "stxvd2x 36, %8, %3 \n\t" + "stxvd2x 37, %9, %3 \n\t" + "stxvd2x 38, %10, %3 \n\t" + "stxvd2x 39, %11, %3 \n\t" "addi %3, %3, 128 \n\t" - "stxvw4x 40, 0, %4 \n\t" - "stxvw4x 41, %5, %4 \n\t" - "stxvw4x 42, %6, %4 \n\t" - "stxvw4x 43, %7, %4 \n\t" - "stxvw4x 44, %8, %4 \n\t" - "stxvw4x 
45, %9, %4 \n\t" - "stxvw4x 46, %10, %4 \n\t" - "stxvw4x 47, %11, %4 \n\t" + "stxvd2x 40, 0, %4 \n\t" + "stxvd2x 41, %5, %4 \n\t" + "stxvd2x 42, %6, %4 \n\t" + "stxvd2x 43, %7, %4 \n\t" + "stxvd2x 44, %8, %4 \n\t" + "stxvd2x 45, %9, %4 \n\t" + "stxvd2x 46, %10, %4 \n\t" + "stxvd2x 47, %11, %4 \n\t" "addi %4, %4, 128 \n\t" diff --git a/relapack/LICENSE b/relapack/LICENSE new file mode 100644 index 000000000..edeb4046e --- /dev/null +++ b/relapack/LICENSE @@ -0,0 +1,22 @@ +The MIT License (MIT) + +Copyright (c) 2016 Elmar Peise + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + diff --git a/relapack/Makefile b/relapack/Makefile new file mode 100644 index 000000000..ddf101bd1 --- /dev/null +++ b/relapack/Makefile @@ -0,0 +1,98 @@ +TOPDIR = .. +include $(TOPDIR)/Makefile.system + + + +SRC = $(wildcard src/*.c) + +SRC1 = \ + src/slauum.c src/clauum.c src/dlauum.c src/zlauum.c \ + src/strtri.c src/dtrtri.c src/ctrtri.c src/ztrtri.c \ + src/spotrf.c src/dpotrf.c src/cpotrf.c src/zpotrf.c \ + src/sgetrf.c src/dgetrf.c src/cgetrf.c src/zgetrf.c + +SRC2 = \ + src/cgbtrf.c src/cpbtrf.c src/dsytrf_rec2.c src/sgbtrf.c src/ssytrf_rook.c src/zhegst.c src/zsytrf_rec2.c \ + src/cgemmt.c src/dgbtrf.c src/dsytrf_rook.c src/sgemmt.c src/ssytrf_rook_rec2.c src/zhetrf.c src/zsytrf_rook.c \ + src/csytrf.c src/dgemmt.c src/dsytrf_rook_rec2.c src/stgsyl.c src/zhetrf_rec2.c src/zsytrf_rook_rec2.c \ + src/chegst.c src/csytrf_rec2.c src/dtgsyl.c src/strsyl.c src/zhetrf_rook.c src/ztgsyl.c \ + src/chetrf.c src/csytrf_rook.c src/dtrsyl.c src/spbtrf.c src/strsyl_rec2.c src/zhetrf_rook_rec2.c src/ztrsyl.c \ + src/chetrf_rec2.c src/csytrf_rook_rec2.c src/dpbtrf.c src/dtrsyl_rec2.c src/ztrsyl_rec2.c \ + src/chetrf_rook.c src/ctgsyl.c src/ssygst.c src/zgbtrf.c src/zpbtrf.c \ + src/chetrf_rook_rec2.c src/ctrsyl.c src/dsygst.c src/f2c.c src/ssytrf.c src/zgemmt.c \ + src/ctrsyl_rec2.c src/dsytrf.c src/lapack_wrappers.c src/ssytrf_rec2.c src/zsytrf.c + +SRCX = \ + src/cgbtrf.c src/cpbtrf.c src/ctrtri.c src/dsytrf_rec2.c src/sgbtrf.c src/ssytrf_rook.c src/zhegst.c src/zsytrf_rec2.c \ + src/cgemmt.c src/cpotrf.c src/dgbtrf.c src/dsytrf_rook.c src/sgemmt.c src/ssytrf_rook_rec2.c src/zhetrf.c src/zsytrf_rook.c \ + src/cgetrf.c src/csytrf.c src/dgemmt.c src/dsytrf_rook_rec2.c src/sgetrf.c src/stgsyl.c src/zhetrf_rec2.c src/zsytrf_rook_rec2.c \ + src/chegst.c src/csytrf_rec2.c src/dgetrf.c src/dtgsyl.c src/slauum.c src/strsyl.c src/zhetrf_rook.c src/ztgsyl.c \ + src/chetrf.c src/csytrf_rook.c src/dlauum.c src/dtrsyl.c src/spbtrf.c src/strsyl_rec2.c 
src/zhetrf_rook_rec2.c src/ztrsyl.c \ + src/chetrf_rec2.c src/csytrf_rook_rec2.c src/dpbtrf.c src/dtrsyl_rec2.c src/spotrf.c src/strtri.c src/zlauum.c src/ztrsyl_rec2.c \ + src/chetrf_rook.c src/ctgsyl.c src/dpotrf.c src/dtrtri.c src/ssygst.c src/zgbtrf.c src/zpbtrf.c src/ztrtri.c \ + src/chetrf_rook_rec2.c src/ctrsyl.c src/dsygst.c src/f2c.c src/ssytrf.c src/zgemmt.c src/zpotrf.c \ + src/clauum.c src/ctrsyl_rec2.c src/dsytrf.c src/lapack_wrappers.c src/ssytrf_rec2.c src/zgetrf.c src/zsytrf.c + +OBJS1 = $(SRC1:%.c=%.$(SUFFIX)) +OBJS2 = $(SRC2:%.c=%.o) +OBJS = $(OBJS1) $(OBJS2) + +TEST_SUITS = \ + slauum dlauum clauum zlauum \ + spotrf dpotrf cpotrf zpotrf \ + spbtrf dpbtrf cpbtrf zpbtrf \ + ssygst dsygst chegst zhegst \ + ssytrf dsytrf csytrf chetrf zsytrf zhetrf \ + sgetrf dgetrf cgetrf zgetrf \ + sgbtrf dgbtrf cgbtrf zgbtrf \ + strsyl dtrsyl ctrsyl ztrsyl \ + stgsyl dtgsyl ctgsyl ztgsyl \ + sgemmt dgemmt cgemmt zgemmt +TESTS = $(TEST_SUITS:%=test/%.pass) # dummies +TEST_EXES = $(TEST_SUITS:%=test/%.x) + +LINK_TEST = -L$(TOPDIR) -lopenblas -lgfortran -lm + +.SECONDARY: $(TEST_EXES) +.PHONY: test + +# ReLAPACK compilation + +libs: $(OBJS) + @echo "Building ReLAPACK library $(LIBNAME)" + $(AR) -r $(TOPDIR)/$(LIBNAME) $(OBJS) + $(RANLIB) $(TOPDIR)/$(LIBNAME) + +%.$(SUFFIX): %.c config.h + $(CC) $(CFLAGS) -c $< -o $@ + +%.o: %.c config.h + $(CC) $(CFLAGS) -c $< -o $@ + + +# ReLAPACK testing + +test: $(TEST_EXES) $(TESTS) + @echo "passed all tests" + +test/%.pass: test/%.x + @echo -n $*: + @./$< > /dev/null && echo " pass" || (echo " FAIL" && ./$<) + +test/s%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h + $(CC) $(CFLAGS) -DDT_PREFIX=s $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST) + +test/d%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h + $(CC) $(CFLAGS) -DDT_PREFIX=d $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST) + +test/c%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h + $(CC) $(CFLAGS) -DDT_PREFIX=c $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST) + +test/z%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h + $(CC) $(CFLAGS) -DDT_PREFIX=z $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST) + + +# cleaning up + +clean: + rm -f $(OBJS) test/util.$(SUFFIX) test/*.x diff --git a/relapack/README.md b/relapack/README.md new file mode 100644 index 000000000..1947c1748 --- /dev/null +++ b/relapack/README.md @@ -0,0 +1,68 @@ +ReLAPACK +======== + +[![Build Status](https://travis-ci.org/HPAC/ReLAPACK.svg?branch=master)](https://travis-ci.org/HPAC/ReLAPACK) + +[Recursive LAPACK Collection](https://github.com/HPAC/ReLAPACK) + +ReLAPACK offers a collection of recursive algorithms for many of LAPACK's +compute kernels. Since it preserves LAPACK's established interfaces, ReLAPACK +integrates effortlessly into existing application codes. ReLAPACK's routines +not only outperform the reference LAPACK but also improve upon the performance +of tuned implementations, such as OpenBLAS and MKL. + + +Coverage +-------- +For a detailed list of covered operations and an overview of operations to which +recursion is not efficiently applicable, see [coverage.md](coverage.md). + + +Installation +------------ +To compile with the default configuration, simply run `make` to create the +library `librelapack.a`. 
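Because ReLAPACK exports its routines under LAPACK's established symbol names, existing callers pick it up at link time without any source changes. A minimal sketch of such a caller (assuming the common Fortran-style binding with a trailing underscore, as configured by `BLAS_UNDERSCORE` in `config.h`; exact link flags and library order depend on your toolchain):

    /* Ordinary LAPACK call; when librelapack.a is linked ahead of the
     * reference LAPACK, ReLAPACK's recursive dpotrf serves it instead. */
    #include <stdio.h>

    extern void dpotrf_(const char *uplo, const int *n, double *a,
                        const int *lda, int *info);

    int main(void) {
        double a[4] = {4, 2, 2, 3};         /* column-major 2x2 SPD matrix */
        int n = 2, lda = 2, info;
        dpotrf_("L", &n, a, &lda, &info);   /* Cholesky: A = L * L^T */
        printf("info=%d  L11=%g L21=%g L22=%g\n", info, a[0], a[1], a[3]);
        return 0;                           /* expect 2, 1, sqrt(2) */
    }
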
+
+### Linking with MKL
+Note that to link with MKL, you currently need to set the flag
+`COMPLEX_FUNCTIONS_AS_ROUTINES` to `1` to avoid problems in `ctrsyl` and
+`ztrsyl`. For further configuration options see [config.md](config.md).
+
+
+### Dependencies
+ReLAPACK builds on top of [BLAS](http://www.netlib.org/blas/) and unblocked
+kernels from [LAPACK](http://www.netlib.org/lapack/). There are many optimized
+and machine-specific implementations of these libraries, which are commonly
+provided by hardware vendors or available as open source (e.g.,
+[OpenBLAS](http://www.openblas.net/)).
+
+
+Testing
+-------
+ReLAPACK's test suite compares its routines numerically with LAPACK's
+counterparts. To set up the tests (located in `test/`) you need to specify
+link flags for BLAS and LAPACK (version 3.5.0 or newer) in `make.inc`; then
+`make test` runs the tests. For details on the performed tests, see
+[test/README.md](test/README.md).
+
+
+Examples
+--------
+Since ReLAPACK replaces parts of LAPACK, any LAPACK example involving the
+covered routines applies directly to ReLAPACK. A few separate examples are
+given in `examples/`. For details, see [examples/README.md](examples/README.md).
+
+
+Citing
+------
+When referencing ReLAPACK, please cite the preprint of the paper
+[Recursive Algorithms for Dense Linear Algebra: The ReLAPACK Collection](http://arxiv.org/abs/1602.06763):
+
+    @article{relapack,
+      author = {Elmar Peise and Paolo Bientinesi},
+      title = {Recursive Algorithms for Dense Linear Algebra: The ReLAPACK Collection},
+      journal = {CoRR},
+      volume = {abs/1602.06763},
+      year = {2016},
+      url = {http://arxiv.org/abs/1602.06763},
+    }
diff --git a/relapack/config.h b/relapack/config.h
new file mode 100644
index 000000000..9113a712d
--- /dev/null
+++ b/relapack/config.h
@@ -0,0 +1,208 @@
+#ifndef RELAPACK_CONFIG_H
+#define RELAPACK_CONFIG_H
+
+// ReLAPACK configuration file.
+// See also config.md
+
+
+////////////////////////////////
+// BLAS/LAPACK object symbols //
+////////////////////////////////
+
+// BLAS routines linked against have a trailing underscore
+#define BLAS_UNDERSCORE 1
+// LAPACK routines linked against have a trailing underscore
+#define LAPACK_UNDERSCORE BLAS_UNDERSCORE
+
+// Complex BLAS/LAPACK routines return their result in the first argument.
+// This option must be enabled when linking to MKL for ctrsyl and ztrsyl to
+// work.
+#define COMPLEX_FUNCTIONS_AS_ROUTINES 0
+#ifdef F_INTERFACE_INTEL
+#undef COMPLEX_FUNCTIONS_AS_ROUTINES
+#define COMPLEX_FUNCTIONS_AS_ROUTINES 1
+#endif
+#define BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES COMPLEX_FUNCTIONS_AS_ROUTINES
+#define LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES COMPLEX_FUNCTIONS_AS_ROUTINES
+
+// The BLAS-like extension xgemmt is provided by an external library.
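+// (Set this to 1 only if the BLAS linked against actually exports xgemmt, as
+// MKL does; with 0, ReLAPACK falls back to its own recursive implementation
+// of these kernels.)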
+#define HAVE_XGEMMT 0 + + +//////////////////////////// +// Use malloc in ReLAPACK // +//////////////////////////// + +#define ALLOW_MALLOC 1 +// allow malloc in xsygst for improved performance +#define XSYGST_ALLOW_MALLOC ALLOW_MALLOC +// allow malloc in xsytrf if the passed work buffer is too small +#define XSYTRF_ALLOW_MALLOC ALLOW_MALLOC + + +//////////////////////////////// +// LAPACK routine replacement // +//////////////////////////////// +// The following macros specify which routines are included in the library under +// LAPACK's symbol names: 1 included, 0 not included + +#define INCLUDE_ALL 1 + +#define INCLUDE_XLAUUM INCLUDE_ALL +#define INCLUDE_SLAUUM INCLUDE_XLAUUM +#define INCLUDE_DLAUUM INCLUDE_XLAUUM +#define INCLUDE_CLAUUM INCLUDE_XLAUUM +#define INCLUDE_ZLAUUM INCLUDE_XLAUUM + +#define INCLUDE_XSYGST INCLUDE_ALL +#define INCLUDE_SSYGST INCLUDE_XSYGST +#define INCLUDE_DSYGST INCLUDE_XSYGST +#define INCLUDE_CHEGST INCLUDE_XSYGST +#define INCLUDE_ZHEGST INCLUDE_XSYGST + +#define INCLUDE_XTRTRI INCLUDE_ALL +#define INCLUDE_STRTRI INCLUDE_XTRTRI +#define INCLUDE_DTRTRI INCLUDE_XTRTRI +#define INCLUDE_CTRTRI INCLUDE_XTRTRI +#define INCLUDE_ZTRTRI INCLUDE_XTRTRI + +#define INCLUDE_XPOTRF INCLUDE_ALL +#define INCLUDE_SPOTRF INCLUDE_XPOTRF +#define INCLUDE_DPOTRF INCLUDE_XPOTRF +#define INCLUDE_CPOTRF INCLUDE_XPOTRF +#define INCLUDE_ZPOTRF INCLUDE_XPOTRF + +#define INCLUDE_XPBTRF INCLUDE_ALL +#define INCLUDE_SPBTRF INCLUDE_XPBTRF +#define INCLUDE_DPBTRF INCLUDE_XPBTRF +#define INCLUDE_CPBTRF INCLUDE_XPBTRF +#define INCLUDE_ZPBTRF INCLUDE_XPBTRF + +#define INCLUDE_XSYTRF INCLUDE_ALL +#define INCLUDE_SSYTRF INCLUDE_XSYTRF +#define INCLUDE_DSYTRF INCLUDE_XSYTRF +#define INCLUDE_CSYTRF INCLUDE_XSYTRF +#define INCLUDE_CHETRF INCLUDE_XSYTRF +#define INCLUDE_ZSYTRF INCLUDE_XSYTRF +#define INCLUDE_ZHETRF INCLUDE_XSYTRF +#define INCLUDE_SSYTRF_ROOK INCLUDE_SSYTRF +#define INCLUDE_DSYTRF_ROOK INCLUDE_DSYTRF +#define INCLUDE_CSYTRF_ROOK INCLUDE_CSYTRF +#define INCLUDE_CHETRF_ROOK INCLUDE_CHETRF +#define INCLUDE_ZSYTRF_ROOK INCLUDE_ZSYTRF +#define INCLUDE_ZHETRF_ROOK INCLUDE_ZHETRF + +#define INCLUDE_XGETRF INCLUDE_ALL +#define INCLUDE_SGETRF INCLUDE_XGETRF +#define INCLUDE_DGETRF INCLUDE_XGETRF +#define INCLUDE_CGETRF INCLUDE_XGETRF +#define INCLUDE_ZGETRF INCLUDE_XGETRF + +#define INCLUDE_XGBTRF INCLUDE_ALL +#define INCLUDE_SGBTRF INCLUDE_XGBTRF +#define INCLUDE_DGBTRF INCLUDE_XGBTRF +#define INCLUDE_CGBTRF INCLUDE_XGBTRF +#define INCLUDE_ZGBTRF INCLUDE_XGBTRF + +#define INCLUDE_XTRSYL INCLUDE_ALL +#define INCLUDE_STRSYL INCLUDE_XTRSYL +#define INCLUDE_DTRSYL INCLUDE_XTRSYL +#define INCLUDE_CTRSYL INCLUDE_XTRSYL +#define INCLUDE_ZTRSYL INCLUDE_XTRSYL + +#define INCLUDE_XTGSYL INCLUDE_ALL +#define INCLUDE_STGSYL INCLUDE_XTGSYL +#define INCLUDE_DTGSYL INCLUDE_XTGSYL +#define INCLUDE_CTGSYL INCLUDE_XTGSYL +#define INCLUDE_ZTGSYL INCLUDE_XTGSYL + +#define INCLUDE_XGEMMT 0 +#define INCLUDE_SGEMMT INCLUDE_XGEMMT +#define INCLUDE_DGEMMT INCLUDE_XGEMMT +#define INCLUDE_CGEMMT INCLUDE_XGEMMT +#define INCLUDE_ZGEMMT INCLUDE_XGEMMT + + +///////////////////// +// crossover sizes // +///////////////////// + +// default crossover size +#define CROSSOVER 24 + +// individual crossover sizes +#define CROSSOVER_XLAUUM CROSSOVER +#define CROSSOVER_SLAUUM CROSSOVER_XLAUUM +#define CROSSOVER_DLAUUM CROSSOVER_XLAUUM +#define CROSSOVER_CLAUUM CROSSOVER_XLAUUM +#define CROSSOVER_ZLAUUM CROSSOVER_XLAUUM + +#define CROSSOVER_XSYGST CROSSOVER +#define CROSSOVER_SSYGST CROSSOVER_XSYGST +#define CROSSOVER_DSYGST 
CROSSOVER_XSYGST
+#define CROSSOVER_CHEGST CROSSOVER_XSYGST
+#define CROSSOVER_ZHEGST CROSSOVER_XSYGST
+
+#define CROSSOVER_XTRTRI CROSSOVER
+#define CROSSOVER_STRTRI CROSSOVER_XTRTRI
+#define CROSSOVER_DTRTRI CROSSOVER_XTRTRI
+#define CROSSOVER_CTRTRI CROSSOVER_XTRTRI
+#define CROSSOVER_ZTRTRI CROSSOVER_XTRTRI
+
+#define CROSSOVER_XPOTRF CROSSOVER
+#define CROSSOVER_SPOTRF CROSSOVER_XPOTRF
+#define CROSSOVER_DPOTRF CROSSOVER_XPOTRF
+#define CROSSOVER_CPOTRF CROSSOVER_XPOTRF
+#define CROSSOVER_ZPOTRF CROSSOVER_XPOTRF
+
+#define CROSSOVER_XPBTRF CROSSOVER
+#define CROSSOVER_SPBTRF CROSSOVER_XPBTRF
+#define CROSSOVER_DPBTRF CROSSOVER_XPBTRF
+#define CROSSOVER_CPBTRF CROSSOVER_XPBTRF
+#define CROSSOVER_ZPBTRF CROSSOVER_XPBTRF
+
+#define CROSSOVER_XSYTRF CROSSOVER
+#define CROSSOVER_SSYTRF CROSSOVER_XSYTRF
+#define CROSSOVER_DSYTRF CROSSOVER_XSYTRF
+#define CROSSOVER_CSYTRF CROSSOVER_XSYTRF
+#define CROSSOVER_CHETRF CROSSOVER_XSYTRF
+#define CROSSOVER_ZSYTRF CROSSOVER_XSYTRF
+#define CROSSOVER_ZHETRF CROSSOVER_XSYTRF
+#define CROSSOVER_SSYTRF_ROOK CROSSOVER_SSYTRF
+#define CROSSOVER_DSYTRF_ROOK CROSSOVER_DSYTRF
+#define CROSSOVER_CSYTRF_ROOK CROSSOVER_CSYTRF
+#define CROSSOVER_CHETRF_ROOK CROSSOVER_CHETRF
+#define CROSSOVER_ZSYTRF_ROOK CROSSOVER_ZSYTRF
+#define CROSSOVER_ZHETRF_ROOK CROSSOVER_ZHETRF
+
+#define CROSSOVER_XGETRF CROSSOVER
+#define CROSSOVER_SGETRF CROSSOVER_XGETRF
+#define CROSSOVER_DGETRF CROSSOVER_XGETRF
+#define CROSSOVER_CGETRF CROSSOVER_XGETRF
+#define CROSSOVER_ZGETRF CROSSOVER_XGETRF
+
+#define CROSSOVER_XGBTRF CROSSOVER
+#define CROSSOVER_SGBTRF CROSSOVER_XGBTRF
+#define CROSSOVER_DGBTRF CROSSOVER_XGBTRF
+#define CROSSOVER_CGBTRF CROSSOVER_XGBTRF
+#define CROSSOVER_ZGBTRF CROSSOVER_XGBTRF
+
+#define CROSSOVER_XTRSYL CROSSOVER
+#define CROSSOVER_STRSYL CROSSOVER_XTRSYL
+#define CROSSOVER_DTRSYL CROSSOVER_XTRSYL
+#define CROSSOVER_CTRSYL CROSSOVER_XTRSYL
+#define CROSSOVER_ZTRSYL CROSSOVER_XTRSYL
+
+#define CROSSOVER_XTGSYL CROSSOVER
+#define CROSSOVER_STGSYL CROSSOVER_XTGSYL
+#define CROSSOVER_DTGSYL CROSSOVER_XTGSYL
+#define CROSSOVER_CTGSYL CROSSOVER_XTGSYL
+#define CROSSOVER_ZTGSYL CROSSOVER_XTGSYL
+
+// sytrf helper routine
+#define CROSSOVER_XGEMMT CROSSOVER_XSYTRF
+#define CROSSOVER_SGEMMT CROSSOVER_XGEMMT
+#define CROSSOVER_DGEMMT CROSSOVER_XGEMMT
+#define CROSSOVER_CGEMMT CROSSOVER_XGEMMT
+#define CROSSOVER_ZGEMMT CROSSOVER_XGEMMT
+
+#endif /* RELAPACK_CONFIG_H */
diff --git a/relapack/config.md b/relapack/config.md
new file mode 100644
index 000000000..ea14be16a
--- /dev/null
+++ b/relapack/config.md
@@ -0,0 +1,87 @@
+ReLAPACK Configuration
+======================
+
+ReLAPACK has two configuration files: `make.inc`, which is included by the
+Makefile, and `config.h`, which is included in the source files.
+
+
+Build and Testing Environment
+-----------------------------
+The build environment (compiler and flags) and the test configuration (linker
+flags for BLAS and LAPACK) are specified in `make.inc`. The test matrix size
+and error bounds are defined in `test/config.h`.
+
+The library `librelapack.a` is compiled by invoking `make`. The tests are
+performed either by running `make test` or by calling `make` in the test folder.
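+
+For illustration only (these macro names are hypothetical and not copied from
+the shipped `test/config.h`), the two test parameters could look like this:
+
+    /* test/config.h (sketch): size and accuracy settings for the test suite */
+    #define TEST_N   100    /* order of the randomly generated test matrices */
+    #define TEST_TOL 1e-5   /* largest acceptable error relative to LAPACK   */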
+
+
+BLAS/LAPACK complex function interfaces
+---------------------------------------
+For BLAS and LAPACK functions that return a complex number, there exist two
+conflicting (FORTRAN compiler dependent) calling conventions: either the result
+is returned as a `struct` of two floating-point numbers, or the result is
+written through an additional first argument that points to such a `struct`.
+By default ReLAPACK uses the former (which is what gfortran uses), but it can
+switch to the latter by setting `COMPLEX_FUNCTIONS_AS_ROUTINES` (or explicitly
+the BLAS and LAPACK specific counterparts) to `1` in `config.h`.
+
+**For MKL, `COMPLEX_FUNCTIONS_AS_ROUTINES` must be set to `1`.**
+
+(Using the wrong convention will break `ctrsyl` and `ztrsyl`, and the test
+cases will segfault or return errors on the order of 1 or larger.)
+
+
+BLAS extension `xgemmt`
+-----------------------
+The LDL decompositions require `xgemmt`, a general matrix-matrix product that
+updates only a triangular part of the result matrix. If the BLAS
+implementation linked against provides such a routine, set the flag
+`HAVE_XGEMMT` to `1` in `config.h`; otherwise, ReLAPACK uses its own recursive
+implementation of these kernels.
+
+`xgemmt` is provided by MKL.
+
+
+Routine Selection
+-----------------
+ReLAPACK's routines are named `RELAPACK_X` (e.g., `RELAPACK_dgetrf`). If the
+corresponding `INCLUDE_X` flag in `config.h` (e.g., `INCLUDE_DGETRF`) is set to
+`1`, ReLAPACK additionally provides a wrapper under the LAPACK name (e.g.,
+`dgetrf_`). By default, wrappers for all routines are enabled.
+
+
+Crossover Size
+--------------
+The crossover size determines below which matrix sizes ReLAPACK's recursive
+algorithms switch to LAPACK's unblocked routines, to avoid BLAS Level 3 calls
+on tiny matrices. The crossover size is set in `config.h` and can be chosen
+either globally for the entire library, by operation, or individually by
+routine.
+
+
+Allowing Temporary Buffers
+--------------------------
+Two of ReLAPACK's routines make use of temporary buffers, which are allocated
+and freed within ReLAPACK. Setting `ALLOW_MALLOC` (or one of the routine
+specific counterparts) to 0 in `config.h` will disable these buffers. The
+affected routines are:
+
+ * `xsytrf`: The LDL decomposition requires a buffer of size n^2 / 2. As in
+   LAPACK, this size can be queried by setting `lWork = -1`; the passed buffer
+   will be used if it is large enough, and only if it is not, a local buffer
+   will be allocated.
+
+   The advantage of this mechanism is that ReLAPACK will seamlessly work even
+   with codes that statically provide too little memory, instead of breaking
+   them.
+
+ * `xsygst`: The reduction of a real symmetric-definite generalized eigenproblem
+   to standard form can use an auxiliary buffer of size n^2 / 2 to avoid
+   redundant computations. It thereby performs about 30% fewer FLOPs than
+   LAPACK.
+
+
+FORTRAN symbol names
+--------------------
+ReLAPACK is commonly linked to BLAS and LAPACK with standard FORTRAN
+interfaces. Since these libraries usually have an underscore appended to their
+symbol names, ReLAPACK has configuration switches in `config.h` to adjust the
+corresponding routine names.
diff --git a/relapack/coverage.md b/relapack/coverage.md new file mode 100644 index 000000000..8406b2078 --- /dev/null +++ b/relapack/coverage.md @@ -0,0 +1,212 @@ +Coverage of ReLAPACK +==================== + +This file lists all LAPACK compute routines that are covered by recursive +algorithms in ReLAPACK, it also lists all of LAPACK's blocked algorithms which +are not (yet) part of ReLAPACK. + +<!-- START doctoc generated TOC please keep comment here to allow auto update --> +<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE --> +**Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)* + +- [List of covered LAPACK routines](#list-of-covered-lapack-routines) + - [`xlauum`](#xlauum) + - [`xsygst`](#xsygst) + - [`xtrtri`](#xtrtri) + - [`xpotrf`](#xpotrf) + - [`xpbtrf`](#xpbtrf) + - [`xsytrf`](#xsytrf) + - [`xgetrf`](#xgetrf) + - [`xgbtrf`](#xgbtrf) + - [`xtrsyl`](#xtrsyl) + - [`xtgsyl`](#xtgsyl) +- [Covered BLAS extension](#covered-blas-extension) + - [`xgemmt`](#xgemmt) +- [Not covered yet](#not-covered-yet) + - [`xpstrf`](#xpstrf) +- [Not covered: extra FLOPs](#not-covered-extra-flops) + - [QR decomposition (and related)](#qr-decomposition-and-related) + - [Symmetric reduction to tridiagonal](#symmetric-reduction-to-tridiagonal) + - [Symmetric reduction to bidiagonal](#symmetric-reduction-to-bidiagonal) + - [Reduction to upper Hessenberg](#reduction-to-upper-hessenberg) + +<!-- END doctoc generated TOC please keep comment here to allow auto update --> + + +List of covered LAPACK routines +------------------------------- + +### `xlauum` +Multiplication of a triangular matrix with its (complex conjugate) transpose, +resulting in a symmetric (Hermitian) matrix. + +Routines: `slauum`, `dlauum`, `clauum`, `zlauum` + +Operations: +* A = L^T L +* A = U U^T + +### `xsygst` +Simultaneous two-sided multiplication of a symmetric matrix with a triangular +matrix and its transpose + +Routines: `ssygst`, `dsygst`, `chegst`, `zhegst` + +Operations: +* A = inv(L) A inv(L^T) +* A = inv(U^T) A inv(U) +* A = L^T A L +* A = U A U^T + +### `xtrtri` +Inversion of a triangular matrix + +Routines: `strtri`, `dtrtri`, `ctrtri`, `ztrtri` + +Operations: +* L = inv(L) +* U = inv(U) + +### `xpotrf` +Cholesky decomposition of a symmetric (Hermitian) positive definite matrix + +Routines: `spotrf`, `dpotrf`, `cpotrf`, `zpotrf` + +Operations: +* L L^T = A +* U^T U = A + +### `xpbtrf` +Cholesky decomposition of a banded symmetric (Hermitian) positive definite matrix + +Routines: `spbtrf`, `dpbtrf`, `cpbtrf`, `zpbtrf` + +Operations: +* L L^T = A +* U^T U = A + +### `xsytrf` +LDL decomposition of a symmetric (or Hermitian) matrix + +Routines: +* `ssytrf`, `dsytrf`, `csytrf`, `chetrf`, `zsytrf`, `zhetrf`, +* `ssytrf_rook`, `dsytrf_rook`, `csytrf_rook`, `chetrf_rook`, `zsytrf_rook`, + `zhetrf_rook` + +Operations: +* L D L^T = A +* U^T D U = A + +### `xgetrf` +LU decomposition of a general matrix with pivoting + +Routines: `sgetrf`, `dgetrf`, `cgetrf`, `zgetrf` + +Operation: P L U = A + +### `xgbtrf` +LU decomposition of a general banded matrix with pivoting + +Routines: `sgbtrf`, `dgbtrf`, `cgbtrf`, `zgbtrf` + +Operation: L U = A + +### `xtrsyl` +Solution of the quasi-triangular Sylvester equation + +Routines: `strsyl`, `dtrsyl`, `ctrsyl`, `ztrsyl` + +Operations: +* A X + B Y = C -> X +* A^T X + B Y = C -> X +* A X + B^T Y = C -> X +* A^T X + B^T Y = C -> X +* A X - B Y = C -> X +* A^T X - B Y = C -> X +* A X - B^T Y = C -> X +* A^T X - B^T Y = C -> X + +### `xtgsyl` +Solution of the 
+
+Routines: `stgsyl`, `dtgsyl`, `ctgsyl`, `ztgsyl`
+
+Operations:
+* A R - L B = C, D R - L E = F -> L, R
+* A^T R + D^T L = C, R B^T - L E^T = -F -> L, R
+
+
+Covered BLAS extension
+----------------------
+
+### `xgemmt`
+Matrix-matrix product updating only a triangular part of the result.
+
+Routines: `sgemmt`, `dgemmt`, `cgemmt`, `zgemmt`
+
+Operations:
+* C = alpha A B + beta C
+* C = alpha A B^T + beta C
+* C = alpha A^T B + beta C
+* C = alpha A^T B^T + beta C
+
+
+Not covered yet
+---------------
+The following operation is implemented as a blocked algorithm in LAPACK but is
+not yet covered in ReLAPACK as a recursive algorithm.
+
+### `xpstrf`
+Cholesky decomposition of a positive semi-definite matrix with complete pivoting.
+
+Routines: `spstrf`, `dpstrf`, `cpstrf`, `zpstrf`
+
+Operations:
+* P L L^T P^T = A
+* P U^T U P^T = A
+
+
+Not covered: extra FLOPs
+------------------------
+The following routines are not covered because recursive variants would require
+considerably more FLOPs or operate on banded matrices.
+
+### QR decomposition (and related)
+Routines:
+* `sgeqrf`, `dgeqrf`, `cgeqrf`, `zgeqrf`
+* `sgerqf`, `dgerqf`, `cgerqf`, `zgerqf`
+* `sgeqlf`, `dgeqlf`, `cgeqlf`, `zgeqlf`
+* `sgelqf`, `dgelqf`, `cgelqf`, `zgelqf`
+* `stzrzf`, `dtzrzf`, `ctzrzf`, `ztzrzf`
+
+Operations: Q R = A, R Q = A, Q L = A, L Q = A, R Z = A
+
+Routines for multiplication with Q:
+* `sormqr`, `dormqr`, `cunmqr`, `zunmqr`
+* `sormrq`, `dormrq`, `cunmrq`, `zunmrq`
+* `sormql`, `dormql`, `cunmql`, `zunmql`
+* `sormlq`, `dormlq`, `cunmlq`, `zunmlq`
+* `sormrz`, `dormrz`, `cunmrz`, `zunmrz`
+
+Operations: C = Q C, C = C Q, C = Q^T C, C = C Q^T
+
+Routines for construction of Q:
+* `sorgqr`, `dorgqr`, `cungqr`, `zungqr`
+* `sorgrq`, `dorgrq`, `cungrq`, `zungrq`
+* `sorgql`, `dorgql`, `cungql`, `zungql`
+* `sorglq`, `dorglq`, `cunglq`, `zunglq`
+
+### Symmetric reduction to tridiagonal
+Routines: `ssytrd`, `dsytrd`, `chetrd`, `zhetrd`
+
+Operation: Q T Q^T = A
+
+### Reduction to bidiagonal
+Routines: `sgebrd`, `dgebrd`, `cgebrd`, `zgebrd`
+
+Operation: Q B P^T = A
+
+### Reduction to upper Hessenberg
+Routines: `sgehrd`, `dgehrd`, `cgehrd`, `zgehrd`
+
+Operation: Q H Q^T = A
diff --git a/relapack/inc/relapack.h b/relapack/inc/relapack.h
new file mode 100644
index 000000000..e421f352b
--- /dev/null
+++ b/relapack/inc/relapack.h
@@ -0,0 +1,67 @@
+#ifndef RELAPACK_H
+#define RELAPACK_H
+
+void RELAPACK_slauum(const char *, const int *, float *, const int *, int *);
+void RELAPACK_dlauum(const char *, const int *, double *, const int *, int *);
+void RELAPACK_clauum(const char *, const int *, float *, const int *, int *);
+void RELAPACK_zlauum(const char *, const int *, double *, const int *, int *);
+
+void RELAPACK_strtri(const char *, const char *, const int *, float *, const int *, int *);
+void RELAPACK_dtrtri(const char *, const char *, const int *, double *, const int *, int *);
+void RELAPACK_ctrtri(const char *, const char *, const int *, float *, const int *, int *);
+void RELAPACK_ztrtri(const char *, const char *, const int *, double *, const int *, int *);
+
+void RELAPACK_spotrf(const char *, const int *, float *, const int *, int *);
+void RELAPACK_dpotrf(const char *, const int *, double *, const int *, int *);
+void RELAPACK_cpotrf(const char *, const int *, float *, const int *, int *);
+void RELAPACK_zpotrf(const char *, const int *, double *, const int *, int *);
+
+void RELAPACK_spbtrf(const char *, const int *, const int *, float *,
const int *, int *); +void RELAPACK_dpbtrf(const char *, const int *, const int *, double *, const int *, int *); +void RELAPACK_cpbtrf(const char *, const int *, const int *, float *, const int *, int *); +void RELAPACK_zpbtrf(const char *, const int *, const int *, double *, const int *, int *); + +void RELAPACK_ssytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_dsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_csytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_chetrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_zsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_zhetrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_ssytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_dsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_csytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_chetrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_zsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_zhetrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *); + +void RELAPACK_sgetrf(const int *, const int *, float *, const int *, int *, int *); +void RELAPACK_dgetrf(const int *, const int *, double *, const int *, int *, int *); +void RELAPACK_cgetrf(const int *, const int *, float *, const int *, int *, int *); +void RELAPACK_zgetrf(const int *, const int *, double *, const int *, int *, int *); + +void RELAPACK_sgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); +void RELAPACK_dgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); +void RELAPACK_cgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); +void RELAPACK_zgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); + +void RELAPACK_ssygst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); +void RELAPACK_dsygst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); +void RELAPACK_chegst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); +void RELAPACK_zhegst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); + +void RELAPACK_strsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); +void RELAPACK_dtrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); +void RELAPACK_ctrsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); +void RELAPACK_ztrsyl(const char *, const char *, const int *, const int *, 
const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); + +void RELAPACK_stgsyl(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *); +void RELAPACK_dtgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *); +void RELAPACK_ctgsyl(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *); +void RELAPACK_ztgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *); + +void RELAPACK_sgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +void RELAPACK_dgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +void RELAPACK_cgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +void RELAPACK_zgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); + +#endif /* RELAPACK_H */ diff --git a/relapack/src/blas.h b/relapack/src/blas.h new file mode 100644 index 000000000..7441c1033 --- /dev/null +++ b/relapack/src/blas.h @@ -0,0 +1,61 @@ +#ifndef BLAS_H +#define BLAS_H + +extern void BLAS(sswap)(const int *, float *, const int *, float *, const int *); +extern void BLAS(dswap)(const int *, double *, const int *, double *, const int *); +extern void BLAS(cswap)(const int *, float *, const int *, float *, const int *); +extern void BLAS(zswap)(const int *, double *, const int *, double *, const int *); + +extern void BLAS(sscal)(const int *, const float *, float *, const int *); +extern void BLAS(dscal)(const int *, const double *, double *, const int *); +extern void BLAS(cscal)(const int *, const float *, float *, const int *); +extern void BLAS(zscal)(const int *, const double *, double *, const int *); + +extern void BLAS(saxpy)(const int *, const float *, const float *, const int *, float *, const int *); +extern void BLAS(daxpy)(const int *, const double *, const double *, const int *, double *, const int *); +extern void BLAS(caxpy)(const int *, const float *, const float *, const int *, float *, const int *); +extern void BLAS(zaxpy)(const int *, const double *, const double *, const int *, double *, const int *); + +extern void BLAS(sgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const 
int*); +extern void BLAS(dgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); +extern void BLAS(cgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); +extern void BLAS(zgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); + +extern void BLAS(sgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); +extern void BLAS(dgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); +extern void BLAS(cgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); +extern void BLAS(zgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); + +extern void BLAS(strsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); +extern void BLAS(dtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); +extern void BLAS(ctrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); +extern void BLAS(ztrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); + +extern void BLAS(strmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); +extern void BLAS(dtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); +extern void BLAS(ctrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); +extern void BLAS(ztrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); + +extern void BLAS(ssyrk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *); +extern void BLAS(dsyrk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *); +extern void BLAS(cherk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *); +extern void BLAS(zherk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *); + +extern void BLAS(ssymm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, 
const int *, const float *, float *, const int *); +extern void BLAS(dsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +extern void BLAS(chemm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +extern void BLAS(zhemm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); + +extern void BLAS(ssyr2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +extern void BLAS(dsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +extern void BLAS(cher2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +extern void BLAS(zher2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); + +#if HAVE_XGEMMT +extern void BLAS(sgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); +extern void BLAS(dgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); +extern void BLAS(cgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); +extern void BLAS(zgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); +#endif + +#endif /* BLAS_H */ diff --git a/relapack/src/cgbtrf.c b/relapack/src/cgbtrf.c new file mode 100644 index 000000000..90b2c8789 --- /dev/null +++ b/relapack/src/cgbtrf.c @@ -0,0 +1,230 @@ +#include "relapack.h" +#include "stdlib.h" + +static void RELAPACK_cgbtrf_rec(const int *, const int *, const int *, + const int *, float *, const int *, int *, float *, const int *, float *, + const int *, int *); + + +/** CGBTRF computes an LU factorization of a complex m-by-n band matrix A using partial pivoting with row interchanges. + * + * This routine is functionally equivalent to LAPACK's cgbtrf. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d0/d3a/cgbtrf_8f.html + * */ +void RELAPACK_cgbtrf( + const int *m, const int *n, const int *kl, const int *ku, + float *Ab, const int *ldAb, int *ipiv, + int *info +) { + + // Check arguments + *info = 0; + if (*m < 0) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*kl < 0) + *info = -3; + else if (*ku < 0) + *info = -4; + else if (*ldAb < 2 * *kl + *ku + 1) + *info = -6; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("CGBTRF", &minfo); + return; + } + + // Constant + const float ZERO[] = { 0., 0. 
}; + + // Result upper band width + const int kv = *ku + *kl; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + float *const A = Ab + 2 * kv; + + // Zero upper diagonal fill-in elements + int i, j; + for (j = 0; j < *n; j++) { + float *const A_j = A + 2 * *ldA * j; + for (i = MAX(0, j - kv); i < j - *ku; i++) + A_j[2 * i] = A_j[2 * i + 1] = 0.; + } + + // Allocate work space + const int n1 = CREC_SPLIT(*n); + const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; + const int nWorkl = (kv > n1) ? n1 : kv; + const int mWorku = (*kl > n1) ? n1 : *kl; + const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; + float *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(float)); + float *Worku = malloc(mWorku * nWorku * 2 * sizeof(float)); + LAPACK(claset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); + LAPACK(claset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku); + + // Recursive kernel + RELAPACK_cgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info); + + // Free work space + free(Workl); + free(Worku); +} + + +/** cgbtrf's recursive compute kernel */ +static void RELAPACK_cgbtrf_rec( + const int *m, const int *n, const int *kl, const int *ku, + float *Ab, const int *ldAb, int *ipiv, + float *Workl, const int *ldWorkl, float *Worku, const int *ldWorku, + int *info +) { + + if (*n <= MAX(CROSSOVER_CGBTRF, 1)) { + // Unblocked + LAPACK(cgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); + return; + } + + // Constants + const float ONE[] = { 1., 0. }; + const float MONE[] = { -1., 0. }; + const int iONE[] = { 1 }; + + // Loop iterators + int i, j; + + // Output upper band width + const int kv = *ku + *kl; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + float *const A = Ab + 2 * kv; + + // Splitting + const int n1 = MIN(CREC_SPLIT(*n), *kl); + const int n2 = *n - n1; + const int m1 = MIN(n1, *m); + const int m2 = *m - m1; + const int mn1 = MIN(m1, n1); + const int mn2 = MIN(m2, n2); + + // Ab_L * + // Ab_BR + float *const Ab_L = Ab; + float *const Ab_BR = Ab + 2 * *ldAb * n1; + + // A_L A_R + float *const A_L = A; + float *const A_R = A + 2 * *ldA * n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + 2 * *ldA * n1; + float *const A_BL = A + 2 * m1; + float *const A_BR = A + 2 * *ldA * n1 + 2 * m1; + + // ipiv_T + // ipiv_B + int *const ipiv_T = ipiv; + int *const ipiv_B = ipiv + n1; + + // Banded splitting + const int n21 = MIN(n2, kv - n1); + const int n22 = MIN(n2 - n21, n1); + const int m21 = MIN(m2, *kl - m1); + const int m22 = MIN(m2 - m21, m1); + + // n1 n21 n22 + // m * A_Rl ARr + float *const A_Rl = A_R; + float *const A_Rr = A_R + 2 * *ldA * n21; + + // n1 n21 n22 + // m1 * A_TRl A_TRr + // m21 A_BLt A_BRtl A_BRtr + // m22 A_BLb A_BRbl A_BRbr + float *const A_TRl = A_TR; + float *const A_TRr = A_TR + 2 * *ldA * n21; + float *const A_BLt = A_BL; + float *const A_BLb = A_BL + 2 * m21; + float *const A_BRtl = A_BR; + float *const A_BRtr = A_BR + 2 * *ldA * n21; + float *const A_BRbl = A_BR + 2 * m21; + float *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * m21; + + // recursion(Ab_L, ipiv_T) + RELAPACK_cgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); + + // Workl = A_BLb + LAPACK(clacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl); + + // partially redo swaps in A_L + for (i = 0; i < mn1; i++) { + const int ip = ipiv_T[i] - 1; + if (ip != i) { + if (ip < *kl) + BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); + else + BLAS(cswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl); + } + } + 
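+    // (The swaps are only "partially" redone: a pivot row with index ip >= *kl
+    //  is not stored within the band of A_L but in the dense buffer Workl, so
+    //  the cswap above targets Workl instead; the loop further below undoes
+    //  these swaps again once the trailing updates are done.)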
+
+    // apply pivots to A_Rl
+    LAPACK(claswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE);
+
+    // apply pivots to A_Rr columnwise
+    for (j = 0; j < n22; j++) {
+        float *const A_Rrj = A_Rr + 2 * *ldA * j;
+        for (i = j; i < mn1; i++) {
+            const int ip = ipiv_T[i] - 1;
+            if (ip != i) {
+                const float tmpr = A_Rrj[2 * i];
+                const float tmpc = A_Rrj[2 * i + 1];
+                A_Rrj[2 * i] = A_Rrj[2 * ip];
+                A_Rrj[2 * i + 1] = A_Rrj[2 * ip + 1];
+                A_Rrj[2 * ip] = tmpr;
+                A_Rrj[2 * ip + 1] = tmpc;
+            }
+        }
+    }
+
+    // A_TRl = A_TL \ A_TRl
+    BLAS(ctrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
+    // Worku = A_TRr
+    LAPACK(clacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku);
+    // Worku = A_TL \ Worku
+    BLAS(ctrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku);
+    // A_TRr = Worku
+    LAPACK(clacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA);
+    // A_BRtl = A_BRtl - A_BLt * A_TRl
+    BLAS(cgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA);
+    // A_BRbl = A_BRbl - Workl * A_TRl
+    BLAS(cgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA);
+    // A_BRtr = A_BRtr - A_BLt * Worku
+    BLAS(cgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA);
+    // A_BRbr = A_BRbr - Workl * Worku
+    BLAS(cgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA);
+
+    // partially undo swaps in A_L
+    for (i = mn1 - 1; i >= 0; i--) {
+        const int ip = ipiv_T[i] - 1;
+        if (ip != i) {
+            if (ip < *kl)
+                BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA);
+            else
+                BLAS(cswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl);
+        }
+    }
+
+    // recursion(Ab_BR, ipiv_B)
+    RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
+    if (*info)
+        *info += n1;
+    // shift pivots
+    for (i = 0; i < mn2; i++)
+        ipiv_B[i] += n1;
+}
diff --git a/relapack/src/cgemmt.c b/relapack/src/cgemmt.c
new file mode 100644
index 000000000..28e2b00b0
--- /dev/null
+++ b/relapack/src/cgemmt.c
@@ -0,0 +1,167 @@
+#include "relapack.h"
+
+static void RELAPACK_cgemmt_rec(const char *, const char *, const char *,
+    const int *, const int *, const float *, const float *, const int *,
+    const float *, const int *, const float *, float *, const int *);
+
+static void RELAPACK_cgemmt_rec2(const char *, const char *, const char *,
+    const int *, const int *, const float *, const float *, const int *,
+    const float *, const int *, const float *, float *, const int *);
+
+
+/** CGEMMT computes a matrix-matrix product with general matrices but updates
+ * only the upper or lower triangular part of the result matrix.
+ *
+ * This routine performs the same operation as the BLAS routine
+ * cgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC)
+ * but only updates the triangular part of C specified by uplo:
+ * If (*uplo == 'L'), only the lower triangular part of C is updated,
+ * otherwise the upper triangular part is updated.
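+ * The opposite triangular part of C is neither read nor updated.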
+ * */ +void RELAPACK_cgemmt( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const float *alpha, const float *A, const int *ldA, + const float *B, const int *ldB, + const float *beta, float *C, const int *ldC +) { + +#if HAVE_XGEMMT + BLAS(cgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); + return; +#else + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + const int notransA = LAPACK(lsame)(transA, "N"); + const int tranA = LAPACK(lsame)(transA, "T"); + const int ctransA = LAPACK(lsame)(transA, "C"); + const int notransB = LAPACK(lsame)(transB, "N"); + const int tranB = LAPACK(lsame)(transB, "T"); + const int ctransB = LAPACK(lsame)(transB, "C"); + int info = 0; + if (!lower && !upper) + info = 1; + else if (!tranA && !ctransA && !notransA) + info = 2; + else if (!tranB && !ctransB && !notransB) + info = 3; + else if (*n < 0) + info = 4; + else if (*k < 0) + info = 5; + else if (*ldA < MAX(1, notransA ? *n : *k)) + info = 8; + else if (*ldB < MAX(1, notransB ? *k : *n)) + info = 10; + else if (*ldC < MAX(1, *n)) + info = 13; + if (info) { + LAPACK(xerbla)("CGEMMT", &info); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + const char cleantransA = notransA ? 'N' : (tranA ? 'T' : 'C'); + const char cleantransB = notransB ? 'N' : (tranB ? 'T' : 'C'); + + // Recursive kernel + RELAPACK_cgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); +#endif +} + + +/** cgemmt's recursive compute kernel */ +static void RELAPACK_cgemmt_rec( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const float *alpha, const float *A, const int *ldA, + const float *B, const int *ldB, + const float *beta, float *C, const int *ldC +) { + + if (*n <= MAX(CROSSOVER_CGEMMT, 1)) { + // Unblocked + RELAPACK_cgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); + return; + } + + // Splitting + const int n1 = CREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_T + // A_B + const float *const A_T = A; + const float *const A_B = A + 2 * ((*transA == 'N') ? n1 : *ldA * n1); + + // B_L B_R + const float *const B_L = B; + const float *const B_R = B + 2 * ((*transB == 'N') ? *ldB * n1 : n1); + + // C_TL C_TR + // C_BL C_BR + float *const C_TL = C; + float *const C_TR = C + 2 * *ldC * n1; + float *const C_BL = C + 2 * n1; + float *const C_BR = C + 2 * *ldC * n1 + 2 * n1; + + // recursion(C_TL) + RELAPACK_cgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC); + + if (*uplo == 'L') + // C_BL = alpha A_B B_L + beta C_BL + BLAS(cgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC); + else + // C_TR = alpha A_T B_R + beta C_TR + BLAS(cgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC); + + // recursion(C_BR) + RELAPACK_cgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC); +} + + +/** cgemmt's unblocked compute kernel */ +static void RELAPACK_cgemmt_rec2( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const float *alpha, const float *A, const int *ldA, + const float *B, const int *ldB, + const float *beta, float *C, const int *ldC +) { + + const int incB = (*transB == 'N') ? 
1 : *ldB;
+    const int incC = 1;
+
+    int i;
+    for (i = 0; i < *n; i++) {
+        // A_0
+        // A_i
+        const float *const A_0 = A;
+        const float *const A_i = A + 2 * ((*transA == 'N') ? i : *ldA * i);
+
+        // * B_i *
+        const float *const B_i = B + 2 * ((*transB == 'N') ? *ldB * i : i);
+
+        // * C_0i *
+        // * C_ii *
+        float *const C_0i = C + 2 * *ldC * i;
+        float *const C_ii = C + 2 * *ldC * i + 2 * i;
+
+        if (*uplo == 'L') {
+            const int nmi = *n - i;
+            if (*transA == 'N')
+                BLAS(cgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
+            else
+                BLAS(cgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
+        } else {
+            const int ip1 = i + 1;
+            if (*transA == 'N')
+                BLAS(cgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
+            else
+                BLAS(cgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
+        }
+    }
+}
diff --git a/relapack/src/cgetrf.c b/relapack/src/cgetrf.c
new file mode 100644
index 000000000..b31a711d0
--- /dev/null
+++ b/relapack/src/cgetrf.c
@@ -0,0 +1,117 @@
+#include "relapack.h"
+
+static void RELAPACK_cgetrf_rec(const int *, const int *, float *,
+    const int *, int *, int *);
+
+
+/** CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
+ *
+ * This routine is functionally equivalent to LAPACK's cgetrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d9/dfb/cgetrf_8f.html
+ */
+void RELAPACK_cgetrf(
+    const int *m, const int *n,
+    float *A, const int *ldA, int *ipiv,
+    int *info
+) {
+
+    // Check arguments
+    *info = 0;
+    if (*m < 0)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*ldA < MAX(1, *m))
+        *info = -4;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("CGETRF", &minfo);
+        return;
+    }
+
+    const int sn = MIN(*m, *n);
+
+    RELAPACK_cgetrf_rec(m, &sn, A, ldA, ipiv, info);
+
+    // Right remainder
+    if (*m < *n) {
+        // Constants
+        const float ONE[] = { 1., 0. };
+        const int iONE[] = { 1 };
+
+        // Splitting
+        const int rn = *n - *m;
+
+        // A_L A_R
+        const float *const A_L = A;
+        float *const A_R = A + 2 * *ldA * *m;
+
+        // A_R = apply(ipiv, A_R)
+        LAPACK(claswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE);
+        // A_R = A_L \ A_R
+        BLAS(ctrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA);
+    }
+}
+
+
+/** cgetrf's recursive compute kernel */
+static void RELAPACK_cgetrf_rec(
+    const int *m, const int *n,
+    float *A, const int *ldA, int *ipiv,
+    int *info
+) {
+
+    if (*n <= MAX(CROSSOVER_CGETRF, 1)) {
+        // Unblocked
+        LAPACK(cgetf2)(m, n, A, ldA, ipiv, info);
+        return;
+    }
+
+    // Constants
+    const float ONE[] = { 1., 0. };
+    const float MONE[] = { -1., 0.
}; + const int iONE[] = { 1 }; + + // Splitting + const int n1 = CREC_SPLIT(*n); + const int n2 = *n - n1; + const int m2 = *m - n1; + + // A_L A_R + float *const A_L = A; + float *const A_R = A + 2 * *ldA * n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + 2 * *ldA * n1; + float *const A_BL = A + 2 * n1; + float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + + // ipiv_T + // ipiv_B + int *const ipiv_T = ipiv; + int *const ipiv_B = ipiv + n1; + + // recursion(A_L, ipiv_T) + RELAPACK_cgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); + // apply pivots to A_R + LAPACK(claswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE); + + // A_TR = A_TL \ A_TR + BLAS(ctrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); + // A_BR = A_BR - A_BL * A_TR + BLAS(cgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA); + + // recursion(A_BR, ipiv_B) + RELAPACK_cgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info); + if (*info) + *info += n1; + // apply pivots to A_BL + LAPACK(claswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); + // shift pivots + int i; + for (i = 0; i < n2; i++) + ipiv_B[i] += n1; +} diff --git a/relapack/src/chegst.c b/relapack/src/chegst.c new file mode 100644 index 000000000..dff875017 --- /dev/null +++ b/relapack/src/chegst.c @@ -0,0 +1,212 @@ +#include "relapack.h" +#if XSYGST_ALLOW_MALLOC +#include "stdlib.h" +#endif + +static void RELAPACK_chegst_rec(const int *, const char *, const int *, + float *, const int *, const float *, const int *, + float *, const int *, int *); + + +/** CHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form. + * + * This routine is functionally equivalent to LAPACK's chegst. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d7/d2a/chegst_8f.html + * */ +void RELAPACK_chegst( + const int *itype, const char *uplo, const int *n, + float *A, const int *ldA, const float *B, const int *ldB, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (*itype < 1 || *itype > 3) + *info = -1; + else if (!lower && !upper) + *info = -2; + else if (*n < 0) + *info = -3; + else if (*ldA < MAX(1, *n)) + *info = -5; + else if (*ldB < MAX(1, *n)) + *info = -7; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("CHEGST", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Allocate work space + float *Work = NULL; + int lWork = 0; +#if XSYGST_ALLOW_MALLOC + const int n1 = CREC_SPLIT(*n); + lWork = n1 * (*n - n1); + Work = malloc(lWork * 2 * sizeof(float)); + if (!Work) + lWork = 0; +#endif + + // recursive kernel + RELAPACK_chegst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info); + + // Free work space +#if XSYGST_ALLOW_MALLOC + if (Work) + free(Work); +#endif +} + + +/** chegst's recursive compute kernel */ +static void RELAPACK_chegst_rec( + const int *itype, const char *uplo, const int *n, + float *A, const int *ldA, const float *B, const int *ldB, + float *Work, const int *lWork, int *info +) { + + if (*n <= MAX(CROSSOVER_CHEGST, 1)) { + // Unblocked + LAPACK(chegs2)(itype, uplo, n, A, ldA, B, ldB, info); + return; + } + + // Constants + const float ZERO[] = { 0., 0. }; + const float ONE[] = { 1., 0. }; + const float MONE[] = { -1., 0. }; + const float HALF[] = { .5, 0. }; + const float MHALF[] = { -.5, 0. 
};
+    const int iONE[] = { 1 };
+
+    // Loop iterator
+    int i;
+
+    // Splitting
+    const int n1 = CREC_SPLIT(*n);
+    const int n2 = *n - n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    float *const A_TL = A;
+    float *const A_TR = A + 2 * *ldA * n1;
+    float *const A_BL = A + 2 * n1;
+    float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
+
+    // B_TL B_TR
+    // B_BL B_BR
+    const float *const B_TL = B;
+    const float *const B_TR = B + 2 * *ldB * n1;
+    const float *const B_BL = B + 2 * n1;
+    const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
+
+    // recursion(A_TL, B_TL)
+    RELAPACK_chegst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info);
+
+    if (*itype == 1)
+        if (*uplo == 'L') {
+            // A_BL = A_BL / B_TL'
+            BLAS(ctrsm)("R", "L", "C", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
+            if (*lWork > n2 * n1) {
+                // T = -1/2 * B_BL * A_TL
+                BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2);
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
+            } else
+                // A_BL = A_BL - 1/2 B_BL * A_TL
+                BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL'
+            BLAS(cher2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA);
+            if (*lWork > n2 * n1)
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
+            else
+                // A_BL = A_BL - 1/2 B_BL * A_TL
+                BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_BL = B_BR \ A_BL
+            BLAS(ctrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
+        } else {
+            // A_TR = B_TL' \ A_TR
+            BLAS(ctrsm)("L", "U", "C", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
+            if (*lWork > n2 * n1) {
+                // T = -1/2 * A_TL * B_TR
+                BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1);
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
+            } else
+                // A_TR = A_TR - 1/2 A_TL * B_TR
+                BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR
+            BLAS(cher2k)("U", "C", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA);
+            if (*lWork > n2 * n1)
+                // A_TR = A_TR + T
+                for (i = 0; i < n2; i++)
+                    BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
+            else
+                // A_TR = A_TR - 1/2 A_TL * B_TR
+                BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
+            // A_TR = A_TR / B_BR
+            BLAS(ctrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
+        }
+    else
+        if (*uplo == 'L') {
+            // A_BL = A_BL * B_TL
+            BLAS(ctrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
+            if (*lWork > n2 * n1) {
+                // T = 1/2 * A_BR * B_BL
+                BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2);
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
+            } else
+                // A_BL = A_BL + 1/2 A_BR * B_BL
+                BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            // A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL
+            BLAS(cher2k)("L", "C", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA);
+            if (*lWork > n2 * n1)
+                // A_BL = A_BL + T
+                for (i = 0; i < n1; i++)
+                    BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
+            else
+                // A_BL = A_BL + 1/2 A_BR * B_BL
+                BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
+            //
A_BL = B_BR * A_BL + BLAS(ctrmm)("L", "L", "C", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA); + } else { + // A_TR = B_TL * A_TR + BLAS(ctrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA); + if (*lWork > n2 * n1) { + // T = 1/2 * B_TR * A_BR + BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1); + // A_TR = A_TR + T + for (i = 0; i < n2; i++) + BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE); + } else + // A_TR = A_TR + 1/2 B_TR A_BR + BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA); + // A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR' + BLAS(cher2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA); + if (*lWork > n2 * n1) + // A_TR = A_TR + T + for (i = 0; i < n2; i++) + BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE); + else + // A_TR = A_TR + 1/2 B_TR * A_BR + BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA); + // A_TR = A_TR * B_BR + BLAS(ctrmm)("R", "U", "C", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA); + } + + // recursion(A_BR, B_BR) + RELAPACK_chegst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info); +} diff --git a/relapack/src/chetrf.c b/relapack/src/chetrf.c new file mode 100644 index 000000000..2928235e4 --- /dev/null +++ b/relapack/src/chetrf.c @@ -0,0 +1,236 @@ +#include "relapack.h" +#if XSYTRF_ALLOW_MALLOC +#include <stdlib.h> +#endif + +static void RELAPACK_chetrf_rec(const char *, const int *, const int *, int *, + float *, const int *, int *, float *, const int *, int *); + + +/** CHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method. + * + * This routine is functionally equivalent to LAPACK's chetrf. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/da/dc1/chetrf_8f.html + * */ +void RELAPACK_chetrf( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + float *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * 2 * sizeof(float)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("CHETRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy argument + int nout; + + // Recursive kernel + RELAPACK_chetrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** chetrf's recursive compute kernel */ +static void RELAPACK_chetrf_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + float *A, const int *ldA, int *ipiv, + float *Work, const int *ldWork, int *info +) { + + // top recursion level? 
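+    // (*n counts the columns factored by this call, while *n_full additionally
+    //  counts the trailing rows of the full matrix that must be kept updated;
+    //  the two coincide only at the outermost call)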
+ const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_CHETRF, 3)) { + // Unblocked + if (top) { + LAPACK(chetf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_chetrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const float ONE[] = { 1., 0. }; + const float MONE[] = { -1., 0. }; + const int iONE[] = { 1 }; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = CREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + float *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_chetrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + float *const A_BL = A + 2 * n1; + float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + float *const A_BL_B = A + 2 * *n; + float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + float *const Work_BL = Work + 2 * n1; + float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; + const int ldWork_BR = top ? n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_chetrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; + + // last row of A_BL + float *const A_BL_b = A_BL + 2 * n2_out; + + // last row of Work_BL + float *const Work_BL_b = Work_BL + 2 * n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + int i; + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = CREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_chetrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + float *const A_TL_T = A + 2 * *ldA * n_rest; + float *const A_TR_T = A + 2 * *ldA * (n_rest + n1); + float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; + float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + float *const Work_L = Work; + float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; + const int ldWork_L = top ? 
n1 : *ldWork;
+
+        // A_TL = A_TL - A_TR Work_TR'
+        RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
+        BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
+
+        // recursion(A_TL)
+        int n1_out;
+        RELAPACK_chetrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
+
+        if (n1_out != n1) {
+            // undo 1 column of updates
+            const int n_restp1 = n_rest + 1;
+
+            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
+            BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
+        }
+        n1 = n1_out;
+
+        *info = info2 || info1;
+        *n_out = n1 + n2;
+    }
+}
diff --git a/relapack/src/chetrf_rec2.c b/relapack/src/chetrf_rec2.c
new file mode 100644
index 000000000..b5c8341b6
--- /dev/null
+++ b/relapack/src/chetrf_rec2.c
@@ -0,0 +1,520 @@
+/* -- translated by f2c (version 20100827).
+   You must link the resulting object file with libf2c:
+   on Microsoft Windows system, link with libf2c.lib;
+   on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+   or, if you install libf2c.a in a standard place, with -lf2c -lm
+   -- in that order, at the end of the command line, as in
+   cc *.o -lf2c -lm
+   Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+   http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static complex c_b1 = {1.f,0.f};
+static int c__1 = 1;
+
+/** CHETRF_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the Bunch-Kaufman diagonal pivoting method.
+ *
+ * This routine is a minor modification of LAPACK's clahef.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
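+ *
+ * Note (inferred from LAPACK's clahef and the recursive callers above,
+ * not stated in the original source): at most *nb columns are factored
+ * per call, and *kb returns the number actually processed, which may be
+ * one less than *nb when the last pivot requires a 2-by-2 block.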
+ * */ +/* Subroutine */ void RELAPACK_chetrf_rec2(char *uplo, int *n, int * + nb, int *kb, complex *a, int *lda, int *ipiv, complex *w, + int *ldw, int *info, ftnlen uplo_len) +{ + /* System generated locals */ + int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + float r__1, r__2, r__3, r__4; + complex q__1, q__2, q__3, q__4; + + /* Builtin functions */ + double sqrt(double), r_imag(complex *); + void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *); + + /* Local variables */ + static int j, k; + static float t, r1; + static complex d11, d21, d22; + static int jj, kk, jp, kp, kw, kkw, imax, jmax; + static float alpha; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * + , complex *, int *, complex *, int *, complex *, complex * + , int *, ftnlen), ccopy_(int *, complex *, int *, + complex *, int *), cswap_(int *, complex *, int *, + complex *, int *); + static int kstep; + static float absakk; + extern /* Subroutine */ int clacgv_(int *, complex *, int *); + extern int icamax_(int *, complex *, int *); + extern /* Subroutine */ int csscal_(int *, float *, complex *, int + *); + static float colmax, rowmax; + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --ipiv; + w_dim1 = *ldw; + w_offset = 1 + w_dim1; + w -= w_offset; + + /* Function Body */ + *info = 0; + alpha = (sqrt(17.f) + 1.f) / 8.f; + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + k = *n; +L10: + kw = *nb + k - *n; + if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { + goto L30; + } + kstep = 1; + i__1 = k - 1; + ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); + i__1 = k + kw * w_dim1; + i__2 = k + k * a_dim1; + r__1 = a[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + if (k < *n) { + i__1 = *n - k; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1], + lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw * + w_dim1 + 1], &c__1, (ftnlen)12); + i__1 = k + kw * w_dim1; + i__2 = k + kw * w_dim1; + r__1 = w[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + } + i__1 = k + kw * w_dim1; + absakk = (r__1 = w[i__1].r, dabs(r__1)); + if (k > 1) { + i__1 = k - 1; + imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); + i__1 = imax + kw * w_dim1; + colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax + + kw * w_dim1]), dabs(r__2)); + } else { + colmax = 0.f; + } + if (dmax(absakk,colmax) == 0.f) { + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = k + k * a_dim1; + i__2 = k + k * a_dim1; + r__1 = a[i__2].r; + a[i__1].r = r__1, a[i__1].i = 0.f; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + i__1 = imax - 1; + ccopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + i__1 = imax + (kw - 1) * w_dim1; + i__2 = imax + imax * a_dim1; + r__1 = a[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + i__1 = k - imax; + ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + i__1 = k - imax; + clacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + i__1 = imax + (kw - 1) * w_dim1; + i__2 = imax + (kw - 1) * w_dim1; + r__1 = w[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + } + i__1 = k - imax; + jmax = imax + 
icamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], + &c__1); + i__1 = jmax + (kw - 1) * w_dim1; + rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ + jmax + (kw - 1) * w_dim1]), dabs(r__2)); + if (imax > 1) { + i__1 = imax - 1; + jmax = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); +/* Computing MAX */ + i__1 = jmax + (kw - 1) * w_dim1; + r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + ( + r__2 = r_imag(&w[jmax + (kw - 1) * w_dim1]), dabs( + r__2)); + rowmax = dmax(r__3,r__4); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else /* if(complicated condition) */ { + i__1 = imax + (kw - 1) * w_dim1; + if ((r__1 = w[i__1].r, dabs(r__1)) >= alpha * rowmax) { + kp = imax; + ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + r__1 = a[i__2].r; + a[i__1].r = r__1, a[i__1].i = 0.f; + i__1 = kk - 1 - kp; + ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + i__1 = kk - 1 - kp; + clacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda); + if (kp > 1) { + i__1 = kp - 1; + ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + + 1], &c__1); + } + if (k < *n) { + i__1 = *n - k; + cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k + + 1) * a_dim1], lda); + } + i__1 = *n - kk + 1; + cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + if (k > 1) { + i__1 = k + k * a_dim1; + r1 = 1.f / a[i__1].r; + i__1 = k - 1; + csscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + i__1 = k - 1; + clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); + } + } else { + if (k > 2) { + i__1 = k - 1 + kw * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + r_cnjg(&q__2, &d21); + c_div(&q__1, &w[k + kw * w_dim1], &q__2); + d11.r = q__1.r, d11.i = q__1.i; + c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); + d22.r = q__1.r, d22.i = q__1.i; + q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r * + d22.i + d11.i * d22.r; + t = 1.f / (q__1.r - 1.f); + q__2.r = t, q__2.i = 0.f; + c_div(&q__1, &q__2, &d21); + d21.r = q__1.r, d21.i = q__1.i; + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + i__2 = j + (k - 1) * a_dim1; + i__3 = j + (kw - 1) * w_dim1; + q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + q__3.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + kw * w_dim1; + q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4] + .i; + q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i = + d21.r * q__2.i + d21.i * q__2.r; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; + i__2 = j + k * a_dim1; + r_cnjg(&q__2, &d21); + i__3 = j + kw * w_dim1; + q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + (kw - 1) * w_dim1; + q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] + .i; + q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i = + q__2.r * q__3.i + q__2.i * q__3.r; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; +/* L20: */ + } + } + i__1 = k - 1 + (k - 1) * a_dim1; + i__2 = k - 1 + (kw - 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1 + k * a_dim1; + i__2 = k - 1 + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + k * a_dim1; + i__2 = k + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1; + clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); 
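+            /* likewise conjugate the second workspace column of the 2-by-2
+               pivot, mirroring the "Conjugate W(k) and W(k-1)" step in
+               LAPACK's clahef */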
+ i__1 = k - 2; + clacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -kp; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + jj = j; + jp = ipiv[j]; + if (jp < 0) { + jp = -jp; + ++j; + } + ++j; + if (jp != jj && j <= *n) { + i__1 = *n - j + 1; + cswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda); + } + if (j <= *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + kstep = 1; + i__1 = k + k * w_dim1; + i__2 = k + k * a_dim1; + r__1 = a[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + if (k < *n) { + i__1 = *n - k; + ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k * + w_dim1], &c__1); + } + i__1 = *n - k + 1; + i__2 = k - 1; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &w[k + + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12); + i__1 = k + k * w_dim1; + i__2 = k + k * w_dim1; + r__1 = w[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + i__1 = k + k * w_dim1; + absakk = (r__1 = w[i__1].r, dabs(r__1)); + if (k < *n) { + i__1 = *n - k; + imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = imax + k * w_dim1; + colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax + + k * w_dim1]), dabs(r__2)); + } else { + colmax = 0.f; + } + if (dmax(absakk,colmax) == 0.f) { + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = k + k * a_dim1; + i__2 = k + k * a_dim1; + r__1 = a[i__2].r; + a[i__1].r = r__1, a[i__1].i = 0.f; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + i__1 = imax - k; + ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = imax - k; + clacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1); + i__1 = imax + (k + 1) * w_dim1; + i__2 = imax + imax * a_dim1; + r__1 = a[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + if (imax < *n) { + i__1 = *n - imax; + ccopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[ + imax + 1 + (k + 1) * w_dim1], &c__1); + } + i__1 = *n - k + 1; + i__2 = k - 1; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], + lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) * + w_dim1], &c__1, (ftnlen)12); + i__1 = imax + (k + 1) * w_dim1; + i__2 = imax + (k + 1) * w_dim1; + r__1 = w[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + i__1 = imax - k; + jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1) + ; + i__1 = jmax + (k + 1) * w_dim1; + rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ + jmax + (k + 1) * w_dim1]), dabs(r__2)); + if (imax < *n) { + i__1 = *n - imax; + jmax = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); +/* Computing MAX */ + i__1 = jmax + (k + 1) * w_dim1; + r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + ( + r__2 = r_imag(&w[jmax + (k + 1) * w_dim1]), dabs( + r__2)); + rowmax = dmax(r__3,r__4); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else /* if(complicated condition) */ { + i__1 = imax + (k + 1) * w_dim1; + if ((r__1 = w[i__1].r, dabs(r__1)) >= alpha * rowmax) { + kp = imax; + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + + k * w_dim1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + } + kk = k + kstep - 1; + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + r__1 = a[i__2].r; + a[i__1].r = r__1, a[i__1].i = 0.f; + i__1 = kp - kk - 1; + ccopy_(&i__1, &a[kk + 1 + kk 
* a_dim1], &c__1, &a[kp + (kk + + 1) * a_dim1], lda); + i__1 = kp - kk - 1; + clacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda); + if (kp < *n) { + i__1 = *n - kp; + ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 + + kp * a_dim1], &c__1); + } + if (k > 1) { + i__1 = k - 1; + cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); + } + cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); + } + if (kstep == 1) { + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + if (k < *n) { + i__1 = k + k * a_dim1; + r1 = 1.f / a[i__1].r; + i__1 = *n - k; + csscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); + i__1 = *n - k; + clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + } + } else { + if (k < *n - 1) { + i__1 = k + 1 + k * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21); + d11.r = q__1.r, d11.i = q__1.i; + r_cnjg(&q__2, &d21); + c_div(&q__1, &w[k + k * w_dim1], &q__2); + d22.r = q__1.r, d22.i = q__1.i; + q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r * + d22.i + d11.i * d22.r; + t = 1.f / (q__1.r - 1.f); + q__2.r = t, q__2.i = 0.f; + c_div(&q__1, &q__2, &d21); + d21.r = q__1.r, d21.i = q__1.i; + i__1 = *n; + for (j = k + 2; j <= i__1; ++j) { + i__2 = j + k * a_dim1; + r_cnjg(&q__2, &d21); + i__3 = j + k * w_dim1; + q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + q__4.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + (k + 1) * w_dim1; + q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] + .i; + q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i = + q__2.r * q__3.i + q__2.i * q__3.r; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; + i__2 = j + (k + 1) * a_dim1; + i__3 = j + (k + 1) * w_dim1; + q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + q__3.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + k * w_dim1; + q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4] + .i; + q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i = + d21.r * q__2.i + d21.i * q__2.r; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; +/* L80: */ + } + } + i__1 = k + k * a_dim1; + i__2 = k + k * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + 1 + k * a_dim1; + i__2 = k + 1 + k * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + 1 + (k + 1) * a_dim1; + i__2 = k + 1 + (k + 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = *n - k; + clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = *n - k - 1; + clacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1); + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -kp; + ipiv[k + 1] = -kp; + } + k += kstep; + goto L70; +L90: + j = k - 1; +L120: + jj = j; + jp = ipiv[j]; + if (jp < 0) { + jp = -jp; + --j; + } + --j; + if (jp != jj && j >= 1) { + cswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda); + } + if (j >= 1) { + goto L120; + } + *kb = k - 1; + } + return; +} diff --git a/relapack/src/chetrf_rook.c b/relapack/src/chetrf_rook.c new file mode 100644 index 000000000..086393d57 --- /dev/null +++ b/relapack/src/chetrf_rook.c @@ -0,0 +1,236 @@ +#include "relapack.h" +#if XSYTRF_ALLOW_MALLOC +#include <stdlib.h> +#endif + +static void RELAPACK_chetrf_rook_rec(const char *, const int *, const int *, int *, + float *, const int *, int *, float *, const int *, int *); + + +/** CHETRF_ROOK computes the factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. + * + * This routine is functionally equivalent to LAPACK's chetrf_rook. 
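+ *
+ * A minimal calling sketch (illustrative only: the order 64, the
+ * malloc'ed buffers, and the matrix contents are assumptions, not part
+ * of the interface; A is an n-by-n Hermitian matrix stored as
+ * interleaved complex floats):
+ *
+ *     int n = 64, ldA = 64, lWork = -1, info;
+ *     float query[2];
+ *     int *ipiv = malloc(n * sizeof(int));
+ *     // workspace query: the required size is returned in query[0]
+ *     RELAPACK_chetrf_rook("L", &n, A, &ldA, ipiv, query, &lWork, &info);
+ *     lWork = (int)query[0];
+ *     float *Work = malloc(2 * (size_t)lWork * sizeof(float));
+ *     // actual factorization
+ *     RELAPACK_chetrf_rook("L", &n, A, &ldA, ipiv, Work, &lWork, &info);
+ *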
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d0/d5e/chetrf__rook_8f.html + * */ +void RELAPACK_chetrf_rook( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + float *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * 2 * sizeof(float)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("CHETRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy argument + int nout; + + // Recursive kernel + RELAPACK_chetrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** chetrf_rook's recursive compute kernel */ +static void RELAPACK_chetrf_rook_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + float *A, const int *ldA, int *ipiv, + float *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_CHETRF, 3)) { + // Unblocked + if (top) { + LAPACK(chetf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_chetrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const float ONE[] = { 1., 0. }; + const float MONE[] = { -1., 0. }; + const int iONE[] = { 1 }; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = CREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + float *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_chetrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + float *const A_BL = A + 2 * n1; + float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + float *const A_BL_B = A + 2 * *n; + float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + float *const Work_BL = Work + 2 * n1; + float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; + const int ldWork_BR = top ? 
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_chetrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; + + // last row of A_BL + float *const A_BL_b = A_BL + 2 * n2_out; + + // last row of Work_BL + float *const Work_BL_b = Work_BL + 2 * n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + int i; + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = CREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_chetrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + float *const A_TL_T = A + 2 * *ldA * n_rest; + float *const A_TR_T = A + 2 * *ldA * (n_rest + n1); + float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; + float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + float *const Work_L = Work; + float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_chetrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/chetrf_rook_rec2.c b/relapack/src/chetrf_rook_rec2.c new file mode 100644 index 000000000..a42cbfd44 --- /dev/null +++ b/relapack/src/chetrf_rook_rec2.c @@ -0,0 +1,661 @@ +/* -- translated by f2c (version 20100827). 
+   You must link the resulting object file with libf2c:
+   on Microsoft Windows system, link with libf2c.lib;
+   on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+   or, if you install libf2c.a in a standard place, with -lf2c -lm
+   -- in that order, at the end of the command line, as in
+   cc *.o -lf2c -lm
+   Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+   http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static complex c_b1 = {1.f,0.f};
+static int c__1 = 1;
+
+/** CHETRF_ROOK_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+ *
+ * This routine is a minor modification of LAPACK's clahef_rook.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
+ * */
+/* Subroutine */ void RELAPACK_chetrf_rook_rec2(char *uplo, int *n,
+    int *nb, int *kb, complex *a, int *lda, int *ipiv,
+    complex *w, int *ldw, int *info, ftnlen uplo_len)
+{
+    /* System generated locals */
+    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
+    float r__1, r__2;
+    complex q__1, q__2, q__3, q__4, q__5;
+
+    /* Builtin functions */
+    double sqrt(double), r_imag(complex *);
+    void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *);
+
+    /* Local variables */
+    static int j, k, p;
+    static float t, r1;
+    static complex d11, d21, d22;
+    static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
+    static logical done;
+    static int imax, jmax;
+    static float alpha;
+    extern logical lsame_(char *, char *, ftnlen, ftnlen);
+    extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *, complex *, int *, complex *, int *, complex *, complex *, int *, ftnlen);
+    static float sfmin;
+    extern /* Subroutine */ int ccopy_(int *, complex *, int *, complex *, int *);
+    static int itemp;
+    extern /* Subroutine */ int cswap_(int *, complex *, int *, complex *, int *);
+    static int kstep;
+    static float stemp, absakk;
+    extern /* Subroutine */ int clacgv_(int *, complex *, int *);
+    extern int icamax_(int *, complex *, int *);
+    extern double slamch_(char *, ftnlen);
+    extern /* Subroutine */ int csscal_(int *, float *, complex *, int *);
+    static float colmax, rowmax;
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --ipiv;
+    w_dim1 = *ldw;
+    w_offset = 1 + w_dim1;
+    w -= w_offset;
+
+    /* Function Body */
+    *info = 0;
+    alpha = (sqrt(17.f) + 1.f) / 8.f;
+    sfmin = slamch_("S", (ftnlen)1);
+    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+        k = *n;
+L10:
+        kw = *nb + k - *n;
+        if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
+            goto L30;
+        }
+        kstep = 1;
+        p = k;
+        if (k > 1) {
+            i__1 = k - 1;
+            ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
+        }
+        i__1 = k + kw * w_dim1;
+        i__2 = k + k * a_dim1;
+        r__1 = a[i__2].r;
+        w[i__1].r = r__1, w[i__1].i = 0.f;
+        if (k < *n) {
+            i__1 = *n - k;
+            q__1.r = -1.f, q__1.i = -0.f;
+            cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1], lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw * w_dim1 + 1], &c__1, (ftnlen)12);
+            i__1 = k + kw * w_dim1;
+            i__2 = k + kw * w_dim1;
+            r__1 = w[i__2].r;
+            w[i__1].r = r__1, w[i__1].i = 0.f;
+        }
+        i__1 = k + kw * w_dim1;
+        absakk = (r__1 = w[i__1].r, dabs(r__1));
+        if (k > 1) {
+            i__1 = k - 1;
+            imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+            i__1 = imax + kw * w_dim1;
+            colmax = (r__1 =
w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax + + kw * w_dim1]), dabs(r__2)); + } else { + colmax = 0.f; + } + if (dmax(absakk,colmax) == 0.f) { + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = k + k * a_dim1; + i__2 = k + kw * w_dim1; + r__1 = w[i__2].r; + a[i__1].r = r__1, a[i__1].i = 0.f; + if (k > 1) { + i__1 = k - 1; + ccopy_(&i__1, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], + &c__1); + } + } else { + if (! (absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L12: + if (imax > 1) { + i__1 = imax - 1; + ccopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + } + i__1 = imax + (kw - 1) * w_dim1; + i__2 = imax + imax * a_dim1; + r__1 = a[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + i__1 = k - imax; + ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + i__1 = k - imax; + clacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + i__1 = imax + (kw - 1) * w_dim1; + i__2 = imax + (kw - 1) * w_dim1; + r__1 = w[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + } + if (imax != k) { + i__1 = k - imax; + jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * + w_dim1], &c__1); + i__1 = jmax + (kw - 1) * w_dim1; + rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& + w[jmax + (kw - 1) * w_dim1]), dabs(r__2)); + } else { + rowmax = 0.f; + } + if (imax > 1) { + i__1 = imax - 1; + itemp = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); + i__1 = itemp + (kw - 1) * w_dim1; + stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& + w[itemp + (kw - 1) * w_dim1]), dabs(r__2)); + if (stemp > rowmax) { + rowmax = stemp; + jmax = itemp; + } + } + i__1 = imax + (kw - 1) * w_dim1; + if (! ((r__1 = w[i__1].r, dabs(r__1)) < alpha * rowmax)) { + kp = imax; + ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } + if (! 
done) { + goto L12; + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kstep == 2 && p != k) { + i__1 = p + p * a_dim1; + i__2 = k + k * a_dim1; + r__1 = a[i__2].r; + a[i__1].r = r__1, a[i__1].i = 0.f; + i__1 = k - 1 - p; + ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * + a_dim1], lda); + i__1 = k - 1 - p; + clacgv_(&i__1, &a[p + (p + 1) * a_dim1], lda); + if (p > 1) { + i__1 = p - 1; + ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + + 1], &c__1); + } + if (k < *n) { + i__1 = *n - k; + cswap_(&i__1, &a[k + (k + 1) * a_dim1], lda, &a[p + (k + + 1) * a_dim1], lda); + } + i__1 = *n - kk + 1; + cswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], + ldw); + } + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + r__1 = a[i__2].r; + a[i__1].r = r__1, a[i__1].i = 0.f; + i__1 = kk - 1 - kp; + ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + i__1 = kk - 1 - kp; + clacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda); + if (kp > 1) { + i__1 = kp - 1; + ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + + 1], &c__1); + } + if (k < *n) { + i__1 = *n - k; + cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k + + 1) * a_dim1], lda); + } + i__1 = *n - kk + 1; + cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + if (k > 1) { + i__1 = k + k * a_dim1; + t = a[i__1].r; + if (dabs(t) >= sfmin) { + r1 = 1.f / t; + i__1 = k - 1; + csscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + } else { + i__1 = k - 1; + for (ii = 1; ii <= i__1; ++ii) { + i__2 = ii + k * a_dim1; + i__3 = ii + k * a_dim1; + q__1.r = a[i__3].r / t, q__1.i = a[i__3].i / t; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; +/* L14: */ + } + } + i__1 = k - 1; + clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); + } + } else { + if (k > 2) { + i__1 = k - 1 + kw * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + r_cnjg(&q__2, &d21); + c_div(&q__1, &w[k + kw * w_dim1], &q__2); + d11.r = q__1.r, d11.i = q__1.i; + c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); + d22.r = q__1.r, d22.i = q__1.i; + q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r * + d22.i + d11.i * d22.r; + t = 1.f / (q__1.r - 1.f); + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + i__2 = j + (k - 1) * a_dim1; + i__3 = j + (kw - 1) * w_dim1; + q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + q__4.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + kw * w_dim1; + q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] + .i; + c_div(&q__2, &q__3, &d21); + q__1.r = t * q__2.r, q__1.i = t * q__2.i; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; + i__2 = j + k * a_dim1; + i__3 = j + kw * w_dim1; + q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + (kw - 1) * w_dim1; + q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] + .i; + r_cnjg(&q__5, &d21); + c_div(&q__2, &q__3, &q__5); + q__1.r = t * q__2.r, q__1.i = t * q__2.i; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; +/* L20: */ + } + } + i__1 = k - 1 + (k - 1) * a_dim1; + i__2 = k - 1 + (kw - 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1 + k * a_dim1; + i__2 = k - 1 + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + k * a_dim1; + i__2 = k + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1; + clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); + i__1 = k - 2; + clacgv_(&i__1, &w[(kw - 1) * 
w_dim1 + 1], &c__1); + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -p; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + kstep = 1; + jp1 = 1; + jj = j; + jp2 = ipiv[j]; + if (jp2 < 0) { + jp2 = -jp2; + ++j; + jp1 = -ipiv[j]; + kstep = 2; + } + ++j; + if (jp2 != jj && j <= *n) { + i__1 = *n - j + 1; + cswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + ++jj; + if (kstep == 2 && jp1 != jj && j <= *n) { + i__1 = *n - j + 1; + cswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + if (j < *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + kstep = 1; + p = k; + i__1 = k + k * w_dim1; + i__2 = k + k * a_dim1; + r__1 = a[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + if (k < *n) { + i__1 = *n - k; + ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k * + w_dim1], &c__1); + } + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, & + w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, ( + ftnlen)12); + i__1 = k + k * w_dim1; + i__2 = k + k * w_dim1; + r__1 = w[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + } + i__1 = k + k * w_dim1; + absakk = (r__1 = w[i__1].r, dabs(r__1)); + if (k < *n) { + i__1 = *n - k; + imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = imax + k * w_dim1; + colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax + + k * w_dim1]), dabs(r__2)); + } else { + colmax = 0.f; + } + if (dmax(absakk,colmax) == 0.f) { + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = k + k * a_dim1; + i__2 = k + k * w_dim1; + r__1 = w[i__2].r; + a[i__1].r = r__1, a[i__1].i = 0.f; + if (k < *n) { + i__1 = *n - k; + ccopy_(&i__1, &w[k + 1 + k * w_dim1], &c__1, &a[k + 1 + k * + a_dim1], &c__1); + } + } else { + if (! (absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L72: + i__1 = imax - k; + ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = imax - k; + clacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1); + i__1 = imax + (k + 1) * w_dim1; + i__2 = imax + imax * a_dim1; + r__1 = a[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + if (imax < *n) { + i__1 = *n - imax; + ccopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[ + imax + 1 + (k + 1) * w_dim1], &c__1); + } + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1] + , lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + + 1) * w_dim1], &c__1, (ftnlen)12); + i__1 = imax + (k + 1) * w_dim1; + i__2 = imax + (k + 1) * w_dim1; + r__1 = w[i__2].r; + w[i__1].r = r__1, w[i__1].i = 0.f; + } + if (imax != k) { + i__1 = imax - k; + jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], & + c__1); + i__1 = jmax + (k + 1) * w_dim1; + rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& + w[jmax + (k + 1) * w_dim1]), dabs(r__2)); + } else { + rowmax = 0.f; + } + if (imax < *n) { + i__1 = *n - imax; + itemp = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); + i__1 = itemp + (k + 1) * w_dim1; + stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& + w[itemp + (k + 1) * w_dim1]), dabs(r__2)); + if (stemp > rowmax) { + rowmax = stemp; + jmax = itemp; + } + } + i__1 = imax + (k + 1) * w_dim1; + if (! 
((r__1 = w[i__1].r, dabs(r__1)) < alpha * rowmax)) { + kp = imax; + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + } + if (! done) { + goto L72; + } + } + kk = k + kstep - 1; + if (kstep == 2 && p != k) { + i__1 = p + p * a_dim1; + i__2 = k + k * a_dim1; + r__1 = a[i__2].r; + a[i__1].r = r__1, a[i__1].i = 0.f; + i__1 = p - k - 1; + ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &a[p + (k + 1) * + a_dim1], lda); + i__1 = p - k - 1; + clacgv_(&i__1, &a[p + (k + 1) * a_dim1], lda); + if (p < *n) { + i__1 = *n - p; + ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + 1 + p + * a_dim1], &c__1); + } + if (k > 1) { + i__1 = k - 1; + cswap_(&i__1, &a[k + a_dim1], lda, &a[p + a_dim1], lda); + } + cswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); + } + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + r__1 = a[i__2].r; + a[i__1].r = r__1, a[i__1].i = 0.f; + i__1 = kp - kk - 1; + ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + + 1) * a_dim1], lda); + i__1 = kp - kk - 1; + clacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda); + if (kp < *n) { + i__1 = *n - kp; + ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 + + kp * a_dim1], &c__1); + } + if (k > 1) { + i__1 = k - 1; + cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); + } + cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); + } + if (kstep == 1) { + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + if (k < *n) { + i__1 = k + k * a_dim1; + t = a[i__1].r; + if (dabs(t) >= sfmin) { + r1 = 1.f / t; + i__1 = *n - k; + csscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); + } else { + i__1 = *n; + for (ii = k + 1; ii <= i__1; ++ii) { + i__2 = ii + k * a_dim1; + i__3 = ii + k * a_dim1; + q__1.r = a[i__3].r / t, q__1.i = a[i__3].i / t; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; +/* L74: */ + } + } + i__1 = *n - k; + clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + } + } else { + if (k < *n - 1) { + i__1 = k + 1 + k * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21); + d11.r = q__1.r, d11.i = q__1.i; + r_cnjg(&q__2, &d21); + c_div(&q__1, &w[k + k * w_dim1], &q__2); + d22.r = q__1.r, d22.i = q__1.i; + q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r * + d22.i + d11.i * d22.r; + t = 1.f / (q__1.r - 1.f); + i__1 = *n; + for (j = k + 2; j <= i__1; ++j) { + i__2 = j + k * a_dim1; + i__3 = j + k * w_dim1; + q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + q__4.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + (k + 1) * w_dim1; + q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] + .i; + r_cnjg(&q__5, &d21); + c_div(&q__2, &q__3, &q__5); + q__1.r = t * q__2.r, q__1.i = t * q__2.i; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; + i__2 = j + (k + 1) * a_dim1; + i__3 = j + (k + 1) * w_dim1; + q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + k * w_dim1; + q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] + .i; + c_div(&q__2, &q__3, &d21); + q__1.r = t * q__2.r, q__1.i = t * q__2.i; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; +/* L80: */ + } + } + i__1 = k + k * a_dim1; + i__2 = k + k * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = 
w[i__2].i; + i__1 = k + 1 + k * a_dim1; + i__2 = k + 1 + k * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + 1 + (k + 1) * a_dim1; + i__2 = k + 1 + (k + 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = *n - k; + clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = *n - k - 1; + clacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1); + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -p; + ipiv[k + 1] = -kp; + } + k += kstep; + goto L70; +L90: + j = k - 1; +L120: + kstep = 1; + jp1 = 1; + jj = j; + jp2 = ipiv[j]; + if (jp2 < 0) { + jp2 = -jp2; + --j; + jp1 = -ipiv[j]; + kstep = 2; + } + --j; + if (jp2 != jj && j >= 1) { + cswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda); + } + --jj; + if (kstep == 2 && jp1 != jj && j >= 1) { + cswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda); + } + if (j > 1) { + goto L120; + } + *kb = k - 1; + } + return; +} diff --git a/relapack/src/clauum.c b/relapack/src/clauum.c new file mode 100644 index 000000000..36d6297cf --- /dev/null +++ b/relapack/src/clauum.c @@ -0,0 +1,87 @@ +#include "relapack.h" + +static void RELAPACK_clauum_rec(const char *, const int *, float *, + const int *, int *); + + +/** CLAUUM computes the product U * U**H or L**H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. + * + * This routine is functionally equivalent to LAPACK's clauum. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d2/d36/clauum_8f.html + * */ +void RELAPACK_clauum( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("CLAUUM", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Recursive kernel + RELAPACK_clauum_rec(&cleanuplo, n, A, ldA, info); +} + + +/** clauum's recursive compute kernel */ +static void RELAPACK_clauum_rec( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + + if (*n <= MAX(CROSSOVER_CLAUUM, 1)) { + // Unblocked + LAPACK(clauu2)(uplo, n, A, ldA, info); + return; + } + + // Constants + const float ONE[] = { 1., 0. 
}; + + // Splitting + const int n1 = CREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + 2 * *ldA * n1; + float *const A_BL = A + 2 * n1; + float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + + // recursion(A_TL) + RELAPACK_clauum_rec(uplo, &n1, A_TL, ldA, info); + + if (*uplo == 'L') { + // A_TL = A_TL + A_BL' * A_BL + BLAS(cherk)("L", "C", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA); + // A_BL = A_BR' * A_BL + BLAS(ctrmm)("L", "L", "C", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); + } else { + // A_TL = A_TL + A_TR * A_TR' + BLAS(cherk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA); + // A_TR = A_TR * A_BR' + BLAS(ctrmm)("R", "U", "C", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); + } + + // recursion(A_BR) + RELAPACK_clauum_rec(uplo, &n2, A_BR, ldA, info); +} diff --git a/relapack/src/cpbtrf.c b/relapack/src/cpbtrf.c new file mode 100644 index 000000000..e0ea7b944 --- /dev/null +++ b/relapack/src/cpbtrf.c @@ -0,0 +1,157 @@ +#include "relapack.h" +#include "stdlib.h" + +static void RELAPACK_cpbtrf_rec(const char *, const int *, const int *, + float *, const int *, float *, const int *, int *); + + +/** CPBTRF computes the Cholesky factorization of a complex Hermitian positive definite band matrix A. + * + * This routine is functionally equivalent to LAPACK's cpbtrf. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/de/d2d/cpbtrf_8f.html + * */ +void RELAPACK_cpbtrf( + const char *uplo, const int *n, const int *kd, + float *Ab, const int *ldAb, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*kd < 0) + *info = -3; + else if (*ldAb < *kd + 1) + *info = -5; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("CPBTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Constant + const float ZERO[] = { 0., 0. }; + + // Allocate work space + const int n1 = CREC_SPLIT(*n); + const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; + const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; + float *Work = malloc(mWork * nWork * 2 * sizeof(float)); + LAPACK(claset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); + + // Recursive kernel + RELAPACK_cpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info); + + // Free work space + free(Work); +} + + +/** cpbtrf's recursive compute kernel */ +static void RELAPACK_cpbtrf_rec( + const char *uplo, const int *n, const int *kd, + float *Ab, const int *ldAb, + float *Work, const int *ldWork, + int *info +){ + + if (*n <= MAX(CROSSOVER_CPBTRF, 1)) { + // Unblocked + LAPACK(cpbtf2)(uplo, n, kd, Ab, ldAb, info); + return; + } + + // Constants + const float ONE[] = { 1., 0. }; + const float MONE[] = { -1., 0. }; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + float *const A = Ab + 2 * ((*uplo == 'L') ? 
0 : *kd); + + // Splitting + const int n1 = MIN(CREC_SPLIT(*n), *kd); + const int n2 = *n - n1; + + // * * + // * Ab_BR + float *const Ab_BR = Ab + 2 * *ldAb * n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + 2 * *ldA * n1; + float *const A_BL = A + 2 * n1; + float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + + // recursion(A_TL) + RELAPACK_cpotrf(uplo, &n1, A_TL, ldA, info); + if (*info) + return; + + // Banded splitting + const int n21 = MIN(n2, *kd - n1); + const int n22 = MIN(n2 - n21, *kd); + + // n1 n21 n22 + // n1 * A_TRl A_TRr + // n21 A_BLt A_BRtl A_BRtr + // n22 A_BLb A_BRbl A_BRbr + float *const A_TRl = A_TR; + float *const A_TRr = A_TR + 2 * *ldA * n21; + float *const A_BLt = A_BL; + float *const A_BLb = A_BL + 2 * n21; + float *const A_BRtl = A_BR; + float *const A_BRtr = A_BR + 2 * *ldA * n21; + float *const A_BRbl = A_BR + 2 * n21; + float *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * n21; + + if (*uplo == 'L') { + // A_BLt = ABLt / A_TL' + BLAS(ctrsm)("R", "L", "C", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA); + // A_BRtl = A_BRtl - A_BLt * A_BLt' + BLAS(cherk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA); + // Work = A_BLb + LAPACK(clacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork); + // Work = Work / A_TL' + BLAS(ctrsm)("R", "L", "C", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork); + // A_BRbl = A_BRbl - Work * A_BLt' + BLAS(cgemm)("N", "C", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA); + // A_BRbr = A_BRbr - Work * Work' + BLAS(cherk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); + // A_BLb = Work + LAPACK(clacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA); + } else { + // A_TRl = A_TL' \ A_TRl + BLAS(ctrsm)("L", "U", "C", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA); + // A_BRtl = A_BRtl - A_TRl' * A_TRl + BLAS(cherk)("U", "C", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA); + // Work = A_TRr + LAPACK(clacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork); + // Work = A_TL' \ Work + BLAS(ctrsm)("L", "U", "C", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork); + // A_BRtr = A_BRtr - A_TRl' * Work + BLAS(cgemm)("C", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA); + // A_BRbr = A_BRbr - Work' * Work + BLAS(cherk)("U", "C", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); + // A_TRr = Work + LAPACK(clacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA); + } + + // recursion(A_BR) + if (*kd > n1) + RELAPACK_cpotrf(uplo, &n2, A_BR, ldA, info); + else + RELAPACK_cpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info); + if (*info) + *info += n1; +} diff --git a/relapack/src/cpotrf.c b/relapack/src/cpotrf.c new file mode 100644 index 000000000..e35caa7fa --- /dev/null +++ b/relapack/src/cpotrf.c @@ -0,0 +1,92 @@ +#include "relapack.h" + +static void RELAPACK_cpotrf_rec(const char *, const int *, float *, + const int *, int *); + + +/** CPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix A. + * + * This routine is functionally equivalent to LAPACK's cpotrf. 
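+ *
+ * A minimal calling sketch (illustrative only; the order 2 and the
+ * matrix entries are assumptions). A is stored column-major as
+ * interleaved complex floats:
+ *
+ *     int n = 2, ldA = 2, info;
+ *     float A[2 * 2 * 2] = { 4.f, 0.f, 1.f, -1.f,   // column 1
+ *                            1.f, 1.f, 3.f,  0.f }; // column 2
+ *     RELAPACK_cpotrf("L", &n, A, &ldA, &info);
+ *     // info == 0: success; info == i > 0: the leading minor of order i
+ *     // is not positive definite
+ *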
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/dd/dce/cpotrf_8f.html + * */ +void RELAPACK_cpotrf( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("CPOTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Recursive kernel + RELAPACK_cpotrf_rec(&cleanuplo, n, A, ldA, info); +} + + +/** cpotrf's recursive compute kernel */ +static void RELAPACK_cpotrf_rec( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +){ + + if (*n <= MAX(CROSSOVER_CPOTRF, 1)) { + // Unblocked + LAPACK(cpotf2)(uplo, n, A, ldA, info); + return; + } + + // Constants + const float ONE[] = { 1., 0. }; + const float MONE[] = { -1., 0. }; + + // Splitting + const int n1 = CREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + 2 * *ldA * n1; + float *const A_BL = A + 2 * n1; + float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + + // recursion(A_TL) + RELAPACK_cpotrf_rec(uplo, &n1, A_TL, ldA, info); + if (*info) + return; + + if (*uplo == 'L') { + // A_BL = A_BL / A_TL' + BLAS(ctrsm)("R", "L", "C", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA); + // A_BR = A_BR - A_BL * A_BL' + BLAS(cherk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA); + } else { + // A_TR = A_TL' \ A_TR + BLAS(ctrsm)("L", "U", "C", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); + // A_BR = A_BR - A_TR' * A_TR + BLAS(cherk)("U", "C", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA); + } + + // recursion(A_BR) + RELAPACK_cpotrf_rec(uplo, &n2, A_BR, ldA, info); + if (*info) + *info += n1; +} diff --git a/relapack/src/csytrf.c b/relapack/src/csytrf.c new file mode 100644 index 000000000..01c161d1a --- /dev/null +++ b/relapack/src/csytrf.c @@ -0,0 +1,238 @@ +#include "relapack.h" +#if XSYTRF_ALLOW_MALLOC +#include <stdlib.h> +#endif + +static void RELAPACK_csytrf_rec(const char *, const int *, const int *, int *, + float *, const int *, int *, float *, const int *, int *); + + +/** CSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method. + * + * This routine is functionally equivalent to LAPACK's csytrf. 
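+ *
+ * Workspace note: like RELAPACK_chetrf above, this routine needs
+ * n * (n/2) complex workspace elements; pass lWork == -1 to query the
+ * required size, or build with XSYTRF_ALLOW_MALLOC enabled so that an
+ * internal buffer is allocated when lWork is too small.
+ *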
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d5/d21/csytrf_8f.html + * */ +void RELAPACK_csytrf( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + float *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * 2 * sizeof(float)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("CSYTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy arguments + int nout; + + // Recursive kernel + RELAPACK_csytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** csytrf's recursive compute kernel */ +static void RELAPACK_csytrf_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + float *A, const int *ldA, int *ipiv, + float *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_CSYTRF, 3)) { + // Unblocked + if (top) { + LAPACK(csytf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_csytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const float ONE[] = { 1., 0. }; + const float MONE[] = { -1., 0. }; + const int iONE[] = { 1 }; + + // Loop iterator + int i; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = CREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + float *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_csytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + float *const A_BL = A + 2 * n1; + float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + float *const A_BL_B = A + 2 * *n; + float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + float *const Work_BL = Work + 2 * n1; + float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; + const int ldWork_BR = top ? 
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_csytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; + + // last row of A_BL + float *const A_BL_b = A_BL + 2 * n2_out; + + // last row of Work_BL + float *const Work_BL_b = Work_BL + 2 * n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = CREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_csytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + float *const A_TL_T = A + 2 * *ldA * n_rest; + float *const A_TR_T = A + 2 * *ldA * (n_rest + n1); + float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; + float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + float *const Work_L = Work; + float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_csytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/csytrf_rec2.c b/relapack/src/csytrf_rec2.c new file mode 100644 index 000000000..9d6bd849d --- /dev/null +++ b/relapack/src/csytrf_rec2.c @@ -0,0 +1,451 @@ +/* -- translated by f2c (version 20100827). 
+   You must link the resulting object file with libf2c:
+   on Microsoft Windows system, link with libf2c.lib;
+   on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+   or, if you install libf2c.a in a standard place, with -lf2c -lm
+   -- in that order, at the end of the command line, as in
+       cc *.o -lf2c -lm
+   Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+       http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static complex c_b1 = {1.f,0.f};
+static int c__1 = 1;
+
+/** CSYTRF_REC2 computes a partial factorization of a complex symmetric matrix using the Bunch-Kaufman diagonal pivoting method.
+ *
+ * This routine is a minor modification of LAPACK's clasyf.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
+ * */
+/* Subroutine */ void RELAPACK_csytrf_rec2(char *uplo, int *n, int *
+        nb, int *kb, complex *a, int *lda, int *ipiv, complex *w,
+        int *ldw, int *info, ftnlen uplo_len)
+{
+    /* System generated locals */
+    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
+    float r__1, r__2, r__3, r__4;
+    complex q__1, q__2, q__3;
+
+    /* Builtin functions */
+    double sqrt(double), r_imag(complex *);
+    void c_div(complex *, complex *, complex *);
+
+    /* Local variables */
+    static int j, k;
+    static complex t, r1, d11, d21, d22;
+    static int jj, kk, jp, kp, kw, kkw, imax, jmax;
+    static float alpha;
+    extern /* Subroutine */ int cscal_(int *, complex *, complex *,
+            int *);
+    extern logical lsame_(char *, char *, ftnlen, ftnlen);
+    extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
+            , complex *, int *, complex *, int *, complex *, complex *
+            , int *, ftnlen), ccopy_(int *, complex *, int *,
+            complex *, int *), cswap_(int *, complex *, int *,
+            complex *, int *);
+    static int kstep;
+    static float absakk;
+    extern int icamax_(int *, complex *, int *);
+    static float colmax, rowmax;
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --ipiv;
+    w_dim1 = *ldw;
+    w_offset = 1 + w_dim1;
+    w -= w_offset;
+
+    /* Function Body */
+    *info = 0;
+    alpha = (sqrt(17.f) + 1.f) / 8.f;
+    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+        k = *n;
+L10:
+        kw = *nb + k - *n;
+        if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
+            goto L30;
+        }
+        ccopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
+        if (k < *n) {
+            i__1 = *n - k;
+            q__1.r = -1.f, q__1.i = -0.f;
+            cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
+                    lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
+                    w_dim1 + 1], &c__1, (ftnlen)12);
+        }
+        kstep = 1;
+        i__1 = k + kw * w_dim1;
+        absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + kw *
+                w_dim1]), dabs(r__2));
+        if (k > 1) {
+            i__1 = k - 1;
+            imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+            i__1 = imax + kw * w_dim1;
+            colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+                    + kw * w_dim1]), dabs(r__2));
+        } else {
+            colmax = 0.f;
+        }
+        if (dmax(absakk,colmax) == 0.f) {
+            if (*info == 0) {
+                *info = k;
+            }
+            kp = k;
+        } else {
+            if (absakk >= alpha * colmax) {
+                kp = k;
+            } else {
+                ccopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
+                        w_dim1 + 1], &c__1);
+                i__1 = k - imax;
+                ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
+                        1 + (kw - 1) * w_dim1], &c__1);
+                if (k < *n) {
+                    i__1 = *n - k;
+                    q__1.r = -1.f, q__1.i = -0.f;
+                    cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
+                            a_dim1 + 1], lda, 
&w[imax + (kw + 1) * w_dim1], + ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + } + i__1 = k - imax; + jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], + &c__1); + i__1 = jmax + (kw - 1) * w_dim1; + rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ + jmax + (kw - 1) * w_dim1]), dabs(r__2)); + if (imax > 1) { + i__1 = imax - 1; + jmax = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); +/* Computing MAX */ + i__1 = jmax + (kw - 1) * w_dim1; + r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + ( + r__2 = r_imag(&w[jmax + (kw - 1) * w_dim1]), dabs( + r__2)); + rowmax = dmax(r__3,r__4); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else /* if(complicated condition) */ { + i__1 = imax + (kw - 1) * w_dim1; + if ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ + imax + (kw - 1) * w_dim1]), dabs(r__2)) >= alpha * + rowmax) { + kp = imax; + ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; + i__1 = kk - 1 - kp; + ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + if (kp > 1) { + i__1 = kp - 1; + ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + + 1], &c__1); + } + if (k < *n) { + i__1 = *n - k; + cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k + + 1) * a_dim1], lda); + } + i__1 = *n - kk + 1; + cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + c_div(&q__1, &c_b1, &a[k + k * a_dim1]); + r1.r = q__1.r, r1.i = q__1.i; + i__1 = k - 1; + cscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + } else { + if (k > 2) { + i__1 = k - 1 + kw * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + c_div(&q__1, &w[k + kw * w_dim1], &d21); + d11.r = q__1.r, d11.i = q__1.i; + c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); + d22.r = q__1.r, d22.i = q__1.i; + q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r * + d22.i + d11.i * d22.r; + q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f; + c_div(&q__1, &c_b1, &q__2); + t.r = q__1.r, t.i = q__1.i; + c_div(&q__1, &t, &d21); + d21.r = q__1.r, d21.i = q__1.i; + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + i__2 = j + (k - 1) * a_dim1; + i__3 = j + (kw - 1) * w_dim1; + q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + q__3.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + kw * w_dim1; + q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4] + .i; + q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i = + d21.r * q__2.i + d21.i * q__2.r; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; + i__2 = j + k * a_dim1; + i__3 = j + kw * w_dim1; + q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + q__3.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + (kw - 1) * w_dim1; + q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4] + .i; + q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i = + d21.r * q__2.i + d21.i * q__2.r; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; +/* L20: */ + } + } + i__1 = k - 1 + (k - 1) * a_dim1; + i__2 = k - 1 + (kw - 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1 + k * a_dim1; + i__2 = k - 1 + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + k * a_dim1; + i__2 = k + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + } 
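+            /* The factored column(s) of the current 1-by-1 or 2-by-2 pivot
+               have now been copied back from the workspace w into a; the
+               pivot indices are recorded in ipiv below (negative entries
+               mark a 2-by-2 block, following the LAPACK convention). */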
+ } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -kp; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + jj = j; + jp = ipiv[j]; + if (jp < 0) { + jp = -jp; + ++j; + } + ++j; + if (jp != jj && j <= *n) { + i__1 = *n - j + 1; + cswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda); + } + if (j < *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + i__1 = *n - k + 1; + ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); + i__1 = *n - k + 1; + i__2 = k - 1; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &w[k + + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12); + kstep = 1; + i__1 = k + k * w_dim1; + absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + k * + w_dim1]), dabs(r__2)); + if (k < *n) { + i__1 = *n - k; + imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = imax + k * w_dim1; + colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax + + k * w_dim1]), dabs(r__2)); + } else { + colmax = 0.f; + } + if (dmax(absakk,colmax) == 0.f) { + if (*info == 0) { + *info = k; + } + kp = k; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + i__1 = imax - k; + ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = *n - imax + 1; + ccopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + + 1) * w_dim1], &c__1); + i__1 = *n - k + 1; + i__2 = k - 1; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], + lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) * + w_dim1], &c__1, (ftnlen)12); + i__1 = imax - k; + jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1) + ; + i__1 = jmax + (k + 1) * w_dim1; + rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ + jmax + (k + 1) * w_dim1]), dabs(r__2)); + if (imax < *n) { + i__1 = *n - imax; + jmax = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); +/* Computing MAX */ + i__1 = jmax + (k + 1) * w_dim1; + r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + ( + r__2 = r_imag(&w[jmax + (k + 1) * w_dim1]), dabs( + r__2)); + rowmax = dmax(r__3,r__4); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else /* if(complicated condition) */ { + i__1 = imax + (k + 1) * w_dim1; + if ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ + imax + (k + 1) * w_dim1]), dabs(r__2)) >= alpha * + rowmax) { + kp = imax; + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + + k * w_dim1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + } + kk = k + kstep - 1; + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; + i__1 = kp - kk - 1; + ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + + 1) * a_dim1], lda); + if (kp < *n) { + i__1 = *n - kp; + ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 + + kp * a_dim1], &c__1); + } + if (k > 1) { + i__1 = k - 1; + cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); + } + cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); + } + if (kstep == 1) { + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + if (k < *n) { + c_div(&q__1, &c_b1, &a[k + k * a_dim1]); + r1.r = q__1.r, r1.i = q__1.i; + i__1 = *n - k; + cscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); + } + } else { + if (k < *n - 1) { + i__1 = 
k + 1 + k * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21); + d11.r = q__1.r, d11.i = q__1.i; + c_div(&q__1, &w[k + k * w_dim1], &d21); + d22.r = q__1.r, d22.i = q__1.i; + q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r * + d22.i + d11.i * d22.r; + q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f; + c_div(&q__1, &c_b1, &q__2); + t.r = q__1.r, t.i = q__1.i; + c_div(&q__1, &t, &d21); + d21.r = q__1.r, d21.i = q__1.i; + i__1 = *n; + for (j = k + 2; j <= i__1; ++j) { + i__2 = j + k * a_dim1; + i__3 = j + k * w_dim1; + q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + q__3.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + (k + 1) * w_dim1; + q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4] + .i; + q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i = + d21.r * q__2.i + d21.i * q__2.r; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; + i__2 = j + (k + 1) * a_dim1; + i__3 = j + (k + 1) * w_dim1; + q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + q__3.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + k * w_dim1; + q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4] + .i; + q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i = + d21.r * q__2.i + d21.i * q__2.r; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; +/* L80: */ + } + } + i__1 = k + k * a_dim1; + i__2 = k + k * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + 1 + k * a_dim1; + i__2 = k + 1 + k * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + 1 + (k + 1) * a_dim1; + i__2 = k + 1 + (k + 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -kp; + ipiv[k + 1] = -kp; + } + k += kstep; + goto L70; +L90: + j = k - 1; +L120: + jj = j; + jp = ipiv[j]; + if (jp < 0) { + jp = -jp; + --j; + } + --j; + if (jp != jj && j >= 1) { + cswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda); + } + if (j > 1) { + goto L120; + } + *kb = k - 1; + } + return; +} diff --git a/relapack/src/csytrf_rook.c b/relapack/src/csytrf_rook.c new file mode 100644 index 000000000..aa7dd0e57 --- /dev/null +++ b/relapack/src/csytrf_rook.c @@ -0,0 +1,236 @@ +#include "relapack.h" +#if XSYTRF_ALLOW_MALLOC +#include <stdlib.h> +#endif + +static void RELAPACK_csytrf_rook_rec(const char *, const int *, const int *, int *, + float *, const int *, int *, float *, const int *, int *); + + +/** CSYTRF_ROOK computes the factorization of a complex symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. + * + * This routine is functionally equivalent to LAPACK's csytrf_rook. 
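+ *
+ * A minimal calling sketch (illustrative only; n, A, ldA and ipiv are
+ * assumed to be set up by the caller, and the workspace is allocated
+ * explicitly rather than relying on XSYTRF_ALLOW_MALLOC):
+ *
+ *     int info, lWork = -1;
+ *     float query[2];
+ *     RELAPACK_csytrf_rook("L", &n, A, &ldA, ipiv, query, &lWork, &info);
+ *     lWork = (int) query[0];
+ *     float *Work = malloc(2 * lWork * sizeof(float));
+ *     RELAPACK_csytrf_rook("L", &n, A, &ldA, ipiv, Work, &lWork, &info);
+ *     free(Work);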
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d8/dc8/csytrf__rook_8f.html + * */ +void RELAPACK_csytrf_rook( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + float *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * 2 * sizeof(float)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("CSYTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy argument + int nout; + + // Recursive kernel + RELAPACK_csytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** csytrf_rook's recursive compute kernel */ +static void RELAPACK_csytrf_rook_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + float *A, const int *ldA, int *ipiv, + float *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_CSYTRF_ROOK, 3)) { + // Unblocked + if (top) { + LAPACK(csytf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_csytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const float ONE[] = { 1., 0. }; + const float MONE[] = { -1., 0. }; + const int iONE[] = { 1 }; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = CREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + float *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_csytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + float *const A_BL = A + 2 * n1; + float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + float *const A_BL_B = A + 2 * *n; + float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + float *const Work_BL = Work + 2 * n1; + float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; + const int ldWork_BR = top ? 
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_csytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; + + // last row of A_BL + float *const A_BL_b = A_BL + 2 * n2_out; + + // last row of Work_BL + float *const Work_BL_b = Work_BL + 2 * n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + int i; + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = CREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_csytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + float *const A_TL_T = A + 2 * *ldA * n_rest; + float *const A_TR_T = A + 2 * *ldA * (n_rest + n1); + float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; + float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + float *const Work_L = Work; + float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_csytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/csytrf_rook_rec2.c b/relapack/src/csytrf_rook_rec2.c new file mode 100644 index 000000000..6638338a6 --- /dev/null +++ b/relapack/src/csytrf_rook_rec2.c @@ -0,0 +1,565 @@ +/* -- translated by f2c (version 20100827). 
+   You must link the resulting object file with libf2c:
+   on Microsoft Windows system, link with libf2c.lib;
+   on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+   or, if you install libf2c.a in a standard place, with -lf2c -lm
+   -- in that order, at the end of the command line, as in
+       cc *.o -lf2c -lm
+   Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+       http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static complex c_b1 = {1.f,0.f};
+static int c__1 = 1;
+
+/** CSYTRF_ROOK_REC2 computes a partial factorization of a complex symmetric matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+ *
+ * This routine is a minor modification of LAPACK's clasyf_rook.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
+ * */
+/* Subroutine */ void RELAPACK_csytrf_rook_rec2(char *uplo, int *n,
+        int *nb, int *kb, complex *a, int *lda, int *ipiv,
+        complex *w, int *ldw, int *info, ftnlen uplo_len)
+{
+    /* System generated locals */
+    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
+    float r__1, r__2;
+    complex q__1, q__2, q__3, q__4;
+
+    /* Builtin functions */
+    double sqrt(double), r_imag(complex *);
+    void c_div(complex *, complex *, complex *);
+
+    /* Local variables */
+    static int j, k, p;
+    static complex t, r1, d11, d12, d21, d22;
+    static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
+    static logical done;
+    static int imax, jmax;
+    static float alpha;
+    extern /* Subroutine */ int cscal_(int *, complex *, complex *,
+            int *);
+    extern logical lsame_(char *, char *, ftnlen, ftnlen);
+    extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
+            , complex *, int *, complex *, int *, complex *, complex *
+            , int *, ftnlen);
+    static float sfmin;
+    extern /* Subroutine */ int ccopy_(int *, complex *, int *,
+            complex *, int *);
+    static int itemp;
+    extern /* Subroutine */ int cswap_(int *, complex *, int *,
+            complex *, int *);
+    static int kstep;
+    static float stemp, absakk;
+    extern int icamax_(int *, complex *, int *);
+    extern double slamch_(char *, ftnlen);
+    static float colmax, rowmax;
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --ipiv;
+    w_dim1 = *ldw;
+    w_offset = 1 + w_dim1;
+    w -= w_offset;
+
+    /* Function Body */
+    *info = 0;
+    alpha = (sqrt(17.f) + 1.f) / 8.f;
+    sfmin = slamch_("S", (ftnlen)1);
+    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+        k = *n;
+L10:
+        kw = *nb + k - *n;
+        if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
+            goto L30;
+        }
+        kstep = 1;
+        p = k;
+        ccopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
+        if (k < *n) {
+            i__1 = *n - k;
+            q__1.r = -1.f, q__1.i = -0.f;
+            cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
+                    lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
+                    w_dim1 + 1], &c__1, (ftnlen)12);
+        }
+        i__1 = k + kw * w_dim1;
+        absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + kw *
+                w_dim1]), dabs(r__2));
+        if (k > 1) {
+            i__1 = k - 1;
+            imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+            i__1 = imax + kw * w_dim1;
+            colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
+                    + kw * w_dim1]), dabs(r__2));
+        } else {
+            colmax = 0.f;
+        }
+        if (dmax(absakk,colmax) == 0.f) {
+            if (*info == 0) {
+                *info = k;
+            }
+            kp = k;
+            ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
+        } else {
+            if (! 
(absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L12: + ccopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + i__1 = k - imax; + ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + } + if (imax != k) { + i__1 = k - imax; + jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * + w_dim1], &c__1); + i__1 = jmax + (kw - 1) * w_dim1; + rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& + w[jmax + (kw - 1) * w_dim1]), dabs(r__2)); + } else { + rowmax = 0.f; + } + if (imax > 1) { + i__1 = imax - 1; + itemp = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); + i__1 = itemp + (kw - 1) * w_dim1; + stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& + w[itemp + (kw - 1) * w_dim1]), dabs(r__2)); + if (stemp > rowmax) { + rowmax = stemp; + jmax = itemp; + } + } + i__1 = imax + (kw - 1) * w_dim1; + if (! ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ + imax + (kw - 1) * w_dim1]), dabs(r__2)) < alpha * + rowmax)) { + kp = imax; + ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } + if (! done) { + goto L12; + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kstep == 2 && p != k) { + i__1 = k - p; + ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * + a_dim1], lda); + ccopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], & + c__1); + i__1 = *n - k + 1; + cswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1], + lda); + i__1 = *n - kk + 1; + cswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], + ldw); + } + if (kp != kk) { + i__1 = kp + k * a_dim1; + i__2 = kk + k * a_dim1; + a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; + i__1 = k - 1 - kp; + ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + ccopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], & + c__1); + i__1 = *n - kk + 1; + cswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1], + lda); + i__1 = *n - kk + 1; + cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + if (k > 1) { + i__1 = k + k * a_dim1; + if ((r__1 = a[i__1].r, dabs(r__1)) + (r__2 = r_imag(&a[k + + k * a_dim1]), dabs(r__2)) >= sfmin) { + c_div(&q__1, &c_b1, &a[k + k * a_dim1]); + r1.r = q__1.r, r1.i = q__1.i; + i__1 = k - 1; + cscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + } else /* if(complicated condition) */ { + i__1 = k + k * a_dim1; + if (a[i__1].r != 0.f || a[i__1].i != 0.f) { + i__1 = k - 1; + for (ii = 1; ii <= i__1; ++ii) { + i__2 = ii + k * a_dim1; + c_div(&q__1, &a[ii + k * a_dim1], &a[k + k * + a_dim1]); + a[i__2].r = q__1.r, a[i__2].i = q__1.i; +/* L14: */ + } + } + } + } + } else { + if (k > 2) { + i__1 = k - 1 + kw * w_dim1; + d12.r = w[i__1].r, d12.i = w[i__1].i; + c_div(&q__1, &w[k + kw * w_dim1], &d12); + d11.r = q__1.r, d11.i = q__1.i; + c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d12); + d22.r = q__1.r, d22.i = q__1.i; + q__3.r = d11.r * d22.r - d11.i * d22.i, 
q__3.i = d11.r * + d22.i + d11.i * d22.r; + q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f; + c_div(&q__1, &c_b1, &q__2); + t.r = q__1.r, t.i = q__1.i; + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + i__2 = j + (k - 1) * a_dim1; + i__3 = j + (kw - 1) * w_dim1; + q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + q__4.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + kw * w_dim1; + q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] + .i; + c_div(&q__2, &q__3, &d12); + q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r * + q__2.i + t.i * q__2.r; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; + i__2 = j + k * a_dim1; + i__3 = j + kw * w_dim1; + q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + (kw - 1) * w_dim1; + q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] + .i; + c_div(&q__2, &q__3, &d12); + q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r * + q__2.i + t.i * q__2.r; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; +/* L20: */ + } + } + i__1 = k - 1 + (k - 1) * a_dim1; + i__2 = k - 1 + (kw - 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1 + k * a_dim1; + i__2 = k - 1 + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + k * a_dim1; + i__2 = k + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -p; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + kstep = 1; + jp1 = 1; + jj = j; + jp2 = ipiv[j]; + if (jp2 < 0) { + jp2 = -jp2; + ++j; + jp1 = -ipiv[j]; + kstep = 2; + } + ++j; + if (jp2 != jj && j <= *n) { + i__1 = *n - j + 1; + cswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + jj = j - 1; + if (jp1 != jj && kstep == 2) { + i__1 = *n - j + 1; + cswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + if (j <= *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + kstep = 1; + p = k; + i__1 = *n - k + 1; + ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, & + w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, ( + ftnlen)12); + } + i__1 = k + k * w_dim1; + absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + k * + w_dim1]), dabs(r__2)); + if (k < *n) { + i__1 = *n - k; + imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = imax + k * w_dim1; + colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax + + k * w_dim1]), dabs(r__2)); + } else { + colmax = 0.f; + } + if (dmax(absakk,colmax) == 0.f) { + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + } else { + if (! 
(absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L72: + i__1 = imax - k; + ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = *n - imax + 1; + ccopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + + 1) * w_dim1], &c__1); + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + q__1.r = -1.f, q__1.i = -0.f; + cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1] + , lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + + 1) * w_dim1], &c__1, (ftnlen)12); + } + if (imax != k) { + i__1 = imax - k; + jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], & + c__1); + i__1 = jmax + (k + 1) * w_dim1; + rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& + w[jmax + (k + 1) * w_dim1]), dabs(r__2)); + } else { + rowmax = 0.f; + } + if (imax < *n) { + i__1 = *n - imax; + itemp = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); + i__1 = itemp + (k + 1) * w_dim1; + stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& + w[itemp + (k + 1) * w_dim1]), dabs(r__2)); + if (stemp > rowmax) { + rowmax = stemp; + jmax = itemp; + } + } + i__1 = imax + (k + 1) * w_dim1; + if (! ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ + imax + (k + 1) * w_dim1]), dabs(r__2)) < alpha * + rowmax)) { + kp = imax; + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + } + if (! done) { + goto L72; + } + } + kk = k + kstep - 1; + if (kstep == 2 && p != k) { + i__1 = p - k; + ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1], + lda); + i__1 = *n - p + 1; + ccopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], & + c__1); + cswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda); + cswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); + } + if (kp != kk) { + i__1 = kp + k * a_dim1; + i__2 = kk + k * a_dim1; + a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; + i__1 = kp - k - 1; + ccopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1) + * a_dim1], lda); + i__1 = *n - kp + 1; + ccopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp * + a_dim1], &c__1); + cswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); + cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); + } + if (kstep == 1) { + i__1 = *n - k + 1; + ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + if (k < *n) { + i__1 = k + k * a_dim1; + if ((r__1 = a[i__1].r, dabs(r__1)) + (r__2 = r_imag(&a[k + + k * a_dim1]), dabs(r__2)) >= sfmin) { + c_div(&q__1, &c_b1, &a[k + k * a_dim1]); + r1.r = q__1.r, r1.i = q__1.i; + i__1 = *n - k; + cscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); + } else /* if(complicated condition) */ { + i__1 = k + k * a_dim1; + if (a[i__1].r != 0.f || a[i__1].i != 0.f) { + i__1 = *n; + for (ii = k + 1; ii <= i__1; ++ii) { + i__2 = ii + k * a_dim1; + c_div(&q__1, &a[ii + k * a_dim1], &a[k + k * + a_dim1]); + a[i__2].r = q__1.r, a[i__2].i = q__1.i; +/* L74: */ + } + } + } + } + } else { + if (k < *n - 1) { + i__1 = k + 1 + k * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21); + d11.r = q__1.r, d11.i = q__1.i; + c_div(&q__1, &w[k + k * w_dim1], &d21); + d22.r = q__1.r, d22.i = q__1.i; + q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r * + d22.i + d11.i * d22.r; 
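+                    /* q__3 now holds d11*d22; the lines below form
+                       t = 1 / (d11*d22 - 1), the reciprocal of the scaled
+                       determinant of the 2-by-2 pivot, which the loop then
+                       applies (together with the division by d21) to obtain
+                       the factored columns k and k+1. */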
+ q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f; + c_div(&q__1, &c_b1, &q__2); + t.r = q__1.r, t.i = q__1.i; + i__1 = *n; + for (j = k + 2; j <= i__1; ++j) { + i__2 = j + k * a_dim1; + i__3 = j + k * w_dim1; + q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + q__4.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + (k + 1) * w_dim1; + q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] + .i; + c_div(&q__2, &q__3, &d21); + q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r * + q__2.i + t.i * q__2.r; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; + i__2 = j + (k + 1) * a_dim1; + i__3 = j + (k + 1) * w_dim1; + q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + k * w_dim1; + q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] + .i; + c_div(&q__2, &q__3, &d21); + q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r * + q__2.i + t.i * q__2.r; + a[i__2].r = q__1.r, a[i__2].i = q__1.i; +/* L80: */ + } + } + i__1 = k + k * a_dim1; + i__2 = k + k * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + 1 + k * a_dim1; + i__2 = k + 1 + k * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + 1 + (k + 1) * a_dim1; + i__2 = k + 1 + (k + 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -p; + ipiv[k + 1] = -kp; + } + k += kstep; + goto L70; +L90: + j = k - 1; +L120: + kstep = 1; + jp1 = 1; + jj = j; + jp2 = ipiv[j]; + if (jp2 < 0) { + jp2 = -jp2; + --j; + jp1 = -ipiv[j]; + kstep = 2; + } + --j; + if (jp2 != jj && j >= 1) { + cswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda); + } + jj = j + 1; + if (jp1 != jj && kstep == 2) { + cswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda); + } + if (j >= 1) { + goto L120; + } + *kb = k - 1; + } + return; +} diff --git a/relapack/src/ctgsyl.c b/relapack/src/ctgsyl.c new file mode 100644 index 000000000..15c738baf --- /dev/null +++ b/relapack/src/ctgsyl.c @@ -0,0 +1,268 @@ +#include "relapack.h" +#include <math.h> + +static void RELAPACK_ctgsyl_rec(const char *, const int *, const int *, + const int *, const float *, const int *, const float *, const int *, + float *, const int *, const float *, const int *, const float *, + const int *, float *, const int *, float *, float *, float *, int *); + + +/** CTGSYL solves the generalized Sylvester equation. + * + * This routine is functionally equivalent to LAPACK's ctgsyl. 
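+ *
+ * A minimal calling sketch for trans = 'N', ijob = 0 (solve only: C, F
+ * and scale are computed and dif is not referenced; all matrices and
+ * leading dimensions are assumed to be set up by the caller):
+ *
+ *     const int ijob = 0;
+ *     int lWork = 1, iWork[1], info;
+ *     float work[2], scale, dif;
+ *     RELAPACK_ctgsyl("N", &ijob, &m, &n, A, &ldA, B, &ldB, C, &ldC,
+ *                     D, &ldD, E, &ldE, F, &ldF, &scale, &dif,
+ *                     work, &lWork, iWork, &info);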
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d7/de7/ctgsyl_8f.html
+ * */
+void RELAPACK_ctgsyl(
+    const char *trans, const int *ijob, const int *m, const int *n,
+    const float *A, const int *ldA, const float *B, const int *ldB,
+    float *C, const int *ldC,
+    const float *D, const int *ldD, const float *E, const int *ldE,
+    float *F, const int *ldF,
+    float *scale, float *dif,
+    float *Work, const int *lWork, int *iWork, int *info
+) {
+
+    // Parse arguments
+    const int notran = LAPACK(lsame)(trans, "N");
+    const int tran = LAPACK(lsame)(trans, "C");
+
+    // Compute work buffer size
+    int lwmin = 1;
+    if (notran && (*ijob == 1 || *ijob == 2))
+        lwmin = MAX(1, 2 * *m * *n);
+    *info = 0;
+
+    // Check arguments
+    if (!tran && !notran)
+        *info = -1;
+    else if (notran && (*ijob < 0 || *ijob > 4))
+        *info = -2;
+    else if (*m <= 0)
+        *info = -3;
+    else if (*n <= 0)
+        *info = -4;
+    else if (*ldA < MAX(1, *m))
+        *info = -6;
+    else if (*ldB < MAX(1, *n))
+        *info = -8;
+    else if (*ldC < MAX(1, *m))
+        *info = -10;
+    else if (*ldD < MAX(1, *m))
+        *info = -12;
+    else if (*ldE < MAX(1, *n))
+        *info = -14;
+    else if (*ldF < MAX(1, *m))
+        *info = -16;
+    else if (*lWork < lwmin && *lWork != -1)
+        *info = -20;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("CTGSYL", &minfo);
+        return;
+    }
+
+    if (*lWork == -1) {
+        // Work size query
+        *Work = lwmin;
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleantrans = notran ? 'N' : 'C';
+
+    // Constant
+    const float ZERO[] = { 0., 0. };
+
+    int isolve = 1;
+    int ifunc = 0;
+    if (notran) {
+        if (*ijob >= 3) {
+            ifunc = *ijob - 2;
+            LAPACK(claset)("F", m, n, ZERO, ZERO, C, ldC);
+            LAPACK(claset)("F", m, n, ZERO, ZERO, F, ldF);
+        } else if (*ijob >= 1)
+            isolve = 2;
+    }
+
+    float scale2;
+    int iround;
+    for (iround = 1; iround <= isolve; iround++) {
+        *scale = 1;
+        float dscale = 0;
+        float dsum = 1;
+        RELAPACK_ctgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, info);
+        if (dscale != 0) {
+            if (*ijob == 1 || *ijob == 3)
+                *dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum));
+            else
+                *dif = sqrt(*m * *n) / (dscale * sqrt(dsum));
+        }
+        if (isolve == 2) {
+            if (iround == 1) {
+                if (notran)
+                    ifunc = *ijob;
+                scale2 = *scale;
+                LAPACK(clacpy)("F", m, n, C, ldC, Work, m);
+                LAPACK(clacpy)("F", m, n, F, ldF, Work + 2 * *m * *n, m);
+                LAPACK(claset)("F", m, n, ZERO, ZERO, C, ldC);
+                LAPACK(claset)("F", m, n, ZERO, ZERO, F, ldF);
+            } else {
+                LAPACK(clacpy)("F", m, n, Work, m, C, ldC);
+                LAPACK(clacpy)("F", m, n, Work + 2 * *m * *n, m, F, ldF);
+                *scale = scale2;
+            }
+        }
+    }
+}
+
+
+/** ctgsyl's recursive compute kernel */
+static void RELAPACK_ctgsyl_rec(
+    const char *trans, const int *ifunc, const int *m, const int *n,
+    const float *A, const int *ldA, const float *B, const int *ldB,
+    float *C, const int *ldC,
+    const float *D, const int *ldD, const float *E, const int *ldE,
+    float *F, const int *ldF,
+    float *scale, float *dsum, float *dscale,
+    int *info
+) {
+
+    if (*m <= MAX(CROSSOVER_CTGSYL, 1) && *n <= MAX(CROSSOVER_CTGSYL, 1)) {
+        // Unblocked
+        LAPACK(ctgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, info);
+        return;
+    }
+
+    // Constants
+    const float ONE[] = { 1., 0. };
+    const float MONE[] = { -1., 0. };
+    const int iONE[] = { 1 };
+
+    // Outputs
+    float scale1[] = { 1., 0. };
+    float scale2[] = { 1., 0. 
}; + int info1[] = { 0 }; + int info2[] = { 0 }; + + if (*m > *n) { + // Splitting + const int m1 = CREC_SPLIT(*m); + const int m2 = *m - m1; + + // A_TL A_TR + // 0 A_BR + const float *const A_TL = A; + const float *const A_TR = A + 2 * *ldA * m1; + const float *const A_BR = A + 2 * *ldA * m1 + 2 * m1; + + // C_T + // C_B + float *const C_T = C; + float *const C_B = C + 2 * m1; + + // D_TL D_TR + // 0 D_BR + const float *const D_TL = D; + const float *const D_TR = D + 2 * *ldD * m1; + const float *const D_BR = D + 2 * *ldD * m1 + 2 * m1; + + // F_T + // F_B + float *const F_T = F; + float *const F_B = F + 2 * m1; + + if (*trans == 'N') { + // recursion(A_BR, B, C_B, D_BR, E, F_B) + RELAPACK_ctgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, info1); + // C_T = C_T - A_TR * C_B + BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC); + // F_T = F_T - D_TR * C_B + BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF); + // recursion(A_TL, B, C_T, D_TL, E, F_T) + RELAPACK_ctgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, info2); + // apply scale + if (scale2[0] != 1) { + LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info); + LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info); + } + } else { + // recursion(A_TL, B, C_T, D_TL, E, F_T) + RELAPACK_ctgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, info1); + // apply scale + if (scale1[0] != 1) + LAPACK(clascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info); + // C_B = C_B - A_TR^H * C_T + BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC); + // C_B = C_B - D_TR^H * F_T + BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC); + // recursion(A_BR, B, C_B, D_BR, E, F_B) + RELAPACK_ctgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, info2); + // apply scale + if (scale2[0] != 1) { + LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info); + LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info); + } + } + } else { + // Splitting + const int n1 = CREC_SPLIT(*n); + const int n2 = *n - n1; + + // B_TL B_TR + // 0 B_BR + const float *const B_TL = B; + const float *const B_TR = B + 2 * *ldB * n1; + const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1; + + // C_L C_R + float *const C_L = C; + float *const C_R = C + 2 * *ldC * n1; + + // E_TL E_TR + // 0 E_BR + const float *const E_TL = E; + const float *const E_TR = E + 2 * *ldE * n1; + const float *const E_BR = E + 2 * *ldE * n1 + 2 * n1; + + // F_L F_R + float *const F_L = F; + float *const F_R = F + 2 * *ldF * n1; + + if (*trans == 'N') { + // recursion(A, B_TL, C_L, D, E_TL, F_L) + RELAPACK_ctgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, info1); + // C_R = C_R + F_L * B_TR + BLAS(cgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC); + // F_R = F_R + F_L * E_TR + BLAS(cgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF); + // recursion(A, B_BR, C_R, D, E_BR, F_R) + RELAPACK_ctgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, info2); + // apply scale + if (scale2[0] != 1) { + LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, 
&n1, C_L, ldC, info); + LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info); + } + } else { + // recursion(A, B_BR, C_R, D, E_BR, F_R) + RELAPACK_ctgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, info1); + // apply scale + if (scale1[0] != 1) + LAPACK(clascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info); + // F_L = F_L + C_R * B_TR + BLAS(cgemm)("N", "C", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF); + // F_L = F_L + F_R * E_TR + BLAS(cgemm)("N", "C", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF); + // recursion(A, B_TL, C_L, D, E_TL, F_L) + RELAPACK_ctgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, info2); + // apply scale + if (scale2[0] != 1) { + LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info); + LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info); + } + } + } + + *scale = scale1[0] * scale2[0]; + *info = info1[0] || info2[0]; +} diff --git a/relapack/src/ctrsyl.c b/relapack/src/ctrsyl.c new file mode 100644 index 000000000..b548d5354 --- /dev/null +++ b/relapack/src/ctrsyl.c @@ -0,0 +1,163 @@ +#include "relapack.h" + +static void RELAPACK_ctrsyl_rec(const char *, const char *, const int *, + const int *, const int *, const float *, const int *, const float *, + const int *, float *, const int *, float *, int *); + + +/** CTRSYL solves the complex Sylvester matrix equation. + * + * This routine is functionally equivalent to LAPACK's ctrsyl. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d8/df4/ctrsyl_8f.html + * */ +void RELAPACK_ctrsyl( + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, float *scale, + int *info +) { + + // Check arguments + const int notransA = LAPACK(lsame)(tranA, "N"); + const int ctransA = LAPACK(lsame)(tranA, "C"); + const int notransB = LAPACK(lsame)(tranB, "N"); + const int ctransB = LAPACK(lsame)(tranB, "C"); + *info = 0; + if (!ctransA && !notransA) + *info = -1; + else if (!ctransB && !notransB) + *info = -2; + else if (*isgn != 1 && *isgn != -1) + *info = -3; + else if (*m < 0) + *info = -4; + else if (*n < 0) + *info = -5; + else if (*ldA < MAX(1, *m)) + *info = -7; + else if (*ldB < MAX(1, *n)) + *info = -9; + else if (*ldC < MAX(1, *m)) + *info = -11; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("CTRSYL", &minfo); + return; + } + + // Clean char * arguments + const char cleantranA = notransA ? 'N' : 'C'; + const char cleantranB = notransB ? 'N' : 'C'; + + // Recursive kernel + RELAPACK_ctrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); +} + + +/** ctrsyl's recursive compute kernel */ +static void RELAPACK_ctrsyl_rec( + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, float *scale, + int *info +) { + + if (*m <= MAX(CROSSOVER_CTRSYL, 1) && *n <= MAX(CROSSOVER_CTRSYL, 1)) { + // Unblocked + RELAPACK_ctrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); + return; + } + + // Constants + const float ONE[] = { 1., 0. }; + const float MONE[] = { -1., 0. }; + const float MSGN[] = { -*isgn, 0. }; + const int iONE[] = { 1 }; + + // Outputs + float scale1[] = { 1., 0. 
};
+    float scale2[] = { 1., 0. };
+    int info1[] = { 0 };
+    int info2[] = { 0 };
+
+    if (*m > *n) {
+        // Splitting
+        const int m1 = CREC_SPLIT(*m);
+        const int m2 = *m - m1;
+
+        // A_TL A_TR
+        // 0    A_BR
+        const float *const A_TL = A;
+        const float *const A_TR = A + 2 * *ldA * m1;
+        const float *const A_BR = A + 2 * *ldA * m1 + 2 * m1;
+
+        // C_T
+        // C_B
+        float *const C_T = C;
+        float *const C_B = C + 2 * m1;
+
+        if (*tranA == 'N') {
+            // recursion(A_BR, B, C_B)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1);
+            // C_T = C_T - A_TR * C_B
+            BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
+            // recursion(A_TL, B, C_T)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
+        } else {
+            // recursion(A_TL, B, C_T)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1);
+            // C_B = C_B - A_TR' * C_T
+            BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
+            // recursion(A_BR, B, C_B)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_B, ldC, info);
+        }
+    } else {
+        // Splitting
+        const int n1 = CREC_SPLIT(*n);
+        const int n2 = *n - n1;
+
+        // B_TL B_TR
+        // 0    B_BR
+        const float *const B_TL = B;
+        const float *const B_TR = B + 2 * *ldB * n1;
+        const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
+
+        // C_L C_R
+        float *const C_L = C;
+        float *const C_R = C + 2 * *ldC * n1;
+
+        if (*tranB == 'N') {
+            // recursion(A, B_TL, C_L)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1);
+            // C_R = C_R -/+ C_L * B_TR
+            BLAS(cgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC);
+            // recursion(A, B_BR, C_R)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
+        } else {
+            // recursion(A, B_BR, C_R)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1);
+            // C_L = C_L -/+ C_R * B_TR'
+            BLAS(cgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC);
+            // recursion(A, B_TL, C_L)
+            RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
+        }
+    }
+
+    *scale = scale1[0] * scale2[0];
+    *info = info1[0] || info2[0];
+}
diff --git a/relapack/src/ctrsyl_rec2.c b/relapack/src/ctrsyl_rec2.c
new file mode 100644
index 000000000..518574868
--- /dev/null
+++ b/relapack/src/ctrsyl_rec2.c
@@ -0,0 +1,392 @@
+/* -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "../config.h" +#include "f2c.h" + +#if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES +complex cdotu_fun(int *n, complex *x, int *incx, complex *y, int *incy) { + extern void cdotu_(complex *, int *, complex *, int *, complex *, int *); + complex result; + cdotu_(&result, n, x, incx, y, incy); + return result; +} +#define cdotu_ cdotu_fun + +complex cdotc_fun(int *n, complex *x, int *incx, complex *y, int *incy) { + extern void cdotc_(complex *, int *, complex *, int *, complex *, int *); + complex result; + cdotc_(&result, n, x, incx, y, incy); + return result; +} +#define cdotc_ cdotc_fun +#endif + +#if LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES +complex cladiv_fun(complex *a, complex *b) { + extern void cladiv_(complex *, complex *, complex *); + complex result; + cladiv_(&result, a, b); + return result; +} +#define cladiv_ cladiv_fun +#endif + +/* Table of constant values */ + +static int c__1 = 1; + +/** RELAPACK_CTRSYL_REC2 solves the complex Sylvester matrix equation (unblocked algorithm) + * + * This routine is an exact copy of LAPACK's ctrsyl. + * It serves as an unblocked kernel in the recursive algorithms. + * */ +/* Subroutine */ void RELAPACK_ctrsyl_rec2(char *trana, char *tranb, int + *isgn, int *m, int *n, complex *a, int *lda, complex *b, + int *ldb, complex *c__, int *ldc, float *scale, int *info, + ftnlen trana_len, ftnlen tranb_len) +{ + /* System generated locals */ + int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4; + float r__1, r__2; + complex q__1, q__2, q__3, q__4; + + /* Builtin functions */ + float r_imag(complex *); + void r_cnjg(complex *, complex *); + + /* Local variables */ + static int j, k, l; + static complex a11; + static float db; + static complex x11; + static float da11; + static complex vec; + static float dum[1], eps, sgn, smin; + static complex suml, sumr; + /* Complex */ complex cdotc_(int *, complex *, int + *, complex *, int *); + extern int lsame_(char *, char *, ftnlen, ftnlen); + /* Complex */ complex cdotu_(int *, complex *, int + *, complex *, int *); + extern /* Subroutine */ int slabad_(float *, float *); + extern float clange_(char *, int *, int *, complex *, + int *, float *, ftnlen); + /* Complex */ complex cladiv_(complex *, complex *); + static float scaloc; + extern float slamch_(char *, ftnlen); + extern /* Subroutine */ int csscal_(int *, float *, complex *, int + *), xerbla_(char *, int *, ftnlen); + static float bignum; + static int notrna, notrnb; + static float smlnum; + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1; + c__ -= c_offset; + + /* Function Body */ + notrna = lsame_(trana, "N", (ftnlen)1, (ftnlen)1); + notrnb = lsame_(tranb, "N", (ftnlen)1, (ftnlen)1); + *info = 0; + if (! notrna && ! lsame_(trana, "C", (ftnlen)1, (ftnlen)1)) { + *info = -1; + } else if (! notrnb && ! 
lsame_(tranb, "C", (ftnlen)1, (ftnlen)1)) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < max(1,*m)) { + *info = -7; + } else if (*ldb < max(1,*n)) { + *info = -9; + } else if (*ldc < max(1,*m)) { + *info = -11; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("CTRSY2", &i__1, (ftnlen)6); + return; + } + *scale = 1.f; + if (*m == 0 || *n == 0) { + return; + } + eps = slamch_("P", (ftnlen)1); + smlnum = slamch_("S", (ftnlen)1); + bignum = 1.f / smlnum; + slabad_(&smlnum, &bignum); + smlnum = smlnum * (float) (*m * *n) / eps; + bignum = 1.f / smlnum; +/* Computing MAX */ + r__1 = smlnum, r__2 = eps * clange_("M", m, m, &a[a_offset], lda, dum, ( + ftnlen)1), r__1 = max(r__1,r__2), r__2 = eps * clange_("M", n, n, + &b[b_offset], ldb, dum, (ftnlen)1); + smin = dmax(r__1,r__2); + sgn = (float) (*isgn); + if (notrna && notrnb) { + i__1 = *n; + for (l = 1; l <= i__1; ++l) { + for (k = *m; k >= 1; --k) { + i__2 = *m - k; +/* Computing MIN */ + i__3 = k + 1; +/* Computing MIN */ + i__4 = k + 1; + q__1 = cdotu_(&i__2, &a[k + min(i__3,*m) * a_dim1], lda, &c__[ + min(i__4,*m) + l * c_dim1], &c__1); + suml.r = q__1.r, suml.i = q__1.i; + i__2 = l - 1; + q__1 = cdotu_(&i__2, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1] + , &c__1); + sumr.r = q__1.r, sumr.i = q__1.i; + i__2 = k + l * c_dim1; + q__3.r = sgn * sumr.r, q__3.i = sgn * sumr.i; + q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i; + q__1.r = c__[i__2].r - q__2.r, q__1.i = c__[i__2].i - q__2.i; + vec.r = q__1.r, vec.i = q__1.i; + scaloc = 1.f; + i__2 = k + k * a_dim1; + i__3 = l + l * b_dim1; + q__2.r = sgn * b[i__3].r, q__2.i = sgn * b[i__3].i; + q__1.r = a[i__2].r + q__2.r, q__1.i = a[i__2].i + q__2.i; + a11.r = q__1.r, a11.i = q__1.i; + da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11), + dabs(r__2)); + if (da11 <= smin) { + a11.r = smin, a11.i = 0.f; + da11 = smin; + *info = 1; + } + db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs( + r__2)); + if (da11 < 1.f && db > 1.f) { + if (db > bignum * da11) { + scaloc = 1.f / db; + } + } + q__3.r = scaloc, q__3.i = 0.f; + q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r * + q__3.i + vec.i * q__3.r; + q__1 = cladiv_(&q__2, &a11); + x11.r = q__1.r, x11.i = q__1.i; + if (scaloc != 1.f) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L10: */ + } + *scale *= scaloc; + } + i__2 = k + l * c_dim1; + c__[i__2].r = x11.r, c__[i__2].i = x11.i; +/* L20: */ + } +/* L30: */ + } + } else if (! 
notrna && notrnb) { + i__1 = *n; + for (l = 1; l <= i__1; ++l) { + i__2 = *m; + for (k = 1; k <= i__2; ++k) { + i__3 = k - 1; + q__1 = cdotc_(&i__3, &a[k * a_dim1 + 1], &c__1, &c__[l * + c_dim1 + 1], &c__1); + suml.r = q__1.r, suml.i = q__1.i; + i__3 = l - 1; + q__1 = cdotu_(&i__3, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1] + , &c__1); + sumr.r = q__1.r, sumr.i = q__1.i; + i__3 = k + l * c_dim1; + q__3.r = sgn * sumr.r, q__3.i = sgn * sumr.i; + q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i; + q__1.r = c__[i__3].r - q__2.r, q__1.i = c__[i__3].i - q__2.i; + vec.r = q__1.r, vec.i = q__1.i; + scaloc = 1.f; + r_cnjg(&q__2, &a[k + k * a_dim1]); + i__3 = l + l * b_dim1; + q__3.r = sgn * b[i__3].r, q__3.i = sgn * b[i__3].i; + q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i; + a11.r = q__1.r, a11.i = q__1.i; + da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11), + dabs(r__2)); + if (da11 <= smin) { + a11.r = smin, a11.i = 0.f; + da11 = smin; + *info = 1; + } + db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs( + r__2)); + if (da11 < 1.f && db > 1.f) { + if (db > bignum * da11) { + scaloc = 1.f / db; + } + } + q__3.r = scaloc, q__3.i = 0.f; + q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r * + q__3.i + vec.i * q__3.r; + q__1 = cladiv_(&q__2, &a11); + x11.r = q__1.r, x11.i = q__1.i; + if (scaloc != 1.f) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L40: */ + } + *scale *= scaloc; + } + i__3 = k + l * c_dim1; + c__[i__3].r = x11.r, c__[i__3].i = x11.i; +/* L50: */ + } +/* L60: */ + } + } else if (! notrna && ! notrnb) { + for (l = *n; l >= 1; --l) { + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + i__2 = k - 1; + q__1 = cdotc_(&i__2, &a[k * a_dim1 + 1], &c__1, &c__[l * + c_dim1 + 1], &c__1); + suml.r = q__1.r, suml.i = q__1.i; + i__2 = *n - l; +/* Computing MIN */ + i__3 = l + 1; +/* Computing MIN */ + i__4 = l + 1; + q__1 = cdotc_(&i__2, &c__[k + min(i__3,*n) * c_dim1], ldc, &b[ + l + min(i__4,*n) * b_dim1], ldb); + sumr.r = q__1.r, sumr.i = q__1.i; + i__2 = k + l * c_dim1; + r_cnjg(&q__4, &sumr); + q__3.r = sgn * q__4.r, q__3.i = sgn * q__4.i; + q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i; + q__1.r = c__[i__2].r - q__2.r, q__1.i = c__[i__2].i - q__2.i; + vec.r = q__1.r, vec.i = q__1.i; + scaloc = 1.f; + i__2 = k + k * a_dim1; + i__3 = l + l * b_dim1; + q__3.r = sgn * b[i__3].r, q__3.i = sgn * b[i__3].i; + q__2.r = a[i__2].r + q__3.r, q__2.i = a[i__2].i + q__3.i; + r_cnjg(&q__1, &q__2); + a11.r = q__1.r, a11.i = q__1.i; + da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11), + dabs(r__2)); + if (da11 <= smin) { + a11.r = smin, a11.i = 0.f; + da11 = smin; + *info = 1; + } + db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs( + r__2)); + if (da11 < 1.f && db > 1.f) { + if (db > bignum * da11) { + scaloc = 1.f / db; + } + } + q__3.r = scaloc, q__3.i = 0.f; + q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r * + q__3.i + vec.i * q__3.r; + q__1 = cladiv_(&q__2, &a11); + x11.r = q__1.r, x11.i = q__1.i; + if (scaloc != 1.f) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L70: */ + } + *scale *= scaloc; + } + i__2 = k + l * c_dim1; + c__[i__2].r = x11.r, c__[i__2].i = x11.i; +/* L80: */ + } +/* L90: */ + } + } else if (notrna && ! 
notrnb) { + for (l = *n; l >= 1; --l) { + for (k = *m; k >= 1; --k) { + i__1 = *m - k; +/* Computing MIN */ + i__2 = k + 1; +/* Computing MIN */ + i__3 = k + 1; + q__1 = cdotu_(&i__1, &a[k + min(i__2,*m) * a_dim1], lda, &c__[ + min(i__3,*m) + l * c_dim1], &c__1); + suml.r = q__1.r, suml.i = q__1.i; + i__1 = *n - l; +/* Computing MIN */ + i__2 = l + 1; +/* Computing MIN */ + i__3 = l + 1; + q__1 = cdotc_(&i__1, &c__[k + min(i__2,*n) * c_dim1], ldc, &b[ + l + min(i__3,*n) * b_dim1], ldb); + sumr.r = q__1.r, sumr.i = q__1.i; + i__1 = k + l * c_dim1; + r_cnjg(&q__4, &sumr); + q__3.r = sgn * q__4.r, q__3.i = sgn * q__4.i; + q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i; + q__1.r = c__[i__1].r - q__2.r, q__1.i = c__[i__1].i - q__2.i; + vec.r = q__1.r, vec.i = q__1.i; + scaloc = 1.f; + i__1 = k + k * a_dim1; + r_cnjg(&q__3, &b[l + l * b_dim1]); + q__2.r = sgn * q__3.r, q__2.i = sgn * q__3.i; + q__1.r = a[i__1].r + q__2.r, q__1.i = a[i__1].i + q__2.i; + a11.r = q__1.r, a11.i = q__1.i; + da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11), + dabs(r__2)); + if (da11 <= smin) { + a11.r = smin, a11.i = 0.f; + da11 = smin; + *info = 1; + } + db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs( + r__2)); + if (da11 < 1.f && db > 1.f) { + if (db > bignum * da11) { + scaloc = 1.f / db; + } + } + q__3.r = scaloc, q__3.i = 0.f; + q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r * + q__3.i + vec.i * q__3.r; + q__1 = cladiv_(&q__2, &a11); + x11.r = q__1.r, x11.i = q__1.i; + if (scaloc != 1.f) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L100: */ + } + *scale *= scaloc; + } + i__1 = k + l * c_dim1; + c__[i__1].r = x11.r, c__[i__1].i = x11.i; +/* L110: */ + } +/* L120: */ + } + } + return; +} diff --git a/relapack/src/ctrtri.c b/relapack/src/ctrtri.c new file mode 100644 index 000000000..0262cb59d --- /dev/null +++ b/relapack/src/ctrtri.c @@ -0,0 +1,107 @@ +#include "relapack.h" + +static void RELAPACK_ctrtri_rec(const char *, const char *, const int *, + float *, const int *, int *); + + +/** CTRTRI computes the inverse of a complex upper or lower triangular matrix A. + * + * This routine is functionally equivalent to LAPACK's ctrtri. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/df/df8/ctrtri_8f.html + * */ +void RELAPACK_ctrtri( + const char *uplo, const char *diag, const int *n, + float *A, const int *ldA, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + const int nounit = LAPACK(lsame)(diag, "N"); + const int unit = LAPACK(lsame)(diag, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (!nounit && !unit) + *info = -2; + else if (*n < 0) + *info = -3; + else if (*ldA < MAX(1, *n)) + *info = -5; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("CTRTRI", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + const char cleandiag = nounit ? 
'N' : 'U'; + + // check for singularity + if (nounit) { + int i; + for (i = 0; i < *n; i++) + if (A[2 * (i + *ldA * i)] == 0 && A[2 * (i + *ldA * i) + 1] == 0) { + *info = i; + return; + } + } + + // Recursive kernel + RELAPACK_ctrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info); +} + + +/** ctrtri's recursive compute kernel */ +static void RELAPACK_ctrtri_rec( + const char *uplo, const char *diag, const int *n, + float *A, const int *ldA, + int *info +){ + + if (*n <= MAX(CROSSOVER_CTRTRI, 1)) { + // Unblocked + LAPACK(ctrti2)(uplo, diag, n, A, ldA, info); + return; + } + + // Constants + const float ONE[] = { 1., 0. }; + const float MONE[] = { -1., 0. }; + + // Splitting + const int n1 = CREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + 2 * *ldA * n1; + float *const A_BL = A + 2 * n1; + float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + + // recursion(A_TL) + RELAPACK_ctrtri_rec(uplo, diag, &n1, A_TL, ldA, info); + if (*info) + return; + + if (*uplo == 'L') { + // A_BL = - A_BL * A_TL + BLAS(ctrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA); + // A_BL = A_BR \ A_BL + BLAS(ctrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); + } else { + // A_TR = - A_TL * A_TR + BLAS(ctrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA); + // A_TR = A_TR / A_BR + BLAS(ctrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); + } + + // recursion(A_BR) + RELAPACK_ctrtri_rec(uplo, diag, &n2, A_BR, ldA, info); + if (*info) + *info += n1; +} diff --git a/relapack/src/dgbtrf.c b/relapack/src/dgbtrf.c new file mode 100644 index 000000000..1a1757d31 --- /dev/null +++ b/relapack/src/dgbtrf.c @@ -0,0 +1,227 @@ +#include "relapack.h" +#include "stdlib.h" + +static void RELAPACK_dgbtrf_rec(const int *, const int *, const int *, + const int *, double *, const int *, int *, double *, const int *, double *, + const int *, int *); + + +/** DGBTRF computes an LU factorization of a real m-by-n band matrix A using partial pivoting with row interchanges. + * + * This routine is functionally equivalent to LAPACK's dgbtrf. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/da/d87/dgbtrf_8f.html + * */ +void RELAPACK_dgbtrf( + const int *m, const int *n, const int *kl, const int *ku, + double *Ab, const int *ldAb, int *ipiv, + int *info +) { + + // Check arguments + *info = 0; + if (*m < 0) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*kl < 0) + *info = -3; + else if (*ku < 0) + *info = -4; + else if (*ldAb < 2 * *kl + *ku + 1) + *info = -6; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("DGBTRF", &minfo); + return; + } + + // Constant + const double ZERO[] = { 0. }; + + // Result upper band width + const int kv = *ku + *kl; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + double *const A = Ab + kv; + + // Zero upper diagonal fill-in elements + int i, j; + for (j = 0; j < *n; j++) { + double *const A_j = A + *ldA * j; + for (i = MAX(0, j - kv); i < j - *ku; i++) + A_j[i] = 0.; + } + + // Allocate work space + const int n1 = DREC_SPLIT(*n); + const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; + const int nWorkl = (kv > n1) ? n1 : kv; + const int mWorku = (*kl > n1) ? n1 : *kl; + const int nWorku = (*kl > n1) ? 
MAX(0, *n - *kl) : *kl; + double *Workl = malloc(mWorkl * nWorkl * sizeof(double)); + double *Worku = malloc(mWorku * nWorku * sizeof(double)); + LAPACK(dlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); + LAPACK(dlaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku); + + // Recursive kernel + RELAPACK_dgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info); + + // Free work space + free(Workl); + free(Worku); +} + + +/** dgbtrf's recursive compute kernel */ +static void RELAPACK_dgbtrf_rec( + const int *m, const int *n, const int *kl, const int *ku, + double *Ab, const int *ldAb, int *ipiv, + double *Workl, const int *ldWorkl, double *Worku, const int *ldWorku, + int *info +) { + + if (*n <= MAX(CROSSOVER_DGBTRF, 1)) { + // Unblocked + LAPACK(dgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); + return; + } + + // Constants + const double ONE[] = { 1. }; + const double MONE[] = { -1. }; + const int iONE[] = { 1 }; + + // Loop iterators + int i, j; + + // Output upper band width + const int kv = *ku + *kl; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + double *const A = Ab + kv; + + // Splitting + const int n1 = MIN(DREC_SPLIT(*n), *kl); + const int n2 = *n - n1; + const int m1 = MIN(n1, *m); + const int m2 = *m - m1; + const int mn1 = MIN(m1, n1); + const int mn2 = MIN(m2, n2); + + // Ab_L * + // Ab_BR + double *const Ab_L = Ab; + double *const Ab_BR = Ab + *ldAb * n1; + + // A_L A_R + double *const A_L = A; + double *const A_R = A + *ldA * n1; + + // A_TL A_TR + // A_BL A_BR + double *const A_TL = A; + double *const A_TR = A + *ldA * n1; + double *const A_BL = A + m1; + double *const A_BR = A + *ldA * n1 + m1; + + // ipiv_T + // ipiv_B + int *const ipiv_T = ipiv; + int *const ipiv_B = ipiv + n1; + + // Banded splitting + const int n21 = MIN(n2, kv - n1); + const int n22 = MIN(n2 - n21, n1); + const int m21 = MIN(m2, *kl - m1); + const int m22 = MIN(m2 - m21, m1); + + // n1 n21 n22 + // m * A_Rl ARr + double *const A_Rl = A_R; + double *const A_Rr = A_R + *ldA * n21; + + // n1 n21 n22 + // m1 * A_TRl A_TRr + // m21 A_BLt A_BRtl A_BRtr + // m22 A_BLb A_BRbl A_BRbr + double *const A_TRl = A_TR; + double *const A_TRr = A_TR + *ldA * n21; + double *const A_BLt = A_BL; + double *const A_BLb = A_BL + m21; + double *const A_BRtl = A_BR; + double *const A_BRtr = A_BR + *ldA * n21; + double *const A_BRbl = A_BR + m21; + double *const A_BRbr = A_BR + *ldA * n21 + m21; + + // recursion(Ab_L, ipiv_T) + RELAPACK_dgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); + + // Workl = A_BLb + LAPACK(dlacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl); + + // partially redo swaps in A_L + for (i = 0; i < mn1; i++) { + const int ip = ipiv_T[i] - 1; + if (ip != i) { + if (ip < *kl) + BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA); + else + BLAS(dswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl); + } + } + + // apply pivots to A_Rl + LAPACK(dlaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE); + + // apply pivots to A_Rr columnwise + for (j = 0; j < n22; j++) { + double *const A_Rrj = A_Rr + *ldA * j; + for (i = j; i < mn1; i++) { + const int ip = ipiv_T[i] - 1; + if (ip != i) { + const double tmp = A_Rrj[i]; + A_Rrj[i] = A_Rr[ip]; + A_Rrj[ip] = tmp; + } + } + } + + // A_TRl = A_TL \ A_TRl + BLAS(dtrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA); + // Worku = A_TRr + LAPACK(dlacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku); + // Worku = A_TL \ Worku + BLAS(dtrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku); + 
// A_TRr = Worku + LAPACK(dlacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA); + // A_BRtl = A_BRtl - A_BLt * A_TRl + BLAS(dgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA); + // A_BRbl = A_BRbl - Workl * A_TRl + BLAS(dgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA); + // A_BRtr = A_BRtr - A_BLt * Worku + BLAS(dgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA); + // A_BRbr = A_BRbr - Workl * Worku + BLAS(dgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA); + + // partially undo swaps in A_L + for (i = mn1 - 1; i >= 0; i--) { + const int ip = ipiv_T[i] - 1; + if (ip != i) { + if (ip < *kl) + BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA); + else + BLAS(dswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl); + } + } + + // recursion(Ab_BR, ipiv_B) + RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + if (*info) + *info += n1; + // shift pivots + for (i = 0; i < mn2; i++) + ipiv_B[i] += n1; +} diff --git a/relapack/src/dgemmt.c b/relapack/src/dgemmt.c new file mode 100644 index 000000000..9c925b586 --- /dev/null +++ b/relapack/src/dgemmt.c @@ -0,0 +1,165 @@ +#include "relapack.h" + +static void RELAPACK_dgemmt_rec(const char *, const char *, const char *, + const int *, const int *, const double *, const double *, const int *, + const double *, const int *, const double *, double *, const int *); + +static void RELAPACK_dgemmt_rec2(const char *, const char *, const char *, + const int *, const int *, const double *, const double *, const int *, + const double *, const int *, const double *, double *, const int *); + + +/** DGEMMT computes a matrix-matrix product with general matrices but updates + * only the upper or lower triangular part of the result matrix. + * + * This routine performs the same operation as the BLAS routine + * dgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC) + * but only updates the triangular part of C specified by uplo: + * If (*uplo == 'L'), only the lower triangular part of C is updated, + * otherwise the upper triangular part is updated. + * */ +void RELAPACK_dgemmt( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const double *alpha, const double *A, const int *ldA, + const double *B, const int *ldB, + const double *beta, double *C, const int *ldC +) { + +#if HAVE_XGEMMT + BLAS(dgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); + return; +#else + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + const int notransA = LAPACK(lsame)(transA, "N"); + const int tranA = LAPACK(lsame)(transA, "T"); + const int notransB = LAPACK(lsame)(transB, "N"); + const int tranB = LAPACK(lsame)(transB, "T"); + int info = 0; + if (!lower && !upper) + info = 1; + else if (!tranA && !notransA) + info = 2; + else if (!tranB && !notransB) + info = 3; + else if (*n < 0) + info = 4; + else if (*k < 0) + info = 5; + else if (*ldA < MAX(1, notransA ? *n : *k)) + info = 8; + else if (*ldB < MAX(1, notransB ? *k : *n)) + info = 10; + else if (*ldC < MAX(1, *n)) + info = 13; + if (info) { + LAPACK(xerbla)("DGEMMT", &info); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + const char cleantransA = notransA ? 'N' : 'T'; + const char cleantransB = notransB ? 
'N' : 'T'; + + // Recursive kernel + RELAPACK_dgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); +#endif +} + + +/** dgemmt's recursive compute kernel */ +static void RELAPACK_dgemmt_rec( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const double *alpha, const double *A, const int *ldA, + const double *B, const int *ldB, + const double *beta, double *C, const int *ldC +) { + + if (*n <= MAX(CROSSOVER_DGEMMT, 1)) { + // Unblocked + RELAPACK_dgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); + return; + } + + // Splitting + const int n1 = DREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_T + // A_B + const double *const A_T = A; + const double *const A_B = A + ((*transA == 'N') ? n1 : *ldA * n1); + + // B_L B_R + const double *const B_L = B; + const double *const B_R = B + ((*transB == 'N') ? *ldB * n1 : n1); + + // C_TL C_TR + // C_BL C_BR + double *const C_TL = C; + double *const C_TR = C + *ldC * n1; + double *const C_BL = C + n1; + double *const C_BR = C + *ldC * n1 + n1; + + // recursion(C_TL) + RELAPACK_dgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC); + + if (*uplo == 'L') + // C_BL = alpha A_B B_L + beta C_BL + BLAS(dgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC); + else + // C_TR = alpha A_T B_R + beta C_TR + BLAS(dgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC); + + // recursion(C_BR) + RELAPACK_dgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC); +} + + +/** dgemmt's unblocked compute kernel */ +static void RELAPACK_dgemmt_rec2( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const double *alpha, const double *A, const int *ldA, + const double *B, const int *ldB, + const double *beta, double *C, const int *ldC +) { + + const int incB = (*transB == 'N') ? 1 : *ldB; + const int incC = 1; + + int i; + for (i = 0; i < *n; i++) { + // A_0 + // A_i + const double *const A_0 = A; + const double *const A_i = A + ((*transA == 'N') ? i : *ldA * i); + + // * B_i * + const double *const B_i = B + ((*transB == 'N') ? *ldB * i : i); + + // * C_0i * + // * C_ii * + double *const C_0i = C + *ldC * i; + double *const C_ii = C + *ldC * i + i; + + if (*uplo == 'L') { + const int nmi = *n - i; + if (*transA == 'N') + BLAS(dgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); + else + BLAS(dgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); + } else { + const int ip1 = i + 1; + if (*transA == 'N') + BLAS(dgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); + else + BLAS(dgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); + } + } +} diff --git a/relapack/src/dgetrf.c b/relapack/src/dgetrf.c new file mode 100644 index 000000000..07f5472fd --- /dev/null +++ b/relapack/src/dgetrf.c @@ -0,0 +1,117 @@ +#include "relapack.h" + +static void RELAPACK_dgetrf_rec(const int *, const int *, double *, + const int *, int *, int *); + + +/** DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. + * + * This routine is functionally equivalent to LAPACK's dgetrf. 
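+ *
+ * A minimal calling sketch (illustrative only; it assumes the usual
+ * column-major LAPACK storage, and the matrix data is made up):
+ *
+ *     int m = 3, n = 3, ldA = 3, info;
+ *     int ipiv[3];
+ *     double A[9] = { 4., 2., 2.,   2., 5., 1.,   2., 1., 6. };
+ *     RELAPACK_dgetrf(&m, &n, A, &ldA, ipiv, &info);
+ *     // info == 0 on success; A then holds the unit lower triangular
+ *     // factor L below the diagonal and U on and above it, and ipiv
+ *     // holds 1-based pivot indices as in LAPACK's dgetrf.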
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d3/d6a/dgetrf_8f.html + * */ +void RELAPACK_dgetrf( + const int *m, const int *n, + double *A, const int *ldA, int *ipiv, + int *info +) { + + // Check arguments + *info = 0; + if (*m < 0) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("DGETRF", &minfo); + return; + } + + const int sn = MIN(*m, *n); + + RELAPACK_dgetrf_rec(m, &sn, A, ldA, ipiv, info); + + // Right remainder + if (*m < *n) { + // Constants + const double ONE[] = { 1. }; + const int iONE[] = { 1. }; + + // Splitting + const int rn = *n - *m; + + // A_L A_R + const double *const A_L = A; + double *const A_R = A + *ldA * *m; + + // A_R = apply(ipiv, A_R) + LAPACK(dlaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE); + // A_R = A_S \ A_R + BLAS(dtrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA); + } +} + + +/** dgetrf's recursive compute kernel */ +static void RELAPACK_dgetrf_rec( + const int *m, const int *n, + double *A, const int *ldA, int *ipiv, + int *info +) { + + if (*n <= MAX(CROSSOVER_DGETRF, 1)) { + // Unblocked + LAPACK(dgetf2)(m, n, A, ldA, ipiv, info); + return; + } + + // Constants + const double ONE[] = { 1. }; + const double MONE[] = { -1. }; + const int iONE[] = { 1 }; + + // Splitting + const int n1 = DREC_SPLIT(*n); + const int n2 = *n - n1; + const int m2 = *m - n1; + + // A_L A_R + double *const A_L = A; + double *const A_R = A + *ldA * n1; + + // A_TL A_TR + // A_BL A_BR + double *const A_TL = A; + double *const A_TR = A + *ldA * n1; + double *const A_BL = A + n1; + double *const A_BR = A + *ldA * n1 + n1; + + // ipiv_T + // ipiv_B + int *const ipiv_T = ipiv; + int *const ipiv_B = ipiv + n1; + + // recursion(A_L, ipiv_T) + RELAPACK_dgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); + // apply pivots to A_R + LAPACK(dlaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE); + + // A_TR = A_TL \ A_TR + BLAS(dtrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); + // A_BR = A_BR - A_BL * A_TR + BLAS(dgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA); + + // recursion(A_BR, ipiv_B) + RELAPACK_dgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info); + if (*info) + *info += n1; + // apply pivots to A_BL + LAPACK(dlaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); + // shift pivots + int i; + for (i = 0; i < n2; i++) + ipiv_B[i] += n1; +} diff --git a/relapack/src/dlauum.c b/relapack/src/dlauum.c new file mode 100644 index 000000000..d722ea809 --- /dev/null +++ b/relapack/src/dlauum.c @@ -0,0 +1,87 @@ +#include "relapack.h" + +static void RELAPACK_dlauum_rec(const char *, const int *, double *, + const int *, int *); + + +/** DLAUUM computes the product U * U**T or L**T * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. + * + * This routine is functionally equivalent to LAPACK's dlauum. 
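+ *
+ * A minimal calling sketch (illustrative only; column-major storage,
+ * made-up data):
+ *
+ *     char uplo = 'L';
+ *     int n = 2, ldA = 2, info;
+ *     double A[4] = { 2., 3.,   0., 4. };   // L in the lower triangle
+ *     RELAPACK_dlauum(&uplo, &n, A, &ldA, &info);
+ *     // The lower triangle of A is overwritten with L**T * L.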
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d0/dc2/dlauum_8f.html + * */ +void RELAPACK_dlauum( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("DLAUUM", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Recursive kernel + RELAPACK_dlauum_rec(&cleanuplo, n, A, ldA, info); +} + + +/** dlauum's recursive compute kernel */ +static void RELAPACK_dlauum_rec( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + + if (*n <= MAX(CROSSOVER_DLAUUM, 1)) { + // Unblocked + LAPACK(dlauu2)(uplo, n, A, ldA, info); + return; + } + + // Constants + const double ONE[] = { 1. }; + + // Splitting + const int n1 = DREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + double *const A_TL = A; + double *const A_TR = A + *ldA * n1; + double *const A_BL = A + n1; + double *const A_BR = A + *ldA * n1 + n1; + + // recursion(A_TL) + RELAPACK_dlauum_rec(uplo, &n1, A_TL, ldA, info); + + if (*uplo == 'L') { + // A_TL = A_TL + A_BL' * A_BL + BLAS(dsyrk)("L", "T", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA); + // A_BL = A_BR' * A_BL + BLAS(dtrmm)("L", "L", "T", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); + } else { + // A_TL = A_TL + A_TR * A_TR' + BLAS(dsyrk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA); + // A_TR = A_TR * A_BR' + BLAS(dtrmm)("R", "U", "T", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); + } + + // recursion(A_BR) + RELAPACK_dlauum_rec(uplo, &n2, A_BR, ldA, info); +} diff --git a/relapack/src/dpbtrf.c b/relapack/src/dpbtrf.c new file mode 100644 index 000000000..6fd0ebe48 --- /dev/null +++ b/relapack/src/dpbtrf.c @@ -0,0 +1,157 @@ +#include "relapack.h" +#include "stdlib.h" + +static void RELAPACK_dpbtrf_rec(const char *, const int *, const int *, + double *, const int *, double *, const int *, int *); + + +/** DPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A. + * + * This routine is functionally equivalent to LAPACK's dpbtrf. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/df/da9/dpbtrf_8f.html + * */ +void RELAPACK_dpbtrf( + const char *uplo, const int *n, const int *kd, + double *Ab, const int *ldAb, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*kd < 0) + *info = -3; + else if (*ldAb < *kd + 1) + *info = -5; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("DPBTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Constant + const double ZERO[] = { 0. }; + + // Allocate work space + const int n1 = DREC_SPLIT(*n); + const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; + const int nWork = (*kd > n1) ? (lower ? 
n1 : *n - *kd) : *kd; + double *Work = malloc(mWork * nWork * sizeof(double)); + LAPACK(dlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); + + // Recursive kernel + RELAPACK_dpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info); + + // Free work space + free(Work); +} + + +/** dpbtrf's recursive compute kernel */ +static void RELAPACK_dpbtrf_rec( + const char *uplo, const int *n, const int *kd, + double *Ab, const int *ldAb, + double *Work, const int *ldWork, + int *info +){ + + if (*n <= MAX(CROSSOVER_DPBTRF, 1)) { + // Unblocked + LAPACK(dpbtf2)(uplo, n, kd, Ab, ldAb, info); + return; + } + + // Constants + const double ONE[] = { 1. }; + const double MONE[] = { -1. }; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + double *const A = Ab + ((*uplo == 'L') ? 0 : *kd); + + // Splitting + const int n1 = MIN(DREC_SPLIT(*n), *kd); + const int n2 = *n - n1; + + // * * + // * Ab_BR + double *const Ab_BR = Ab + *ldAb * n1; + + // A_TL A_TR + // A_BL A_BR + double *const A_TL = A; + double *const A_TR = A + *ldA * n1; + double *const A_BL = A + n1; + double *const A_BR = A + *ldA * n1 + n1; + + // recursion(A_TL) + RELAPACK_dpotrf(uplo, &n1, A_TL, ldA, info); + if (*info) + return; + + // Banded splitting + const int n21 = MIN(n2, *kd - n1); + const int n22 = MIN(n2 - n21, n1); + + // n1 n21 n22 + // n1 * A_TRl A_TRr + // n21 A_BLt A_BRtl A_BRtr + // n22 A_BLb A_BRbl A_BRbr + double *const A_TRl = A_TR; + double *const A_TRr = A_TR + *ldA * n21; + double *const A_BLt = A_BL; + double *const A_BLb = A_BL + n21; + double *const A_BRtl = A_BR; + double *const A_BRtr = A_BR + *ldA * n21; + double *const A_BRbl = A_BR + n21; + double *const A_BRbr = A_BR + *ldA * n21 + n21; + + if (*uplo == 'L') { + // A_BLt = ABLt / A_TL' + BLAS(dtrsm)("R", "L", "T", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA); + // A_BRtl = A_BRtl - A_BLt * A_BLt' + BLAS(dsyrk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA); + // Work = A_BLb + LAPACK(dlacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork); + // Work = Work / A_TL' + BLAS(dtrsm)("R", "L", "T", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork); + // A_BRbl = A_BRbl - Work * A_BLt' + BLAS(dgemm)("N", "T", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA); + // A_BRbr = A_BRbr - Work * Work' + BLAS(dsyrk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); + // A_BLb = Work + LAPACK(dlacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA); + } else { + // A_TRl = A_TL' \ A_TRl + BLAS(dtrsm)("L", "U", "T", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA); + // A_BRtl = A_BRtl - A_TRl' * A_TRl + BLAS(dsyrk)("U", "T", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA); + // Work = A_TRr + LAPACK(dlacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork); + // Work = A_TL' \ Work + BLAS(dtrsm)("L", "U", "T", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork); + // A_BRtr = A_BRtr - A_TRl' * Work + BLAS(dgemm)("T", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA); + // A_BRbr = A_BRbr - Work' * Work + BLAS(dsyrk)("U", "T", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); + // A_TRr = Work + LAPACK(dlacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA); + } + + // recursion(A_BR) + if (*kd > n1) + RELAPACK_dpotrf(uplo, &n2, A_BR, ldA, info); + else + RELAPACK_dpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info); + if (*info) + *info += n1; +} diff --git a/relapack/src/dpotrf.c b/relapack/src/dpotrf.c new file mode 100644 index 000000000..c14fb3d71 --- /dev/null +++ b/relapack/src/dpotrf.c @@ -0,0 +1,92 @@ +#include "relapack.h" + +static 
void RELAPACK_dpotrf_rec(const char *, const int *, double *, + const int *, int *); + + +/** DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A. + * + * This routine is functionally equivalent to LAPACK's dpotrf. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d0/d8a/dpotrf_8f.html + * */ +void RELAPACK_dpotrf( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("DPOTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Recursive kernel + RELAPACK_dpotrf_rec(&cleanuplo, n, A, ldA, info); +} + + +/** dpotrf's recursive compute kernel */ +static void RELAPACK_dpotrf_rec( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +){ + + if (*n <= MAX(CROSSOVER_DPOTRF, 1)) { + // Unblocked + LAPACK(dpotf2)(uplo, n, A, ldA, info); + return; + } + + // Constants + const double ONE[] = { 1. }; + const double MONE[] = { -1. }; + + // Splitting + const int n1 = DREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + double *const A_TL = A; + double *const A_TR = A + *ldA * n1; + double *const A_BL = A + n1; + double *const A_BR = A + *ldA * n1 + n1; + + // recursion(A_TL) + RELAPACK_dpotrf_rec(uplo, &n1, A_TL, ldA, info); + if (*info) + return; + + if (*uplo == 'L') { + // A_BL = A_BL / A_TL' + BLAS(dtrsm)("R", "L", "T", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA); + // A_BR = A_BR - A_BL * A_BL' + BLAS(dsyrk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA); + } else { + // A_TR = A_TL' \ A_TR + BLAS(dtrsm)("L", "U", "T", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); + // A_BR = A_BR - A_TR' * A_TR + BLAS(dsyrk)("U", "T", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA); + } + + // recursion(A_BR) + RELAPACK_dpotrf_rec(uplo, &n2, A_BR, ldA, info); + if (*info) + *info += n1; +} diff --git a/relapack/src/dsygst.c b/relapack/src/dsygst.c new file mode 100644 index 000000000..0228068ce --- /dev/null +++ b/relapack/src/dsygst.c @@ -0,0 +1,212 @@ +#include "relapack.h" +#if XSYGST_ALLOW_MALLOC +#include "stdlib.h" +#endif + +static void RELAPACK_dsygst_rec(const int *, const char *, const int *, + double *, const int *, const double *, const int *, + double *, const int *, int *); + + +/** DSYGST reduces a real symmetric-definite generalized eigenproblem to standard form. + * + * This routine is functionally equivalent to LAPACK's dsygst. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/dc/d04/dsygst_8f.html + * */ +void RELAPACK_dsygst( + const int *itype, const char *uplo, const int *n, + double *A, const int *ldA, const double *B, const int *ldB, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (*itype < 1 || *itype > 3) + *info = -1; + else if (!lower && !upper) + *info = -2; + else if (*n < 0) + *info = -3; + else if (*ldA < MAX(1, *n)) + *info = -5; + else if (*ldB < MAX(1, *n)) + *info = -7; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("DSYGST", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 
'L' : 'U'; + + // Allocate work space + double *Work = NULL; + int lWork = 0; +#if XSYGST_ALLOW_MALLOC + const int n1 = DREC_SPLIT(*n); + lWork = n1 * (*n - n1); + Work = malloc(lWork * sizeof(double)); + if (!Work) + lWork = 0; +#endif + + // recursive kernel + RELAPACK_dsygst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info); + + // Free work space +#if XSYGST_ALLOW_MALLOC + if (Work) + free(Work); +#endif +} + + +/** dsygst's recursive compute kernel */ +static void RELAPACK_dsygst_rec( + const int *itype, const char *uplo, const int *n, + double *A, const int *ldA, const double *B, const int *ldB, + double *Work, const int *lWork, int *info +) { + + if (*n <= MAX(CROSSOVER_SSYGST, 1)) { + // Unblocked + LAPACK(dsygs2)(itype, uplo, n, A, ldA, B, ldB, info); + return; + } + + // Constants + const double ZERO[] = { 0. }; + const double ONE[] = { 1. }; + const double MONE[] = { -1. }; + const double HALF[] = { .5 }; + const double MHALF[] = { -.5 }; + const int iONE[] = { 1 }; + + // Loop iterator + int i; + + // Splitting + const int n1 = DREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + double *const A_TL = A; + double *const A_TR = A + *ldA * n1; + double *const A_BL = A + n1; + double *const A_BR = A + *ldA * n1 + n1; + + // B_TL B_TR + // B_BL B_BR + const double *const B_TL = B; + const double *const B_TR = B + *ldB * n1; + const double *const B_BL = B + n1; + const double *const B_BR = B + *ldB * n1 + n1; + + // recursion(A_TL, B_TL) + RELAPACK_dsygst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info); + + if (*itype == 1) + if (*uplo == 'L') { + // A_BL = A_BL / B_TL' + BLAS(dtrsm)("R", "L", "T", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA); + if (*lWork > n2 * n1) { + // T = -1/2 * B_BL * A_TL + BLAS(dsymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2); + // A_BL = A_BL + T + for (i = 0; i < n1; i++) + BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE); + } else + // A_BL = A_BL - 1/2 B_BL * A_TL + BLAS(dsymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA); + // A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL' + BLAS(dsyr2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA); + if (*lWork > n2 * n1) + // A_BL = A_BL + T + for (i = 0; i < n1; i++) + BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE); + else + // A_BL = A_BL - 1/2 B_BL * A_TL + BLAS(dsymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA); + // A_BL = B_BR \ A_BL + BLAS(dtrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA); + } else { + // A_TR = B_TL' \ A_TR + BLAS(dtrsm)("L", "U", "T", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA); + if (*lWork > n2 * n1) { + // T = -1/2 * A_TL * B_TR + BLAS(dsymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1); + // A_TR = A_BL + T + for (i = 0; i < n2; i++) + BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE); + } else + // A_TR = A_TR - 1/2 A_TL * B_TR + BLAS(dsymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA); + // A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR + BLAS(dsyr2k)("U", "T", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA); + if (*lWork > n2 * n1) + // A_TR = A_BL + T + for (i = 0; i < n2; i++) + BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE); + else + // A_TR = A_TR - 1/2 A_TL * B_TR + BLAS(dsymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA); + // A_TR = A_TR / B_BR + BLAS(dtrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, 
ldA); + } + else + if (*uplo == 'L') { + // A_BL = A_BL * B_TL + BLAS(dtrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA); + if (*lWork > n2 * n1) { + // T = 1/2 * A_BR * B_BL + BLAS(dsymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2); + // A_BL = A_BL + T + for (i = 0; i < n1; i++) + BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE); + } else + // A_BL = A_BL + 1/2 A_BR * B_BL + BLAS(dsymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA); + // A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL + BLAS(dsyr2k)("L", "T", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA); + if (*lWork > n2 * n1) + // A_BL = A_BL + T + for (i = 0; i < n1; i++) + BLAS(daxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE); + else + // A_BL = A_BL + 1/2 A_BR * B_BL + BLAS(dsymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA); + // A_BL = B_BR * A_BL + BLAS(dtrmm)("L", "L", "T", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA); + } else { + // A_TR = B_TL * A_TR + BLAS(dtrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA); + if (*lWork > n2 * n1) { + // T = 1/2 * B_TR * A_BR + BLAS(dsymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1); + // A_TR = A_TR + T + for (i = 0; i < n2; i++) + BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE); + } else + // A_TR = A_TR + 1/2 B_TR A_BR + BLAS(dsymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA); + // A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR' + BLAS(dsyr2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA); + if (*lWork > n2 * n1) + // A_TR = A_TR + T + for (i = 0; i < n2; i++) + BLAS(daxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE); + else + // A_TR = A_TR + 1/2 B_TR * A_BR + BLAS(dsymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA); + // A_TR = A_TR * B_BR + BLAS(dtrmm)("R", "U", "T", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA); + } + + // recursion(A_BR, B_BR) + RELAPACK_dsygst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info); +} diff --git a/relapack/src/dsytrf.c b/relapack/src/dsytrf.c new file mode 100644 index 000000000..80b119336 --- /dev/null +++ b/relapack/src/dsytrf.c @@ -0,0 +1,238 @@ +#include "relapack.h" +#if XSYTRF_ALLOW_MALLOC +#include <stdlib.h> +#endif + +static void RELAPACK_dsytrf_rec(const char *, const int *, const int *, int *, + double *, const int *, int *, double *, const int *, int *); + + +/** DSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method. + * + * This routine is functionally equivalent to LAPACK's dsytrf. 
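+ *
+ * A minimal calling sketch for this double precision real routine
+ * (illustrative only; headers and error handling omitted). A query
+ * with lWork = -1 returns the required workspace size in Work[0]:
+ *
+ *     char uplo = 'L';
+ *     int n = 100, ldA = 100, ipiv[100], info, lWork = -1;
+ *     double query;
+ *     static double A[100 * 100];   // symmetric data, lower triangle
+ *     RELAPACK_dsytrf(&uplo, &n, A, &ldA, ipiv, &query, &lWork, &info);
+ *     lWork = (int) query;
+ *     double *Work = malloc(lWork * sizeof(double));
+ *     RELAPACK_dsytrf(&uplo, &n, A, &ldA, ipiv, Work, &lWork, &info);
+ *     free(Work);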
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/dd/df4/dsytrf_8f.html + * */ +void RELAPACK_dsytrf( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + double *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * sizeof(double)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("DSYTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy arguments + int nout; + + // Recursive kernel + RELAPACK_dsytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** dsytrf's recursive compute kernel */ +static void RELAPACK_dsytrf_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + double *A, const int *ldA, int *ipiv, + double *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_DSYTRF, 3)) { + // Unblocked + if (top) { + LAPACK(dsytf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_dsytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const double ONE[] = { 1. }; + const double MONE[] = { -1. }; + const int iONE[] = { 1 }; + + // Loop iterator + int i; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = DREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + double *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_dsytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + double *const A_BL = A + n1; + double *const A_BR = A + *ldA * n1 + n1; + double *const A_BL_B = A + *n; + double *const A_BR_B = A + *ldA * n1 + *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + double *const Work_BL = Work + n1; + double *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; + const int ldWork_BR = top ? 
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_dgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(dgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_dsytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + double *const A_BR_r = A_BR + *ldA * n2_out + n2_out; + + // last row of A_BL + double *const A_BL_b = A_BL + n2_out; + + // last row of Work_BL + double *const Work_BL_b = Work_BL + n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(dgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = DREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + double *const Work_R = top ? Work : Work + *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_dsytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + double *const A_TL_T = A + *ldA * n_rest; + double *const A_TR_T = A + *ldA * (n_rest + n1); + double *const A_TL = A + *ldA * n_rest + n_rest; + double *const A_TR = A + *ldA * (n_rest + n1) + n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + double *const Work_L = Work; + double *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_dgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(dgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_dsytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(dgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/dsytrf_rec2.c b/relapack/src/dsytrf_rec2.c new file mode 100644 index 000000000..72ef827b1 --- /dev/null +++ b/relapack/src/dsytrf_rec2.c @@ -0,0 +1,352 @@ +/* -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "f2c.h" + +/* Table of constant values */ + +static int c__1 = 1; +static double c_b8 = -1.; +static double c_b9 = 1.; + +/** DSYTRF_REC2 computes a partial factorization of a real symmetric matrix using the Bunch-Kaufman diagon al pivoting method. + * + * This routine is a minor modification of LAPACK's dlasyf. + * It serves as an unblocked kernel in the recursive algorithms. + * The blocked BLAS Level 3 updates were removed and moved to the + * recursive algorithm. + * */ +/* Subroutine */ void RELAPACK_dsytrf_rec2(char *uplo, int *n, int * + nb, int *kb, double *a, int *lda, int *ipiv, + double *w, int *ldw, int *info, ftnlen uplo_len) +{ + /* System generated locals */ + int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; + double d__1, d__2, d__3; + + /* Builtin functions */ + double sqrt(double); + + /* Local variables */ + static int j, k; + static double t, r1, d11, d21, d22; + static int jj, kk, jp, kp, kw, kkw, imax, jmax; + static double alpha; + extern /* Subroutine */ int dscal_(int *, double *, double *, + int *); + extern logical lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int dgemv_(char *, int *, int *, + double *, double *, int *, double *, int *, + double *, double *, int *, ftnlen), dcopy_(int *, + double *, int *, double *, int *), dswap_(int + *, double *, int *, double *, int *); + static int kstep; + static double absakk; + extern int idamax_(int *, double *, int *); + static double colmax, rowmax; + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --ipiv; + w_dim1 = *ldw; + w_offset = 1 + w_dim1; + w -= w_offset; + + /* Function Body */ + *info = 0; + alpha = (sqrt(17.) + 1.) / 8.; + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + k = *n; +L10: + kw = *nb + k - *n; + if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { + goto L30; + } + dcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); + if (k < *n) { + i__1 = *n - k; + dgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) * a_dim1 + 1], + lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b9, &w[kw * + w_dim1 + 1], &c__1, (ftnlen)12); + } + kstep = 1; + absakk = (d__1 = w[k + kw * w_dim1], abs(d__1)); + if (k > 1) { + i__1 = k - 1; + imax = idamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); + colmax = (d__1 = w[imax + kw * w_dim1], abs(d__1)); + } else { + colmax = 0.; + } + if (max(absakk,colmax) == 0.) 
{ + if (*info == 0) { + *info = k; + } + kp = k; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + dcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + i__1 = k - imax; + dcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + dgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b9, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + } + i__1 = k - imax; + jmax = imax + idamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], + &c__1); + rowmax = (d__1 = w[jmax + (kw - 1) * w_dim1], abs(d__1)); + if (imax > 1) { + i__1 = imax - 1; + jmax = idamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); +/* Computing MAX */ + d__2 = rowmax, d__3 = (d__1 = w[jmax + (kw - 1) * w_dim1], + abs(d__1)); + rowmax = max(d__2,d__3); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else if ((d__1 = w[imax + (kw - 1) * w_dim1], abs(d__1)) >= + alpha * rowmax) { + kp = imax; + dcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kp != kk) { + a[kp + kp * a_dim1] = a[kk + kk * a_dim1]; + i__1 = kk - 1 - kp; + dcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + if (kp > 1) { + i__1 = kp - 1; + dcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + + 1], &c__1); + } + if (k < *n) { + i__1 = *n - k; + dswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k + + 1) * a_dim1], lda); + } + i__1 = *n - kk + 1; + dswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + dcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + r1 = 1. / a[k + k * a_dim1]; + i__1 = k - 1; + dscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + } else { + if (k > 2) { + d21 = w[k - 1 + kw * w_dim1]; + d11 = w[k + kw * w_dim1] / d21; + d22 = w[k - 1 + (kw - 1) * w_dim1] / d21; + t = 1. / (d11 * d22 - 1.); + d21 = t / d21; + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + a[j + (k - 1) * a_dim1] = d21 * (d11 * w[j + (kw - 1) + * w_dim1] - w[j + kw * w_dim1]); + a[j + k * a_dim1] = d21 * (d22 * w[j + kw * w_dim1] - + w[j + (kw - 1) * w_dim1]); +/* L20: */ + } + } + a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1]; + a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1]; + a[k + k * a_dim1] = w[k + kw * w_dim1]; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -kp; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + jj = j; + jp = ipiv[j]; + if (jp < 0) { + jp = -jp; + ++j; + } + ++j; + if (jp != jj && j <= *n) { + i__1 = *n - j + 1; + dswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda); + } + if (j < *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + i__1 = *n - k + 1; + dcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); + i__1 = *n - k + 1; + i__2 = k - 1; + dgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1], lda, &w[k + + w_dim1], ldw, &c_b9, &w[k + k * w_dim1], &c__1, (ftnlen)12); + kstep = 1; + absakk = (d__1 = w[k + k * w_dim1], abs(d__1)); + if (k < *n) { + i__1 = *n - k; + imax = k + idamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + colmax = (d__1 = w[imax + k * w_dim1], abs(d__1)); + } else { + colmax = 0.; + } + if (max(absakk,colmax) == 0.) 
{ + if (*info == 0) { + *info = k; + } + kp = k; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + i__1 = imax - k; + dcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = *n - imax + 1; + dcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + + 1) * w_dim1], &c__1); + i__1 = *n - k + 1; + i__2 = k - 1; + dgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1], + lda, &w[imax + w_dim1], ldw, &c_b9, &w[k + (k + 1) * + w_dim1], &c__1, (ftnlen)12); + i__1 = imax - k; + jmax = k - 1 + idamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1) + ; + rowmax = (d__1 = w[jmax + (k + 1) * w_dim1], abs(d__1)); + if (imax < *n) { + i__1 = *n - imax; + jmax = imax + idamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); +/* Computing MAX */ + d__2 = rowmax, d__3 = (d__1 = w[jmax + (k + 1) * w_dim1], + abs(d__1)); + rowmax = max(d__2,d__3); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else if ((d__1 = w[imax + (k + 1) * w_dim1], abs(d__1)) >= + alpha * rowmax) { + kp = imax; + i__1 = *n - k + 1; + dcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + kk = k + kstep - 1; + if (kp != kk) { + a[kp + kp * a_dim1] = a[kk + kk * a_dim1]; + i__1 = kp - kk - 1; + dcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + + 1) * a_dim1], lda); + if (kp < *n) { + i__1 = *n - kp; + dcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 + + kp * a_dim1], &c__1); + } + if (k > 1) { + i__1 = k - 1; + dswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); + } + dswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); + } + if (kstep == 1) { + i__1 = *n - k + 1; + dcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + if (k < *n) { + r1 = 1. / a[k + k * a_dim1]; + i__1 = *n - k; + dscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); + } + } else { + if (k < *n - 1) { + d21 = w[k + 1 + k * w_dim1]; + d11 = w[k + 1 + (k + 1) * w_dim1] / d21; + d22 = w[k + k * w_dim1] / d21; + t = 1. / (d11 * d22 - 1.); + d21 = t / d21; + i__1 = *n; + for (j = k + 2; j <= i__1; ++j) { + a[j + k * a_dim1] = d21 * (d11 * w[j + k * w_dim1] - + w[j + (k + 1) * w_dim1]); + a[j + (k + 1) * a_dim1] = d21 * (d22 * w[j + (k + 1) * + w_dim1] - w[j + k * w_dim1]); +/* L80: */ + } + } + a[k + k * a_dim1] = w[k + k * w_dim1]; + a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1]; + a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1]; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -kp; + ipiv[k + 1] = -kp; + } + k += kstep; + goto L70; +L90: + j = k - 1; +L120: + jj = j; + jp = ipiv[j]; + if (jp < 0) { + jp = -jp; + --j; + } + --j; + if (jp != jj && j >= 1) { + dswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda); + } + if (j > 1) { + goto L120; + } + *kb = k - 1; + } + return; +} diff --git a/relapack/src/dsytrf_rook.c b/relapack/src/dsytrf_rook.c new file mode 100644 index 000000000..19a875c7a --- /dev/null +++ b/relapack/src/dsytrf_rook.c @@ -0,0 +1,236 @@ +#include "relapack.h" +#if XSYTRF_ALLOW_MALLOC +#include <stdlib.h> +#endif + +static void RELAPACK_dsytrf_rook_rec(const char *, const int *, const int *, int *, + double *, const int *, int *, double *, const int *, int *); + + +/** DSYTRF_ROOK computes the factorization of a real symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. + * + * This routine is functionally equivalent to LAPACK's dsytrf_rook. 
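+ *
+ * The calling sequence is the same as RELAPACK_dsytrf's; only the
+ * bounded ("rook") pivoting strategy differs. A sketch (illustrative
+ * only), sizing the workspace directly as n * (n / 2), the amount the
+ * routine requires, instead of querying:
+ *
+ *     int lWork = n * (n / 2);
+ *     double *Work = malloc(lWork * sizeof(double));
+ *     RELAPACK_dsytrf_rook(&uplo, &n, A, &ldA, ipiv, Work, &lWork, &info);
+ *     free(Work);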
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/db/df4/dsytrf__rook_8f.html + * */ +void RELAPACK_dsytrf_rook( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + double *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * sizeof(double)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("DSYTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy argument + int nout; + + // Recursive kernel + RELAPACK_dsytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** dsytrf_rook's recursive compute kernel */ +static void RELAPACK_dsytrf_rook_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + double *A, const int *ldA, int *ipiv, + double *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_DSYTRF_ROOK, 3)) { + // Unblocked + if (top) { + LAPACK(dsytf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_dsytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const double ONE[] = { 1. }; + const double MONE[] = { -1. }; + const int iONE[] = { 1 }; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = DREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + double *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_dsytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + double *const A_BL = A + n1; + double *const A_BR = A + *ldA * n1 + n1; + double *const A_BL_B = A + *n; + double *const A_BR_B = A + *ldA * n1 + *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + double *const Work_BL = Work + n1; + double *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; + const int ldWork_BR = top ? 
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_dgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(dgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_dsytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + double *const A_BR_r = A_BR + *ldA * n2_out + n2_out; + + // last row of A_BL + double *const A_BL_b = A_BL + n2_out; + + // last row of Work_BL + double *const Work_BL_b = Work_BL + n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(dgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + int i; + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = DREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + double *const Work_R = top ? Work : Work + *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_dsytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + double *const A_TL_T = A + *ldA * n_rest; + double *const A_TR_T = A + *ldA * (n_rest + n1); + double *const A_TL = A + *ldA * n_rest + n_rest; + double *const A_TR = A + *ldA * (n_rest + n1) + n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + double *const Work_L = Work; + double *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_dgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(dgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_dsytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(dgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/dsytrf_rook_rec2.c b/relapack/src/dsytrf_rook_rec2.c new file mode 100644 index 000000000..105ef5ed3 --- /dev/null +++ b/relapack/src/dsytrf_rook_rec2.c @@ -0,0 +1,451 @@ +/* -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "f2c.h" + +/* Table of constant values */ + +static int c__1 = 1; +static double c_b9 = -1.; +static double c_b10 = 1.; + +/** DSYTRF_ROOK_REC2 computes a partial factorization of a real symmetric matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. + * + * This routine is a minor modification of LAPACK's dlasyf. + * It serves as an unblocked kernel in the recursive algorithms. + * The blocked BLAS Level 3 updates were removed and moved to the + * recursive algorithm. + * */ +/* Subroutine */ void RELAPACK_dsytrf_rook_rec2(char *uplo, int *n, + int *nb, int *kb, double *a, int *lda, int *ipiv, + double *w, int *ldw, int *info, ftnlen uplo_len) +{ + /* System generated locals */ + int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; + double d__1; + + /* Builtin functions */ + double sqrt(double); + + /* Local variables */ + static int j, k, p; + static double t, r1, d11, d12, d21, d22; + static int ii, jj, kk, kp, kw, jp1, jp2, kkw; + static logical done; + static int imax, jmax; + static double alpha; + extern /* Subroutine */ int dscal_(int *, double *, double *, + int *); + extern logical lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int dgemv_(char *, int *, int *, + double *, double *, int *, double *, int *, + double *, double *, int *, ftnlen); + static double dtemp, sfmin; + static int itemp; + extern /* Subroutine */ int dcopy_(int *, double *, int *, + double *, int *), dswap_(int *, double *, int + *, double *, int *); + static int kstep; + extern double dlamch_(char *, ftnlen); + static double absakk; + extern int idamax_(int *, double *, int *); + static double colmax, rowmax; + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --ipiv; + w_dim1 = *ldw; + w_offset = 1 + w_dim1; + w -= w_offset; + + /* Function Body */ + *info = 0; + alpha = (sqrt(17.) + 1.) / 8.; + sfmin = dlamch_("S", (ftnlen)1); + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + k = *n; +L10: + kw = *nb + k - *n; + if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { + goto L30; + } + kstep = 1; + p = k; + dcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); + if (k < *n) { + i__1 = *n - k; + dgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) * a_dim1 + 1], + lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b10, &w[kw * + w_dim1 + 1], &c__1, (ftnlen)12); + } + absakk = (d__1 = w[k + kw * w_dim1], abs(d__1)); + if (k > 1) { + i__1 = k - 1; + imax = idamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); + colmax = (d__1 = w[imax + kw * w_dim1], abs(d__1)); + } else { + colmax = 0.; + } + if (max(absakk,colmax) == 0.) { + if (*info == 0) { + *info = k; + } + kp = k; + dcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); + } else { + if (!
(absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L12: + dcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + i__1 = k - imax; + dcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + dgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b10, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + } + if (imax != k) { + i__1 = k - imax; + jmax = imax + idamax_(&i__1, &w[imax + 1 + (kw - 1) * + w_dim1], &c__1); + rowmax = (d__1 = w[jmax + (kw - 1) * w_dim1], abs(d__1)); + } else { + rowmax = 0.; + } + if (imax > 1) { + i__1 = imax - 1; + itemp = idamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); + dtemp = (d__1 = w[itemp + (kw - 1) * w_dim1], abs(d__1)); + if (dtemp > rowmax) { + rowmax = dtemp; + jmax = itemp; + } + } + if (! ((d__1 = w[imax + (kw - 1) * w_dim1], abs(d__1)) < + alpha * rowmax)) { + kp = imax; + dcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + dcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } + if (! done) { + goto L12; + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kstep == 2 && p != k) { + i__1 = k - p; + dcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * + a_dim1], lda); + dcopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], & + c__1); + i__1 = *n - k + 1; + dswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1], + lda); + i__1 = *n - kk + 1; + dswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], + ldw); + } + if (kp != kk) { + a[kp + k * a_dim1] = a[kk + k * a_dim1]; + i__1 = k - 1 - kp; + dcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + dcopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], & + c__1); + i__1 = *n - kk + 1; + dswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1], + lda); + i__1 = *n - kk + 1; + dswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + dcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + if (k > 1) { + if ((d__1 = a[k + k * a_dim1], abs(d__1)) >= sfmin) { + r1 = 1. / a[k + k * a_dim1]; + i__1 = k - 1; + dscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + } else if (a[k + k * a_dim1] != 0.) { + i__1 = k - 1; + for (ii = 1; ii <= i__1; ++ii) { + a[ii + k * a_dim1] /= a[k + k * a_dim1]; +/* L14: */ + } + } + } + } else { + if (k > 2) { + d12 = w[k - 1 + kw * w_dim1]; + d11 = w[k + kw * w_dim1] / d12; + d22 = w[k - 1 + (kw - 1) * w_dim1] / d12; + t = 1. 
/ (d11 * d22 - 1.); + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + a[j + (k - 1) * a_dim1] = t * ((d11 * w[j + (kw - 1) * + w_dim1] - w[j + kw * w_dim1]) / d12); + a[j + k * a_dim1] = t * ((d22 * w[j + kw * w_dim1] - + w[j + (kw - 1) * w_dim1]) / d12); +/* L20: */ + } + } + a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1]; + a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1]; + a[k + k * a_dim1] = w[k + kw * w_dim1]; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -p; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + kstep = 1; + jp1 = 1; + jj = j; + jp2 = ipiv[j]; + if (jp2 < 0) { + jp2 = -jp2; + ++j; + jp1 = -ipiv[j]; + kstep = 2; + } + ++j; + if (jp2 != jj && j <= *n) { + i__1 = *n - j + 1; + dswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + jj = j - 1; + if (jp1 != jj && kstep == 2) { + i__1 = *n - j + 1; + dswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + if (j <= *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + kstep = 1; + p = k; + i__1 = *n - k + 1; + dcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + dgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1], lda, & + w[k + w_dim1], ldw, &c_b10, &w[k + k * w_dim1], &c__1, ( + ftnlen)12); + } + absakk = (d__1 = w[k + k * w_dim1], abs(d__1)); + if (k < *n) { + i__1 = *n - k; + imax = k + idamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + colmax = (d__1 = w[imax + k * w_dim1], abs(d__1)); + } else { + colmax = 0.; + } + if (max(absakk,colmax) == 0.) { + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = *n - k + 1; + dcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + } else { + if (! (absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L72: + i__1 = imax - k; + dcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = *n - imax + 1; + dcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + + 1) * w_dim1], &c__1); + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + dgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1] + , lda, &w[imax + w_dim1], ldw, &c_b10, &w[k + (k + + 1) * w_dim1], &c__1, (ftnlen)12); + } + if (imax != k) { + i__1 = imax - k; + jmax = k - 1 + idamax_(&i__1, &w[k + (k + 1) * w_dim1], & + c__1); + rowmax = (d__1 = w[jmax + (k + 1) * w_dim1], abs(d__1)); + } else { + rowmax = 0.; + } + if (imax < *n) { + i__1 = *n - imax; + itemp = imax + idamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); + dtemp = (d__1 = w[itemp + (k + 1) * w_dim1], abs(d__1)); + if (dtemp > rowmax) { + rowmax = dtemp; + jmax = itemp; + } + } + if (! ((d__1 = w[imax + (k + 1) * w_dim1], abs(d__1)) < alpha + * rowmax)) { + kp = imax; + i__1 = *n - k + 1; + dcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + i__1 = *n - k + 1; + dcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + } + if (! 
done) { + goto L72; + } + } + kk = k + kstep - 1; + if (kstep == 2 && p != k) { + i__1 = p - k; + dcopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1], + lda); + i__1 = *n - p + 1; + dcopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], & + c__1); + dswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda); + dswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); + } + if (kp != kk) { + a[kp + k * a_dim1] = a[kk + k * a_dim1]; + i__1 = kp - k - 1; + dcopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1) + * a_dim1], lda); + i__1 = *n - kp + 1; + dcopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp * + a_dim1], &c__1); + dswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); + dswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); + } + if (kstep == 1) { + i__1 = *n - k + 1; + dcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + if (k < *n) { + if ((d__1 = a[k + k * a_dim1], abs(d__1)) >= sfmin) { + r1 = 1. / a[k + k * a_dim1]; + i__1 = *n - k; + dscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); + } else if (a[k + k * a_dim1] != 0.) { + i__1 = *n; + for (ii = k + 1; ii <= i__1; ++ii) { + a[ii + k * a_dim1] /= a[k + k * a_dim1]; +/* L74: */ + } + } + } + } else { + if (k < *n - 1) { + d21 = w[k + 1 + k * w_dim1]; + d11 = w[k + 1 + (k + 1) * w_dim1] / d21; + d22 = w[k + k * w_dim1] / d21; + t = 1. / (d11 * d22 - 1.); + i__1 = *n; + for (j = k + 2; j <= i__1; ++j) { + a[j + k * a_dim1] = t * ((d11 * w[j + k * w_dim1] - w[ + j + (k + 1) * w_dim1]) / d21); + a[j + (k + 1) * a_dim1] = t * ((d22 * w[j + (k + 1) * + w_dim1] - w[j + k * w_dim1]) / d21); +/* L80: */ + } + } + a[k + k * a_dim1] = w[k + k * w_dim1]; + a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1]; + a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1]; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -p; + ipiv[k + 1] = -kp; + } + k += kstep; + goto L70; +L90: + j = k - 1; +L120: + kstep = 1; + jp1 = 1; + jj = j; + jp2 = ipiv[j]; + if (jp2 < 0) { + jp2 = -jp2; + --j; + jp1 = -ipiv[j]; + kstep = 2; + } + --j; + if (jp2 != jj && j >= 1) { + dswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda); + } + jj = j + 1; + if (jp1 != jj && kstep == 2) { + dswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda); + } + if (j >= 1) { + goto L120; + } + *kb = k - 1; + } + return; +} diff --git a/relapack/src/dtgsyl.c b/relapack/src/dtgsyl.c new file mode 100644 index 000000000..c506926af --- /dev/null +++ b/relapack/src/dtgsyl.c @@ -0,0 +1,274 @@ +#include "relapack.h" +#include <math.h> + +static void RELAPACK_dtgsyl_rec(const char *, const int *, const int *, + const int *, const double *, const int *, const double *, const int *, + double *, const int *, const double *, const int *, const double *, + const int *, double *, const int *, double *, double *, double *, int *, + int *, int *); + + +/** DTGSYL solves the generalized Sylvester equation. + * + * This routine is functionally equivalent to LAPACK's dtgsyl. 
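 + * With trans = 'N', it solves the coupled matrix equations + * A * R - L * B = scale * C + * D * R - L * E = scale * F + * for the unknown m-by-n matrices R and L (returned in C and F), + * where the pairs (A, D) and (B, E) are in generalized Schur form.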
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/db/d88/dtgsyl_8f.html + * */ +void RELAPACK_dtgsyl( + const char *trans, const int *ijob, const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, + const double *D, const int *ldD, const double *E, const int *ldE, + double *F, const int *ldF, + double *scale, double *dif, + double *Work, const int *lWork, int *iWork, int *info +) { + + // Parse arguments + const int notran = LAPACK(lsame)(trans, "N"); + const int tran = LAPACK(lsame)(trans, "T"); + + // Compute work buffer size + int lwmin = 1; + if (notran && (*ijob == 1 || *ijob == 2)) + lwmin = MAX(1, 2 * *m * *n); + *info = 0; + + // Check arguments + if (!tran && !notran) + *info = -1; + else if (notran && (*ijob < 0 || *ijob > 4)) + *info = -2; + else if (*m <= 0) + *info = -3; + else if (*n <= 0) + *info = -4; + else if (*ldA < MAX(1, *m)) + *info = -6; + else if (*ldB < MAX(1, *n)) + *info = -8; + else if (*ldC < MAX(1, *m)) + *info = -10; + else if (*ldD < MAX(1, *m)) + *info = -12; + else if (*ldE < MAX(1, *n)) + *info = -14; + else if (*ldF < MAX(1, *m)) + *info = -16; + else if (*lWork < lwmin && *lWork != -1) + *info = -20; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("DTGSYL", &minfo); + return; + } + + if (*lWork == -1) { + // Work size query + *Work = lwmin; + return; + } + + // Clean char * arguments + const char cleantrans = notran ? 'N' : 'T'; + + // Constant + const double ZERO[] = { 0. }; + + int isolve = 1; + int ifunc = 0; + if (notran) { + if (*ijob >= 3) { + ifunc = *ijob - 2; + LAPACK(dlaset)("F", m, n, ZERO, ZERO, C, ldC); + LAPACK(dlaset)("F", m, n, ZERO, ZERO, F, ldF); + } else if (*ijob >= 1) + isolve = 2; + } + + double scale2; + int iround; + for (iround = 1; iround <= isolve; iround++) { + *scale = 1; + double dscale = 0; + double dsum = 1; + int pq; + RELAPACK_dtgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, iWork, &pq, info); + if (dscale != 0) { + if (*ijob == 1 || *ijob == 3) + *dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum)); + else + *dif = sqrt(pq) / (dscale * sqrt(dsum)); + } + if (isolve == 2) { + if (iround == 1) { + if (notran) + ifunc = *ijob; + scale2 = *scale; + LAPACK(dlacpy)("F", m, n, C, ldC, Work, m); + LAPACK(dlacpy)("F", m, n, F, ldF, Work + *m * *n, m); + LAPACK(dlaset)("F", m, n, ZERO, ZERO, C, ldC); + LAPACK(dlaset)("F", m, n, ZERO, ZERO, F, ldF); + } else { + LAPACK(dlacpy)("F", m, n, Work, m, C, ldC); + LAPACK(dlacpy)("F", m, n, Work + *m * *n, m, F, ldF); + *scale = scale2; + } + } + } +} + + +/** dtgsyl's recursive compute kernel */ +static void RELAPACK_dtgsyl_rec( + const char *trans, const int *ifunc, const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, + const double *D, const int *ldD, const double *E, const int *ldE, + double *F, const int *ldF, + double *scale, double *dsum, double *dscale, + int *iWork, int *pq, int *info +) { + + if (*m <= MAX(CROSSOVER_DTGSYL, 1) && *n <= MAX(CROSSOVER_DTGSYL, 1)) { + // Unblocked + LAPACK(dtgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, iWork, pq, info); + return; + } + + // Constants + const double ONE[] = { 1. }; + const double MONE[] = { -1. }; + const int iONE[] = { 1 }; + + // Outputs + double scale1[] = { 1. }; + double scale2[] = { 1. };
+ int info1[] = { 0 }; + int info2[] = { 0 }; + + if (*m > *n) { + // Splitting + int m1 = DREC_SPLIT(*m); + if (A[m1 + *ldA * (m1 - 1)]) + m1++; + const int m2 = *m - m1; + + // A_TL A_TR + // 0 A_BR + const double *const A_TL = A; + const double *const A_TR = A + *ldA * m1; + const double *const A_BR = A + *ldA * m1 + m1; + + // C_T + // C_B + double *const C_T = C; + double *const C_B = C + m1; + + // D_TL D_TR + // 0 D_BR + const double *const D_TL = D; + const double *const D_TR = D + *ldD * m1; + const double *const D_BR = D + *ldD * m1 + m1; + + // F_T + // F_B + double *const F_T = F; + double *const F_B = F + m1; + + if (*trans == 'N') { + // recursion(A_BR, B, C_B, D_BR, E, F_B) + RELAPACK_dtgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, iWork, pq, info1); + // C_T = C_T - A_TR * C_B + BLAS(dgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC); + // F_T = F_T - D_TR * C_B + BLAS(dgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF); + // recursion(A_TL, B, C_T, D_TL, E, F_T) + RELAPACK_dtgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, iWork, pq, info2); + // apply scale + if (scale2[0] != 1) { + LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info); + LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info); + } + } else { + // recursion(A_TL, B, C_T, D_TL, E, F_T) + RELAPACK_dtgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, iWork, pq, info1); + // apply scale + if (scale1[0] != 1) + LAPACK(dlascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info); + // C_B = C_B - A_TR^H * C_T + BLAS(dgemm)("T", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC); + // C_B = C_B - D_TR^H * F_T + BLAS(dgemm)("T", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldF, ONE, C_B, ldC); + // recursion(A_BR, B, C_B, D_BR, E, F_B) + RELAPACK_dtgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, iWork, pq, info2); + // apply scale + if (scale2[0] != 1) { + LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info); + LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info); + } + } + } else { + // Splitting + int n1 = DREC_SPLIT(*n); + if (B[n1 + *ldB * (n1 - 1)]) + n1++; + const int n2 = *n - n1; + + // B_TL B_TR + // 0 B_BR + const double *const B_TL = B; + const double *const B_TR = B + *ldB * n1; + const double *const B_BR = B + *ldB * n1 + n1; + + // C_L C_R + double *const C_L = C; + double *const C_R = C + *ldC * n1; + + // E_TL E_TR + // 0 E_BR + const double *const E_TL = E; + const double *const E_TR = E + *ldE * n1; + const double *const E_BR = E + *ldE * n1 + n1; + + // F_L F_R + double *const F_L = F; + double *const F_R = F + *ldF * n1; + + if (*trans == 'N') { + // recursion(A, B_TL, C_L, D, E_TL, F_L) + RELAPACK_dtgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, iWork, pq, info1); + // C_R = C_R + F_L * B_TR + BLAS(dgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC); + // F_R = F_R + F_L * E_TR + BLAS(dgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF); + // recursion(A, B_BR, C_R, D, E_BR, F_R) + RELAPACK_dtgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, iWork, pq, info2); + // apply scale
+ if (scale2[0] != 1) { + LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info); + LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info); + } + } else { + // recursion(A, B_BR, C_R, D, E_BR, F_R) + RELAPACK_dtgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, iWork, pq, info1); + // apply scale + if (scale1[0] != 1) + LAPACK(dlascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info); + // F_L = F_L + C_R * B_TR + BLAS(dgemm)("N", "T", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF); + // F_L = F_L + F_R * E_TR + BLAS(dgemm)("N", "T", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldE, ONE, F_L, ldF); + // recursion(A, B_TL, C_L, D, E_TL, F_L) + RELAPACK_dtgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, iWork, pq, info2); + // apply scale + if (scale2[0] != 1) { + LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info); + LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info); + } + } + } + + *scale = scale1[0] * scale2[0]; + *info = info1[0] || info2[0]; +} diff --git a/relapack/src/dtrsyl.c b/relapack/src/dtrsyl.c new file mode 100644 index 000000000..c87b53ae5 --- /dev/null +++ b/relapack/src/dtrsyl.c @@ -0,0 +1,169 @@ +#include "relapack.h" + +static void RELAPACK_dtrsyl_rec(const char *, const char *, const int *, + const int *, const int *, const double *, const int *, const double *, + const int *, double *, const int *, double *, int *); + + +/** DTRSYL solves the real Sylvester matrix equation. + * + * This routine is functionally equivalent to LAPACK's dtrsyl. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d6/d43/dtrsyl_8f.html + * */ +void RELAPACK_dtrsyl( + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, double *scale, + int *info +) { + + // Check arguments + const int notransA = LAPACK(lsame)(tranA, "N"); + const int transA = LAPACK(lsame)(tranA, "T"); + const int ctransA = LAPACK(lsame)(tranA, "C"); + const int notransB = LAPACK(lsame)(tranB, "N"); + const int transB = LAPACK(lsame)(tranB, "T"); + const int ctransB = LAPACK(lsame)(tranB, "C"); + *info = 0; + if (!transA && !ctransA && !notransA) + *info = -1; + else if (!transB && !ctransB && !notransB) + *info = -2; + else if (*isgn != 1 && *isgn != -1) + *info = -3; + else if (*m < 0) + *info = -4; + else if (*n < 0) + *info = -5; + else if (*ldA < MAX(1, *m)) + *info = -7; + else if (*ldB < MAX(1, *n)) + *info = -9; + else if (*ldC < MAX(1, *m)) + *info = -11; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("DTRSYL", &minfo); + return; + } + + // Clean char * arguments + const char cleantranA = notransA ? 'N' : (transA ? 'T' : 'C'); + const char cleantranB = notransB ? 'N' : (transB ? 'T' : 'C');
+ + // Recursive kernel + RELAPACK_dtrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); +} + + +/** dtrsyl's recursive compute kernel */ +static void RELAPACK_dtrsyl_rec( + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, double *scale, + int *info +) { + + if (*m <= MAX(CROSSOVER_DTRSYL, 1) && *n <= MAX(CROSSOVER_DTRSYL, 1)) { + // Unblocked + RELAPACK_dtrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); + return; + } + + // Constants + const double ONE[] = { 1. }; + const double MONE[] = { -1. }; + const double MSGN[] = { -*isgn }; + const int iONE[] = { 1 }; + + // Outputs + double scale1[] = { 1. }; + double scale2[] = { 1. }; + int info1[] = { 0 }; + int info2[] = { 0 }; + + if (*m > *n) { + // Splitting + int m1 = DREC_SPLIT(*m); + if (A[m1 + *ldA * (m1 - 1)]) + m1++; + const int m2 = *m - m1; + + // A_TL A_TR + // 0 A_BR + const double *const A_TL = A; + const double *const A_TR = A + *ldA * m1; + const double *const A_BR = A + *ldA * m1 + m1; + + // C_T + // C_B + double *const C_T = C; + double *const C_B = C + m1; + + if (*tranA == 'N') { + // recursion(A_BR, B, C_B) + RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1); + // C_T = C_T - A_TR * C_B + BLAS(dgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC); + // recursion(A_TL, B, C_T) + RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2); + // apply scale + if (scale2[0] != 1) + LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info); + } else { + // recursion(A_TL, B, C_T) + RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1); + // C_B = C_B - A_TR' * C_T + BLAS(dgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC); + // recursion(A_BR, B, C_B) + RELAPACK_dtrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2); + // apply scale + if (scale2[0] != 1) + LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info); + } + } else { + // Splitting + int n1 = DREC_SPLIT(*n); + if (B[n1 + *ldB * (n1 - 1)]) + n1++; + const int n2 = *n - n1; + + // B_TL B_TR + // 0 B_BR + const double *const B_TL = B; + const double *const B_TR = B + *ldB * n1; + const double *const B_BR = B + *ldB * n1 + n1; + + // C_L C_R + double *const C_L = C; + double *const C_R = C + *ldC * n1; + + if (*tranB == 'N') { + // recursion(A, B_TL, C_L) + RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1); + // C_R = C_R -/+ C_L * B_TR + BLAS(dgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC); + // recursion(A, B_BR, C_R) + RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2); + // apply scale + if (scale2[0] != 1) + LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info); + } else { + // recursion(A, B_BR, C_R) + RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1); + // C_L = C_L -/+ C_R * B_TR' + BLAS(dgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC); + // recursion(A, B_TL, C_L) + RELAPACK_dtrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2); + // apply scale + if (scale2[0] != 1) + LAPACK(dlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info); + } + } +
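+ // Combine the two half-problems: the scaling factors of the two + // recursive solves multiply, and the near-singularity flag + // (info == 1) from either half is propagated to the caller.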
+ *scale = scale1[0] * scale2[0]; + *info = info1[0] || info2[0]; +} diff --git a/relapack/src/dtrsyl_rec2.c b/relapack/src/dtrsyl_rec2.c new file mode 100644 index 000000000..479c7f340 --- /dev/null +++ b/relapack/src/dtrsyl_rec2.c @@ -0,0 +1,1034 @@ +/* -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "f2c.h" + +/* Table of constant values */ + +static int c__1 = 1; +static int c_false = FALSE_; +static int c__2 = 2; +static double c_b26 = 1.; +static double c_b30 = 0.; +static int c_true = TRUE_; + +int RELAPACK_dtrsyl_rec2(char *trana, char *tranb, int *isgn, int + *m, int *n, double *a, int *lda, double *b, int * + ldb, double *c__, int *ldc, double *scale, int *info, + ftnlen trana_len, ftnlen tranb_len) +{ + /* System generated locals */ + int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4; + double d__1, d__2; + + /* Local variables */ + static int j, k, l; + static double x[4] /* was [2][2] */; + static int k1, k2, l1, l2; + static double a11, db, da11, vec[4] /* was [2][2] */, dum[1], eps, + sgn; + extern double ddot_(int *, double *, int *, double *, + int *); + static int ierr; + static double smin, suml, sumr; + extern /* Subroutine */ int dscal_(int *, double *, double *, + int *); + extern int lsame_(char *, char *, ftnlen, ftnlen); + static int knext, lnext; + static double xnorm; + extern /* Subroutine */ int dlaln2_(int *, int *, int *, + double *, double *, double *, int *, double *, + double *, double *, int *, double *, double * + , double *, int *, double *, double *, int *), + dlasy2_(int *, int *, int *, int *, int *, + double *, int *, double *, int *, double *, + int *, double *, double *, int *, double *, + int *), dlabad_(double *, double *); + extern double dlamch_(char *, ftnlen), dlange_(char *, int *, + int *, double *, int *, double *, ftnlen); + static double scaloc; + extern /* Subroutine */ int xerbla_(char *, int *, ftnlen); + static double bignum; + static int notrna, notrnb; + static double smlnum; + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1; + c__ -= c_offset; + + /* Function Body */ + notrna = lsame_(trana, "N", (ftnlen)1, (ftnlen)1); + notrnb = lsame_(tranb, "N", (ftnlen)1, (ftnlen)1); + *info = 0; + if (! notrna && ! lsame_(trana, "T", (ftnlen)1, (ftnlen)1) && ! lsame_( + trana, "C", (ftnlen)1, (ftnlen)1)) { + *info = -1; + } else if (! notrnb && ! lsame_(tranb, "T", (ftnlen)1, (ftnlen)1) && ! + lsame_(tranb, "C", (ftnlen)1, (ftnlen)1)) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < max(1,*m)) { + *info = -7; + } else if (*ldb < max(1,*n)) { + *info = -9; + } else if (*ldc < max(1,*m)) { + *info = -11; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("DTRSYL", &i__1, (ftnlen)6); + return 0; + } + *scale = 1.; + if (*m == 0 || *n == 0) { + return 0; + } + eps = dlamch_("P", (ftnlen)1); + smlnum = dlamch_("S", (ftnlen)1); + bignum = 1. 
/ smlnum; + dlabad_(&smlnum, &bignum); + smlnum = smlnum * (double) (*m * *n) / eps; + bignum = 1. / smlnum; +/* Computing MAX */ + d__1 = smlnum, d__2 = eps * dlange_("M", m, m, &a[a_offset], lda, dum, ( + ftnlen)1), d__1 = max(d__1,d__2), d__2 = eps * dlange_("M", n, n, + &b[b_offset], ldb, dum, (ftnlen)1); + smin = max(d__1,d__2); + sgn = (double) (*isgn); + if (notrna && notrnb) { + lnext = 1; + i__1 = *n; + for (l = 1; l <= i__1; ++l) { + if (l < lnext) { + goto L60; + } + if (l == *n) { + l1 = l; + l2 = l; + } else { + if (b[l + 1 + l * b_dim1] != 0.) { + l1 = l; + l2 = l + 1; + lnext = l + 2; + } else { + l1 = l; + l2 = l; + lnext = l + 1; + } + } + knext = *m; + for (k = *m; k >= 1; --k) { + if (k > knext) { + goto L50; + } + if (k == 1) { + k1 = k; + k2 = k; + } else { + if (a[k + (k - 1) * a_dim1] != 0.) { + k1 = k - 1; + k2 = k; + knext = k - 2; + } else { + k1 = k; + k2 = k; + knext = k - 1; + } + } + if (l1 == l2 && k1 == k2) { + i__2 = *m - k1; +/* Computing MIN */ + i__3 = k1 + 1; +/* Computing MIN */ + i__4 = k1 + 1; + suml = ddot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = ddot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + scaloc = 1.; + a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1]; + da11 = abs(a11); + if (da11 <= smin) { + a11 = smin; + da11 = smin; + *info = 1; + } + db = abs(vec[0]); + if (da11 < 1. && db > 1.) { + if (db > bignum * da11) { + scaloc = 1. / db; + } + } + x[0] = vec[0] * scaloc / a11; + if (scaloc != 1.) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L10: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + } else if (l1 == l2 && k1 != k2) { + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = ddot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = ddot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = ddot_(&i__2, &a[k2 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = ddot_(&i__2, &c__[k2 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + d__1 = -sgn * b[l1 + l1 * b_dim1]; + dlaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1 + * a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &d__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) 
{ + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L20: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k2 + l1 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 == k2) { + i__2 = *m - k1; +/* Computing MIN */ + i__3 = k1 + 1; +/* Computing MIN */ + i__4 = k1 + 1; + suml = ddot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = ddot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn * + sumr)); + i__2 = *m - k1; +/* Computing MIN */ + i__3 = k1 + 1; +/* Computing MIN */ + i__4 = k1 + 1; + suml = ddot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l2 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = ddot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn * + sumr)); + d__1 = -sgn * a[k1 + k1 * a_dim1]; + dlaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1 * + b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &d__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L30: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 != k2) { + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = ddot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = ddot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = ddot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l2 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = ddot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr); + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = ddot_(&i__2, &a[k2 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = ddot_(&i__2, &c__[k2 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = ddot_(&i__2, &a[k2 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l2 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = ddot_(&i__2, &c__[k2 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr); + dlasy2_(&c_false, &c_false, isgn, &c__2, &c__2, &a[k1 + + k1 * a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec, + &c__2, &scaloc, x, &c__2, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L40: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[2]; + c__[k2 + l1 * c_dim1] = x[1]; + c__[k2 + l2 * c_dim1] = x[3]; + } +L50: + ; + } +L60: + ; + } + } else if (! 
notrna && notrnb) { + lnext = 1; + i__1 = *n; + for (l = 1; l <= i__1; ++l) { + if (l < lnext) { + goto L120; + } + if (l == *n) { + l1 = l; + l2 = l; + } else { + if (b[l + 1 + l * b_dim1] != 0.) { + l1 = l; + l2 = l + 1; + lnext = l + 2; + } else { + l1 = l; + l2 = l; + lnext = l + 1; + } + } + knext = 1; + i__2 = *m; + for (k = 1; k <= i__2; ++k) { + if (k < knext) { + goto L110; + } + if (k == *m) { + k1 = k; + k2 = k; + } else { + if (a[k + 1 + k * a_dim1] != 0.) { + k1 = k; + k2 = k + 1; + knext = k + 2; + } else { + k1 = k; + k2 = k; + knext = k + 1; + } + } + if (l1 == l2 && k1 == k2) { + i__3 = k1 - 1; + suml = ddot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = ddot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + scaloc = 1.; + a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1]; + da11 = abs(a11); + if (da11 <= smin) { + a11 = smin; + da11 = smin; + *info = 1; + } + db = abs(vec[0]); + if (da11 < 1. && db > 1.) { + if (db > bignum * da11) { + scaloc = 1. / db; + } + } + x[0] = vec[0] * scaloc / a11; + if (scaloc != 1.) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L70: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + } else if (l1 == l2 && k1 != k2) { + i__3 = k1 - 1; + suml = ddot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = ddot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__3 = k1 - 1; + suml = ddot_(&i__3, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = ddot_(&i__3, &c__[k2 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + d__1 = -sgn * b[l1 + l1 * b_dim1]; + dlaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1 * + a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &d__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L80: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k2 + l1 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 == k2) { + i__3 = k1 - 1; + suml = ddot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = ddot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn * + sumr)); + i__3 = k1 - 1; + suml = ddot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = ddot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn * + sumr)); + d__1 = -sgn * a[k1 + k1 * a_dim1]; + dlaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1 * + b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &d__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) 
{ + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L90: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 != k2) { + i__3 = k1 - 1; + suml = ddot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = ddot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__3 = k1 - 1; + suml = ddot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = ddot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr); + i__3 = k1 - 1; + suml = ddot_(&i__3, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = ddot_(&i__3, &c__[k2 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + i__3 = k1 - 1; + suml = ddot_(&i__3, &a[k2 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = ddot_(&i__3, &c__[k2 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr); + dlasy2_(&c_true, &c_false, isgn, &c__2, &c__2, &a[k1 + k1 + * a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec, & + c__2, &scaloc, x, &c__2, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L100: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[2]; + c__[k2 + l1 * c_dim1] = x[1]; + c__[k2 + l2 * c_dim1] = x[3]; + } +L110: + ; + } +L120: + ; + } + } else if (! notrna && ! notrnb) { + lnext = *n; + for (l = *n; l >= 1; --l) { + if (l > lnext) { + goto L180; + } + if (l == 1) { + l1 = l; + l2 = l; + } else { + if (b[l + (l - 1) * b_dim1] != 0.) { + l1 = l - 1; + l2 = l; + lnext = l - 2; + } else { + l1 = l; + l2 = l; + lnext = l - 1; + } + } + knext = 1; + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + if (k < knext) { + goto L170; + } + if (k == *m) { + k1 = k; + k2 = k; + } else { + if (a[k + 1 + k * a_dim1] != 0.) { + k1 = k; + k2 = k + 1; + knext = k + 2; + } else { + k1 = k; + k2 = k; + knext = k + 1; + } + } + if (l1 == l2 && k1 == k2) { + i__2 = k1 - 1; + suml = ddot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l1; +/* Computing MIN */ + i__3 = l1 + 1; +/* Computing MIN */ + i__4 = l1 + 1; + sumr = ddot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + scaloc = 1.; + a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1]; + da11 = abs(a11); + if (da11 <= smin) { + a11 = smin; + da11 = smin; + *info = 1; + } + db = abs(vec[0]); + if (da11 < 1. && db > 1.) { + if (db > bignum * da11) { + scaloc = 1. / db; + } + } + x[0] = vec[0] * scaloc / a11; + if (scaloc != 1.) 
{ + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L130: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + } else if (l1 == l2 && k1 != k2) { + i__2 = k1 - 1; + suml = ddot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = ddot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = k1 - 1; + suml = ddot_(&i__2, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = ddot_(&i__2, &c__[k2 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + d__1 = -sgn * b[l1 + l1 * b_dim1]; + dlaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1 * + a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &d__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L140: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k2 + l1 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 == k2) { + i__2 = k1 - 1; + suml = ddot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = ddot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn * + sumr)); + i__2 = k1 - 1; + suml = ddot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = ddot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l2 + min(i__4,*n) * b_dim1], ldb); + vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn * + sumr)); + d__1 = -sgn * a[k1 + k1 * a_dim1]; + dlaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1 + * b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &d__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) 
{ + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L150: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 != k2) { + i__2 = k1 - 1; + suml = ddot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = ddot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = k1 - 1; + suml = ddot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = ddot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l2 + min(i__4,*n) * b_dim1], ldb); + vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr); + i__2 = k1 - 1; + suml = ddot_(&i__2, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = ddot_(&i__2, &c__[k2 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = k1 - 1; + suml = ddot_(&i__2, &a[k2 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = ddot_(&i__2, &c__[k2 + min(i__3,*n) * c_dim1], ldc, + &b[l2 + min(i__4,*n) * b_dim1], ldb); + vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr); + dlasy2_(&c_true, &c_true, isgn, &c__2, &c__2, &a[k1 + k1 * + a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec, & + c__2, &scaloc, x, &c__2, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L160: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[2]; + c__[k2 + l1 * c_dim1] = x[1]; + c__[k2 + l2 * c_dim1] = x[3]; + } +L170: + ; + } +L180: + ; + } + } else if (notrna && ! notrnb) { + lnext = *n; + for (l = *n; l >= 1; --l) { + if (l > lnext) { + goto L240; + } + if (l == 1) { + l1 = l; + l2 = l; + } else { + if (b[l + (l - 1) * b_dim1] != 0.) { + l1 = l - 1; + l2 = l; + lnext = l - 2; + } else { + l1 = l; + l2 = l; + lnext = l - 1; + } + } + knext = *m; + for (k = *m; k >= 1; --k) { + if (k > knext) { + goto L230; + } + if (k == 1) { + k1 = k; + k2 = k; + } else { + if (a[k + (k - 1) * a_dim1] != 0.) { + k1 = k - 1; + k2 = k; + knext = k - 2; + } else { + k1 = k; + k2 = k; + knext = k - 1; + } + } + if (l1 == l2 && k1 == k2) { + i__1 = *m - k1; +/* Computing MIN */ + i__2 = k1 + 1; +/* Computing MIN */ + i__3 = k1 + 1; + suml = ddot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l1; +/* Computing MIN */ + i__2 = l1 + 1; +/* Computing MIN */ + i__3 = l1 + 1; + sumr = ddot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + scaloc = 1.; + a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1]; + da11 = abs(a11); + if (da11 <= smin) { + a11 = smin; + da11 = smin; + *info = 1; + } + db = abs(vec[0]); + if (da11 < 1. && db > 1.) { + if (db > bignum * da11) { + scaloc = 1. / db; + } + } + x[0] = vec[0] * scaloc / a11; + if (scaloc != 1.) 
{ + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L190: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + } else if (l1 == l2 && k1 != k2) { + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = ddot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = ddot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = ddot_(&i__1, &a[k2 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = ddot_(&i__1, &c__[k2 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + d__1 = -sgn * b[l1 + l1 * b_dim1]; + dlaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1 + * a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &d__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L200: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k2 + l1 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 == k2) { + i__1 = *m - k1; +/* Computing MIN */ + i__2 = k1 + 1; +/* Computing MIN */ + i__3 = k1 + 1; + suml = ddot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = ddot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn * + sumr)); + i__1 = *m - k1; +/* Computing MIN */ + i__2 = k1 + 1; +/* Computing MIN */ + i__3 = k1 + 1; + suml = ddot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l2 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = ddot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l2 + min(i__3,*n) * b_dim1], ldb); + vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn * + sumr)); + d__1 = -sgn * a[k1 + k1 * a_dim1]; + dlaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1 + * b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &d__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) 
{ + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L210: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 != k2) { + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = ddot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = ddot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = ddot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l2 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = ddot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l2 + min(i__3,*n) * b_dim1], ldb); + vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr); + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = ddot_(&i__1, &a[k2 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = ddot_(&i__1, &c__[k2 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = ddot_(&i__1, &a[k2 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l2 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = ddot_(&i__1, &c__[k2 + min(i__2,*n) * c_dim1], ldc, + &b[l2 + min(i__3,*n) * b_dim1], ldb); + vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr); + dlasy2_(&c_false, &c_true, isgn, &c__2, &c__2, &a[k1 + k1 + * a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec, & + c__2, &scaloc, x, &c__2, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + dscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L220: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[2]; + c__[k2 + l1 * c_dim1] = x[1]; + c__[k2 + l2 * c_dim1] = x[3]; + } +L230: + ; + } +L240: + ; + } + } + return 0; +} diff --git a/relapack/src/dtrtri.c b/relapack/src/dtrtri.c new file mode 100644 index 000000000..0462609e9 --- /dev/null +++ b/relapack/src/dtrtri.c @@ -0,0 +1,107 @@ +#include "relapack.h" + +static void RELAPACK_dtrtri_rec(const char *, const char *, const int *, + double *, const int *, int *); + + +/** DTRTRI computes the inverse of a real upper or lower triangular matrix A. + * + * This routine is functionally equivalent to LAPACK's dtrtri. 
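 + * The inverse is computed blockwise: the two diagonal blocks are + * inverted recursively and the off-diagonal block is updated with + * DTRMM and DTRSM (see RELAPACK_dtrtri_rec below).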
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d5/dba/dtrtri_8f.html + * */ +void RELAPACK_dtrtri( + const char *uplo, const char *diag, const int *n, + double *A, const int *ldA, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + const int nounit = LAPACK(lsame)(diag, "N"); + const int unit = LAPACK(lsame)(diag, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (!nounit && !unit) + *info = -2; + else if (*n < 0) + *info = -3; + else if (*ldA < MAX(1, *n)) + *info = -5; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("DTRTRI", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + const char cleandiag = nounit ? 'N' : 'U'; + + // check for singularity + if (nounit) { + int i; + for (i = 0; i < *n; i++) + if (A[i + *ldA * i] == 0) { + *info = i + 1; + return; + } + } + + // Recursive kernel + RELAPACK_dtrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info); +} + + +/** dtrtri's recursive compute kernel */ +static void RELAPACK_dtrtri_rec( + const char *uplo, const char *diag, const int *n, + double *A, const int *ldA, + int *info +){ + + if (*n <= MAX(CROSSOVER_DTRTRI, 1)) { + // Unblocked + LAPACK(dtrti2)(uplo, diag, n, A, ldA, info); + return; + } + + // Constants + const double ONE[] = { 1. }; + const double MONE[] = { -1. }; + + // Splitting + const int n1 = DREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + double *const A_TL = A; + double *const A_TR = A + *ldA * n1; + double *const A_BL = A + n1; + double *const A_BR = A + *ldA * n1 + n1; + + // recursion(A_TL) + RELAPACK_dtrtri_rec(uplo, diag, &n1, A_TL, ldA, info); + if (*info) + return; + + if (*uplo == 'L') { + // A_BL = - A_BL * A_TL + BLAS(dtrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA); + // A_BL = A_BR \ A_BL + BLAS(dtrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); + } else { + // A_TR = - A_TL * A_TR + BLAS(dtrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA); + // A_TR = A_TR / A_BR + BLAS(dtrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); + } + + // recursion(A_BR) + RELAPACK_dtrtri_rec(uplo, diag, &n2, A_BR, ldA, info); + if (*info) + *info += n1; +} diff --git a/relapack/src/f2c.c b/relapack/src/f2c.c new file mode 100644 index 000000000..5a3452419 --- /dev/null +++ b/relapack/src/f2c.c @@ -0,0 +1,109 @@ +#include "stdlib.h" +#include "stdio.h" +#include "signal.h" +#include "f2c.h" + +#ifndef SIGIOT +#ifdef SIGABRT +#define SIGIOT SIGABRT +#endif +#endif + +void sig_die(const char *s, int kill) { + /* print error message, then clear buffers */ + fprintf(stderr, "%s\n", s); + + if(kill) { + fflush(stderr); + /* now get a core */ + signal(SIGIOT, SIG_DFL); + abort(); + } else + exit(1); +} + +void c_div(complex *c, complex *a, complex *b) { + double ratio, den; + double abr, abi, cr; + + if( (abr = b->r) < 0.) + abr = - abr; + if( (abi = b->i) < 0.)
+ abi = - abi; + if( abr <= abi ) { + if(abi == 0) { +#ifdef IEEE_COMPLEX_DIVIDE + float af, bf; + af = bf = abr; + if (a->i != 0 || a->r != 0) + af = 1.; + c->i = c->r = af / bf; + return; +#else + sig_die("complex division by zero", 1); +#endif + } + ratio = (double)b->r / b->i ; + den = b->i * (1 + ratio*ratio); + cr = (a->r*ratio + a->i) / den; + c->i = (a->i*ratio - a->r) / den; + } else { + ratio = (double)b->i / b->r ; + den = b->r * (1 + ratio*ratio); + cr = (a->r + a->i*ratio) / den; + c->i = (a->i - a->r*ratio) / den; + } + c->r = cr; +} + +void z_div(doublecomplex *c, doublecomplex *a, doublecomplex *b) { + double ratio, den; + double abr, abi, cr; + + if( (abr = b->r) < 0.) + abr = - abr; + if( (abi = b->i) < 0.) + abi = - abi; + if( abr <= abi ) { + if(abi == 0) { +#ifdef IEEE_COMPLEX_DIVIDE + if (a->i != 0 || a->r != 0) + abi = 1.; + c->i = c->r = abi / abr; + return; +#else + sig_die("complex division by zero", 1); +#endif + } + ratio = b->r / b->i ; + den = b->i * (1 + ratio*ratio); + cr = (a->r*ratio + a->i) / den; + c->i = (a->i*ratio - a->r) / den; + } else { + ratio = b->i / b->r ; + den = b->r * (1 + ratio*ratio); + cr = (a->r + a->i*ratio) / den; + c->i = (a->i - a->r*ratio) / den; + } + c->r = cr; +} + +float r_imag(complex *z) { + return z->i; +} + +void r_cnjg(complex *r, complex *z) { + float zi = z->i; + r->r = z->r; + r->i = -zi; +} + +double d_imag(doublecomplex *z) { + return z->i; +} + +void d_cnjg(doublecomplex *r, doublecomplex *z) { + double zi = z->i; + r->r = z->r; + r->i = -zi; +} diff --git a/relapack/src/f2c.h b/relapack/src/f2c.h new file mode 100644 index 000000000..b94ee7c8e --- /dev/null +++ b/relapack/src/f2c.h @@ -0,0 +1,223 @@ +/* f2c.h -- Standard Fortran to C header file */ + +/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." + + - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */ + +#ifndef F2C_INCLUDE +#define F2C_INCLUDE + +typedef long int integer; +typedef unsigned long int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +typedef long int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; +#ifdef INTEGER_STAR_8 /* Adjust for integer*8. 
*/ +typedef long long longint; /* system-dependent */ +typedef unsigned long long ulongint; /* system-dependent */ +#define qbit_clear(a,b) ((a) & ~((ulongint)1 << (b))) +#define qbit_set(a,b) ((a) | ((ulongint)1 << (b))) +#endif + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +#ifdef f2c_i2 +/* for -i2 */ +typedef short flag; +typedef short ftnlen; +typedef short ftnint; +#else +typedef long int flag; +typedef long int ftnlen; +typedef long int ftnint; +#endif + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +/*typedef long int Long;*/ /* No longer used; formerly in Namelist */ + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (doublereal)abs(x) +#define min(a,b) ((a) <= (b) ? (a) : (b)) +#define max(a,b) ((a) >= (b) ? 
(a) : (b)) +#define dmin(a,b) (doublereal)min(a,b) +#define dmax(a,b) (doublereal)max(a,b) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef int /* Unknown procedure type */ (*U_fp)(...); +typedef shortint (*J_fp)(...); +typedef integer (*I_fp)(...); +typedef real (*R_fp)(...); +typedef doublereal (*D_fp)(...), (*E_fp)(...); +typedef /* Complex */ VOID (*C_fp)(...); +typedef /* Double Complex */ VOID (*Z_fp)(...); +typedef logical (*L_fp)(...); +typedef shortlogical (*K_fp)(...); +typedef /* Character */ VOID (*H_fp)(...); +typedef /* Subroutine */ int (*S_fp)(...); +#else +typedef int /* Unknown procedure type */ (*U_fp)(); +typedef shortint (*J_fp)(); +typedef integer (*I_fp)(); +typedef real (*R_fp)(); +typedef doublereal (*D_fp)(), (*E_fp)(); +typedef /* Complex */ VOID (*C_fp)(); +typedef /* Double Complex */ VOID (*Z_fp)(); +typedef logical (*L_fp)(); +typedef shortlogical (*K_fp)(); +typedef /* Character */ VOID (*H_fp)(); +typedef /* Subroutine */ int (*S_fp)(); +#endif +/* E_fp is for real functions when -R is not specified */ +typedef VOID C_f; /* complex function */ +typedef VOID H_f; /* character function */ +typedef VOID Z_f; /* double complex function */ +typedef doublereal E_f; /* real function with -R not specified */ + +/* undef any lower-case symbols that your C compiler predefines, e.g.: */ + +#ifndef Skip_f2c_Undefs +#undef cray +#undef gcos +#undef mc68010 +#undef mc68020 +#undef mips +#undef pdp11 +#undef sgi +#undef sparc +#undef sun +#undef sun2 +#undef sun3 +#undef sun4 +#undef u370 +#undef u3b +#undef u3b2 +#undef u3b5 +#undef unix +#undef vax +#endif +#endif diff --git a/relapack/src/lapack.h b/relapack/src/lapack.h new file mode 100644 index 000000000..064276b7e --- /dev/null +++ b/relapack/src/lapack.h @@ -0,0 +1,80 @@ +#ifndef LAPACK_H +#define LAPACK_H + +extern int LAPACK(lsame)(const char *, const char *); +extern int LAPACK(xerbla)(const char *, const int *); + +extern void LAPACK(slaswp)(const int *, float *, const int *, const int *, const int *, const int *, const int *); +extern void LAPACK(dlaswp)(const int *, double *, const int *, const int *, const int *, const int *, const int *); +extern void LAPACK(claswp)(const int *, float *, const int *, const int *, const int *, const int *, const int *); +extern void LAPACK(zlaswp)(const int *, double *, const int *, const int *, const int *, const int *, const int *); + +extern void LAPACK(slaset)(const char *, const int *, const int *, const float *, const float *, float *, const int *); +extern void LAPACK(dlaset)(const char *, const int *, const int *, const double *, const double *, double *, const int *); +extern void LAPACK(claset)(const char *, const int *, const int *, const float *, const float *, float *, const int *); +extern void LAPACK(zlaset)(const char *, const int *, const int *, const double *, const double *, double *, const int *); + +extern void LAPACK(slacpy)(const char *, const int *, const int *, const float *, const int *, float *, const int *); +extern void LAPACK(dlacpy)(const char *, const int *, const int *, const double *, const int *, double *, const int *); +extern void LAPACK(clacpy)(const char *, const int *, const int *, const float *, const int *, float *, const int *); +extern void LAPACK(zlacpy)(const char *, const int *, const int *, const double *, const int *, 
double *, const int *); + +extern void LAPACK(slascl)(const char *, const int *, const int *, const float *, const float *, const int *, const int *, float *, const int *, int *); +extern void LAPACK(dlascl)(const char *, const int *, const int *, const double *, const double *, const int *, const int *, double *, const int *, int *); +extern void LAPACK(clascl)(const char *, const int *, const int *, const float *, const float *, const int *, const int *, float *, const int *, int *); +extern void LAPACK(zlascl)(const char *, const int *, const int *, const double *, const double *, const int *, const int *, double *, const int *, int *); + +extern void LAPACK(slauu2)(const char *, const int *, float *, const int *, int *); +extern void LAPACK(dlauu2)(const char *, const int *, double *, const int *, int *); +extern void LAPACK(clauu2)(const char *, const int *, float *, const int *, int *); +extern void LAPACK(zlauu2)(const char *, const int *, double *, const int *, int *); + +extern void LAPACK(ssygs2)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); +extern void LAPACK(dsygs2)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); +extern void LAPACK(chegs2)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); +extern void LAPACK(zhegs2)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); + +extern void LAPACK(strti2)(const char *, const char *, const int *, float *, const int *, int *); +extern void LAPACK(dtrti2)(const char *, const char *, const int *, double *, const int *, int *); +extern void LAPACK(ctrti2)(const char *, const char *, const int *, float *, const int *, int *); +extern void LAPACK(ztrti2)(const char *, const char *, const int *, double *, const int *, int *); + +extern void LAPACK(spotf2)(const char *, const int *, float *, const int *, int *); +extern void LAPACK(dpotf2)(const char *, const int *, double *, const int *, int *); +extern void LAPACK(cpotf2)(const char *, const int *, float *, const int *, int *); +extern void LAPACK(zpotf2)(const char *, const int *, double *, const int *, int *); + +extern void LAPACK(spbtf2)(const char *, const int *, const int *, float *, const int *, int *); +extern void LAPACK(dpbtf2)(const char *, const int *, const int *, double *, const int *, int *); +extern void LAPACK(cpbtf2)(const char *, const int *, const int *, float *, const int *, int *); +extern void LAPACK(zpbtf2)(const char *, const int *, const int *, double *, const int *, int *); + +extern void LAPACK(ssytf2)(const char *, const int *, float *, const int *, int *, int *); +extern void LAPACK(dsytf2)(const char *, const int *, double *, const int *, int *, int *); +extern void LAPACK(csytf2)(const char *, const int *, float *, const int *, int *, int *); +extern void LAPACK(chetf2)(const char *, const int *, float *, const int *, int *, int *); +extern void LAPACK(zsytf2)(const char *, const int *, double *, const int *, int *, int *); +extern void LAPACK(zhetf2)(const char *, const int *, double *, const int *, int *, int *); +extern void LAPACK(ssytf2_rook)(const char *, const int *, float *, const int *, int *, int *); +extern void LAPACK(dsytf2_rook)(const char *, const int *, double *, const int *, int *, int *); +extern void LAPACK(csytf2_rook)(const char *, const int *, float *, const int *, int *, int *); +extern void LAPACK(chetf2_rook)(const char *, const int *, 
float *, const int *, int *, int *); +extern void LAPACK(zsytf2_rook)(const char *, const int *, double *, const int *, int *, int *); +extern void LAPACK(zhetf2_rook)(const char *, const int *, double *, const int *, int *, int *); + +extern void LAPACK(sgetf2)(const int *, const int *, float *, const int *, int *, int *); +extern void LAPACK(dgetf2)(const int *, const int *, double *, const int *, int *, int *); +extern void LAPACK(cgetf2)(const int *, const int *, float *, const int *, int *, int *); +extern void LAPACK(zgetf2)(const int *, const int *, double *, const int *, int *, int *); + +extern void LAPACK(sgbtf2)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); +extern void LAPACK(dgbtf2)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); +extern void LAPACK(cgbtf2)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); +extern void LAPACK(zgbtf2)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); + +extern void LAPACK(stgsy2)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, int *, int *, int *); +extern void LAPACK(dtgsy2)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, int *, int *, int *); +extern void LAPACK(ctgsy2)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, int *); +extern void LAPACK(ztgsy2)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, int *); + +#endif /* LAPACK_H */ diff --git a/relapack/src/lapack_wrappers.c b/relapack/src/lapack_wrappers.c new file mode 100644 index 000000000..488547260 --- /dev/null +++ b/relapack/src/lapack_wrappers.c @@ -0,0 +1,607 @@ +#include "relapack.h" + +//////////// +// XLAUUM // +//////////// + +#if INCLUDE_SLAUUM +void LAPACK(slauum)( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + RELAPACK_slauum(uplo, n, A, ldA, info); +} +#endif + +#if INCLUDE_DLAUUM +void LAPACK(dlauum)( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + RELAPACK_dlauum(uplo, n, A, ldA, info); +} +#endif + +#if INCLUDE_CLAUUM +void LAPACK(clauum)( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + RELAPACK_clauum(uplo, n, A, ldA, info); +} +#endif + +#if INCLUDE_ZLAUUM +void LAPACK(zlauum)( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + RELAPACK_zlauum(uplo, n, A, ldA, info); +} +#endif + + +//////////// +// XSYGST // +//////////// + +#if INCLUDE_SSYGST +void LAPACK(ssygst)( + const int *itype, const char *uplo, const int *n, + float *A, const int *ldA, const float *B, const int *ldB, + int *info +) { + RELAPACK_ssygst(itype, uplo, n, A, ldA, B, ldB, info); +} +#endif + +#if INCLUDE_DSYGST +void LAPACK(dsygst)( + const int *itype, 
const char *uplo, const int *n, + double *A, const int *ldA, const double *B, const int *ldB, + int *info +) { + RELAPACK_dsygst(itype, uplo, n, A, ldA, B, ldB, info); +} +#endif + +#if INCLUDE_CHEGST +void LAPACK(chegst)( + const int *itype, const char *uplo, const int *n, + float *A, const int *ldA, const float *B, const int *ldB, + int *info +) { + RELAPACK_chegst(itype, uplo, n, A, ldA, B, ldB, info); +} +#endif + +#if INCLUDE_ZHEGST +void LAPACK(zhegst)( + const int *itype, const char *uplo, const int *n, + double *A, const int *ldA, const double *B, const int *ldB, + int *info +) { + RELAPACK_zhegst(itype, uplo, n, A, ldA, B, ldB, info); +} +#endif + + +//////////// +// XTRTRI // +//////////// + +#if INCLUDE_STRTRI +void LAPACK(strtri)( + const char *uplo, const char *diag, const int *n, + float *A, const int *ldA, + int *info +) { + RELAPACK_strtri(uplo, diag, n, A, ldA, info); +} +#endif + +#if INCLUDE_DTRTRI +void LAPACK(dtrtri)( + const char *uplo, const char *diag, const int *n, + double *A, const int *ldA, + int *info +) { + RELAPACK_dtrtri(uplo, diag, n, A, ldA, info); +} +#endif + +#if INCLUDE_CTRTRI +void LAPACK(ctrtri)( + const char *uplo, const char *diag, const int *n, + float *A, const int *ldA, + int *info +) { + RELAPACK_ctrtri(uplo, diag, n, A, ldA, info); +} +#endif + +#if INCLUDE_ZTRTRI +void LAPACK(ztrtri)( + const char *uplo, const char *diag, const int *n, + double *A, const int *ldA, + int *info +) { + RELAPACK_ztrtri(uplo, diag, n, A, ldA, info); +} +#endif + + +//////////// +// XPOTRF // +//////////// + +#if INCLUDE_SPOTRF +void LAPACK(spotrf)( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + RELAPACK_spotrf(uplo, n, A, ldA, info); +} +#endif + +#if INCLUDE_DPOTRF +void LAPACK(dpotrf)( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + RELAPACK_dpotrf(uplo, n, A, ldA, info); +} +#endif + +#if INCLUDE_CPOTRF +void LAPACK(cpotrf)( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + RELAPACK_cpotrf(uplo, n, A, ldA, info); +} +#endif + +#if INCLUDE_ZPOTRF +void LAPACK(zpotrf)( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + RELAPACK_zpotrf(uplo, n, A, ldA, info); +} +#endif + + +//////////// +// XPBTRF // +//////////// + +#if INCLUDE_SPBTRF +void LAPACK(spbtrf)( + const char *uplo, const int *n, const int *kd, + float *Ab, const int *ldAb, + int *info +) { + RELAPACK_spbtrf(uplo, n, kd, Ab, ldAb, info); +} +#endif + +#if INCLUDE_DPBTRF +void LAPACK(dpbtrf)( + const char *uplo, const int *n, const int *kd, + double *Ab, const int *ldAb, + int *info +) { + RELAPACK_dpbtrf(uplo, n, kd, Ab, ldAb, info); +} +#endif + +#if INCLUDE_CPBTRF +void LAPACK(cpbtrf)( + const char *uplo, const int *n, const int *kd, + float *Ab, const int *ldAb, + int *info +) { + RELAPACK_cpbtrf(uplo, n, kd, Ab, ldAb, info); +} +#endif + +#if INCLUDE_ZPBTRF +void LAPACK(zpbtrf)( + const char *uplo, const int *n, const int *kd, + double *Ab, const int *ldAb, + int *info +) { + RELAPACK_zpbtrf(uplo, n, kd, Ab, ldAb, info); +} +#endif + + +//////////// +// XSYTRF // +//////////// + +#if INCLUDE_SSYTRF +void LAPACK(ssytrf)( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + RELAPACK_ssytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_DSYTRF +void LAPACK(dsytrf)( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) 
{ + RELAPACK_dsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_CSYTRF +void LAPACK(csytrf)( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + RELAPACK_csytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_ZSYTRF +void LAPACK(zsytrf)( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + RELAPACK_zsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_CHETRF +void LAPACK(chetrf)( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + RELAPACK_chetrf(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_ZHETRF +void LAPACK(zhetrf)( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + RELAPACK_zhetrf(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_SSYTRF_ROOK +void LAPACK(ssytrf_rook)( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + RELAPACK_ssytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_DSYTRF_ROOK +void LAPACK(dsytrf_rook)( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + RELAPACK_dsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_CSYTRF_ROOK +void LAPACK(csytrf_rook)( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + RELAPACK_csytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_ZSYTRF_ROOK +void LAPACK(zsytrf_rook)( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + RELAPACK_zsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_CHETRF_ROOK +void LAPACK(chetrf_rook)( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + RELAPACK_chetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + +#if INCLUDE_ZHETRF_ROOK +void LAPACK(zhetrf_rook)( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + RELAPACK_zhetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); +} +#endif + + +//////////// +// XGETRF // +//////////// + +#if INCLUDE_SGETRF +void LAPACK(sgetrf)( + const int *m, const int *n, + float *A, const int *ldA, int *ipiv, + int *info +) { + RELAPACK_sgetrf(m, n, A, ldA, ipiv, info); +} +#endif + +#if INCLUDE_DGETRF +void LAPACK(dgetrf)( + const int *m, const int *n, + double *A, const int *ldA, int *ipiv, + int *info +) { + RELAPACK_dgetrf(m, n, A, ldA, ipiv, info); +} +#endif + +#if INCLUDE_CGETRF +void LAPACK(cgetrf)( + const int *m, const int *n, + float *A, const int *ldA, int *ipiv, + int *info +) { + RELAPACK_cgetrf(m, n, A, ldA, ipiv, info); +} +#endif + +#if INCLUDE_ZGETRF +void LAPACK(zgetrf)( + const int *m, const int *n, + double *A, const int *ldA, int *ipiv, + int *info +) { + RELAPACK_zgetrf(m, n, A, ldA, ipiv, info); +} +#endif + + +//////////// +// XGBTRF // +//////////// + +#if INCLUDE_SGBTRF +void LAPACK(sgbtrf)( + const int *m, const int *n, const int *kl, const int *ku, + float *Ab, const int *ldAb, int *ipiv, + int *info +) { + 
RELAPACK_sgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); +} +#endif + +#if INCLUDE_DGBTRF +void LAPACK(dgbtrf)( + const int *m, const int *n, const int *kl, const int *ku, + double *Ab, const int *ldAb, int *ipiv, + int *info +) { + RELAPACK_dgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); +} +#endif + +#if INCLUDE_CGBTRF +void LAPACK(cgbtrf)( + const int *m, const int *n, const int *kl, const int *ku, + float *Ab, const int *ldAb, int *ipiv, + int *info +) { + RELAPACK_cgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); +} +#endif + +#if INCLUDE_ZGBTRF +void LAPACK(zgbtrf)( + const int *m, const int *n, const int *kl, const int *ku, + double *Ab, const int *ldAb, int *ipiv, + int *info +) { + RELAPACK_zgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); +} +#endif + + +//////////// +// XTRSYL // +//////////// + +#if INCLUDE_STRSYL +void LAPACK(strsyl)( + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, float *scale, + int *info +) { + RELAPACK_strsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); +} +#endif + +#if INCLUDE_DTRSYL +void LAPACK(dtrsyl)( + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, double *scale, + int *info +) { + RELAPACK_dtrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); +} +#endif + +#if INCLUDE_CTRSYL +void LAPACK(ctrsyl)( + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, float *scale, + int *info +) { + RELAPACK_ctrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); +} +#endif + +#if INCLUDE_ZTRSYL +void LAPACK(ztrsyl)( + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, double *scale, + int *info +) { + RELAPACK_ztrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); +} +#endif + + +//////////// +// XTGSYL // +//////////// + +#if INCLUDE_STGSYL +void LAPACK(stgsyl)( + const char *trans, const int *ijob, const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, + const float *D, const int *ldD, const float *E, const int *ldE, + float *F, const int *ldF, + float *scale, float *dif, + float *Work, const int *lWork, int *iWork, int *info +) { + RELAPACK_stgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); +} +#endif + +#if INCLUDE_DTGSYL +void LAPACK(dtgsyl)( + const char *trans, const int *ijob, const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, + const double *D, const int *ldD, const double *E, const int *ldE, + double *F, const int *ldF, + double *scale, double *dif, + double *Work, const int *lWork, int *iWork, int *info +) { + RELAPACK_dtgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); +} +#endif + +#if INCLUDE_CTGSYL +void LAPACK(ctgsyl)( + const char *trans, const int *ijob, const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, + const float *D, const int *ldD, const float *E, 
const int *ldE, + float *F, const int *ldF, + float *scale, float *dif, + float *Work, const int *lWork, int *iWork, int *info +) { + RELAPACK_ctgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); +} +#endif + +#if INCLUDE_ZTGSYL +void LAPACK(ztgsyl)( + const char *trans, const int *ijob, const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, + const double *D, const int *ldD, const double *E, const int *ldE, + double *F, const int *ldF, + double *scale, double *dif, + double *Work, const int *lWork, int *iWork, int *info +) { + RELAPACK_ztgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); +} +#endif + + +//////////// +// XGEMMT // +//////////// + +#if INCLUDE_SGEMMT +void LAPACK(sgemmt)( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const float *alpha, const float *A, const int *ldA, + const float *B, const int *ldB, + const float *beta, float *C, const int *ldC +) { + RELAPACK_sgemmt(uplo, n, A, ldA, info); +} +#endif + +#if INCLUDE_DGEMMT +void LAPACK(dgemmt)( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const double *alpha, const double *A, const int *ldA, + const double *B, const int *ldB, + const double *beta, double *C, const int *ldC +) { + RELAPACK_dgemmt(uplo, n, A, ldA, info); +} +#endif + +#if INCLUDE_CGEMMT +void LAPACK(cgemmt)( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const float *alpha, const float *A, const int *ldA, + const float *B, const int *ldB, + const float *beta, float *C, const int *ldC +) { + RELAPACK_cgemmt(uplo, n, A, ldA, info); +} +#endif + +#if INCLUDE_ZGEMMT +void LAPACK(zgemmt)( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const double *alpha, const double *A, const int *ldA, + const double *B, const int *ldB, + const double *beta, double *C, const int *ldC +) { + RELAPACK_zgemmt(uplo, n, A, ldA, info); +} +#endif diff --git a/relapack/src/lapack_wrappers.c.orig b/relapack/src/lapack_wrappers.c.orig new file mode 100644 index 000000000..d89d2fe2f --- /dev/null +++ b/relapack/src/lapack_wrappers.c.orig @@ -0,0 +1,607 @@ +#include "relapack.h" + +//////////// +// XLAUUM // +//////////// + +#if INCLUDE_SLAUUM +void LAPACK(slauum)( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + RELAPACK_slauum(uplo, n, A, ldA, info); +} +#endif + +#if INCLUDE_DLAUUM +void LAPACK(dlauum)( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + RELAPACK_dlauum(uplo, n, A, ldA, info); +} +#endif + +#if INCLUDE_CLAUUM +void LAPACK(clauum)( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + RELAPACK_clauum(uplo, n, A, ldA, info); +} +#endif + +#if INCLUDE_ZLAUUM +void LAPACK(zlauum)( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + RELAPACK_zlauum(uplo, n, A, ldA, info); +} +#endif + + +//////////// +// XSYGST // +//////////// + +#if INCLUDE_SSYGST +void LAPACK(ssygst)( + const int *itype, const char *uplo, const int *n, + float *A, const int *ldA, const float *B, const int *ldB, + int *info +) { + RELAPACK_ssygst(itype, uplo, n, A, ldA, B, ldB, info); +} +#endif + +#if INCLUDE_DSYGST +void LAPACK(dsygst)( + const int *itype, const char *uplo, const int *n, + double *A, const int *ldA, 
diff --git a/relapack/src/relapack.h b/relapack/src/relapack.h
new file mode 100644
index 000000000..2cb061c32
--- /dev/null
+++ b/relapack/src/relapack.h
@@ -0,0 +1,60 @@
+#ifndef RELAPACK_INT_H
+#define RELAPACK_INT_H
+
+#include "../config.h"
+
+#include "../inc/relapack.h"
+
+// add an underscore to BLAS routines (or not)
+#if BLAS_UNDERSCORE
+#define BLAS(routine) routine ## _
+#else
+#define BLAS(routine) routine
+#endif
+
+// add an underscore to LAPACK routines (or not)
+#if LAPACK_UNDERSCORE
+#define LAPACK(routine) routine ## _
+#else
+#define LAPACK(routine) routine
+#endif
+
+// minimum and maximum macros
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+// REC_SPLIT(n) returns how a problem of size n is split recursively.
+// If n >= 16, we ensure that the size of at least one of the halves is
+// divisible by 8 elements (for doubles, one 64-byte cache line on most
+// CPUs), while both halves are still as close as possible in size.
+// If n < 16 the problem is simply split in the middle. (Note that the
+// crossover size is usually larger than 16.)
+#define SREC_SPLIT(n) ((n >= 32) ? ((n + 16) / 32) * 16 : n / 2)
+#define DREC_SPLIT(n) ((n >= 16) ? ((n + 8) / 16) * 8 : n / 2)
+#define CREC_SPLIT(n) ((n >= 16) ? ((n + 8) / 16) * 8 : n / 2)
+#define ZREC_SPLIT(n) ((n >= 8) ? ((n + 4) / 8) * 4 : n / 2)
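+// Worked example of the rule above (illustration): for doubles,
+// DREC_SPLIT(100) = ((100 + 8) / 16) * 8 = 6 * 8 = 48, so n = 100 is split
+// into 48 + 52, and the 48-column panel is a multiple of 8 doubles
+// (64 bytes, one cache line); below the threshold, DREC_SPLIT(10) = 10 / 2 = 5.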
+
+#include "lapack.h"
+#include "blas.h"
+
+// sytrf helper routines
+void RELAPACK_ssytrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *);
+void RELAPACK_dsytrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *);
+void RELAPACK_csytrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *);
+void RELAPACK_chetrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *);
+void RELAPACK_zsytrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *);
+void RELAPACK_zhetrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *);
+void RELAPACK_ssytrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *);
+void RELAPACK_dsytrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *);
+void RELAPACK_csytrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *);
+void RELAPACK_chetrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *);
+void RELAPACK_zsytrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *);
+void RELAPACK_zhetrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *);
+
+// trsyl helper routines
+void RELAPACK_strsyl_rec2(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *);
+void RELAPACK_dtrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *);
+void RELAPACK_ctrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *);
+void RELAPACK_ztrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *);
+
+#endif /* RELAPACK_INT_H */
diff --git a/relapack/src/sgbtrf.c b/relapack/src/sgbtrf.c
new file mode 100644
index 000000000..bc20e744b
--- /dev/null
+++ b/relapack/src/sgbtrf.c
@@ -0,0 +1,227 @@
+#include "relapack.h"
+#include "stdlib.h"
+
+static void RELAPACK_sgbtrf_rec(const int *, const int *, const int *,
+    const int *, float *, const int *, int *, float *, const int *, float *,
+    const int *, int *);
+
+
+/** SGBTRF computes an LU factorization of a real m-by-n band matrix A using partial pivoting with row interchanges.
+ *
+ * This routine is functionally equivalent to LAPACK's sgbtrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d5/d72/sgbtrf_8f.html
+ * */
+void RELAPACK_sgbtrf(
+    const int *m, const int *n, const int *kl, const int *ku,
+    float *Ab, const int *ldAb, int *ipiv,
+    int *info
+) {
+
+    // Check arguments
+    *info = 0;
+    if (*m < 0)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*kl < 0)
+        *info = -3;
+    else if (*ku < 0)
+        *info = -4;
+    else if (*ldAb < 2 * *kl + *ku + 1)
+        *info = -6;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("SGBTRF", &minfo);
+        return;
+    }
+
+    // Constant
+    const float ZERO[] = { 0. };
+
+    // Resulting upper band width
+    const int kv = *ku + *kl;
+
+    // Unskew A
+    const int ldA[] = { *ldAb - 1 };
+    float *const A = Ab + kv;
+
+    // Zero upper diagonal fill-in elements
+    int i, j;
+    for (j = 0; j < *n; j++) {
+        float *const A_j = A + *ldA * j;
+        for (i = MAX(0, j - kv); i < j - *ku; i++)
+            A_j[i] = 0.;
+    }
+
+    // Allocate work space
+    const int n1 = SREC_SPLIT(*n);
+    const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
+    const int nWorkl = (kv > n1) ? n1 : kv;
+    const int mWorku = (*kl > n1) ? n1 : *kl;
+    const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
+    float *Workl = malloc(mWorkl * nWorkl * sizeof(float));
+    float *Worku = malloc(mWorku * nWorku * sizeof(float));
+    LAPACK(slaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
+    LAPACK(slaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku);
+
+    // Recursive kernel
+    RELAPACK_sgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info);
+
+    // Free work space
+    free(Workl);
+    free(Worku);
+}
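+// A note on the unskewing used above (illustrative, not load-bearing code):
+// LAPACK band storage keeps entry (i, j) of the band at
+// Ab[kv + i - j + ldAb * j]. With A = Ab + kv and ldA = ldAb - 1,
+// A[i + ldA * j] = Ab[kv + i + (ldAb - 1) * j] = Ab[(kv + i - j) + ldAb * j],
+// i.e. the same element -- so the band can be handed to dense BLAS kernels
+// through A, as long as they only touch entries inside the band.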
+
+
+/** sgbtrf's recursive compute kernel */
+static void RELAPACK_sgbtrf_rec(
+    const int *m, const int *n, const int *kl, const int *ku,
+    float *Ab, const int *ldAb, int *ipiv,
+    float *Workl, const int *ldWorkl, float *Worku, const int *ldWorku,
+    int *info
+) {
+
+    if (*n <= MAX(CROSSOVER_SGBTRF, 1)) {
+        // Unblocked
+        LAPACK(sgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info);
+        return;
+    }
+
+    // Constants
+    const float ONE[] = { 1. };
+    const float MONE[] = { -1. };
+    const int iONE[] = { 1 };
+
+    // Loop iterators
+    int i, j;
+
+    // Resulting upper band width
+    const int kv = *ku + *kl;
+
+    // Unskew A
+    const int ldA[] = { *ldAb - 1 };
+    float *const A = Ab + kv;
+
+    // Splitting
+    const int n1 = MIN(SREC_SPLIT(*n), *kl);
+    const int n2 = *n - n1;
+    const int m1 = MIN(n1, *m);
+    const int m2 = *m - m1;
+    const int mn1 = MIN(m1, n1);
+    const int mn2 = MIN(m2, n2);
+
+    // Ab_L *
+    //      Ab_BR
+    float *const Ab_L = Ab;
+    float *const Ab_BR = Ab + *ldAb * n1;
+
+    // A_L A_R
+    float *const A_L = A;
+    float *const A_R = A + *ldA * n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    float *const A_TL = A;
+    float *const A_TR = A + *ldA * n1;
+    float *const A_BL = A + m1;
+    float *const A_BR = A + *ldA * n1 + m1;
+
+    // ipiv_T
+    // ipiv_B
+    int *const ipiv_T = ipiv;
+    int *const ipiv_B = ipiv + n1;
+
+    // Banded splitting
+    const int n21 = MIN(n2, kv - n1);
+    const int n22 = MIN(n2 - n21, n1);
+    const int m21 = MIN(m2, *kl - m1);
+    const int m22 = MIN(m2 - m21, m1);
+
+    //     n1  n21  n22
+    // m   *   A_Rl A_Rr
+    float *const A_Rl = A_R;
+    float *const A_Rr = A_R + *ldA * n21;
+
+    //     n1    n21    n22
+    // m1  *     A_TRl  A_TRr
+    // m21 A_BLt A_BRtl A_BRtr
+    // m22 A_BLb A_BRbl A_BRbr
+    float *const A_TRl = A_TR;
+    float *const A_TRr = A_TR + *ldA * n21;
+    float *const A_BLt = A_BL;
+    float *const A_BLb = A_BL + m21;
+    float *const A_BRtl = A_BR;
+    float *const A_BRtr = A_BR + *ldA * n21;
+    float *const A_BRbl = A_BR + m21;
+    float *const A_BRbr = A_BR + *ldA * n21 + m21;
+
+    // recursion(Ab_L, ipiv_T)
+    RELAPACK_sgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info);
+
+    // Workl = A_BLb
+    LAPACK(slacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl);
+
+    // partially redo swaps in A_L
+    for (i = 0; i < mn1; i++) {
+        const int ip = ipiv_T[i] - 1;
+        if (ip != i) {
+            if (ip < *kl)
+                BLAS(sswap)(&i, A_L + i, ldA, A_L + ip, ldA);
+            else
+                BLAS(sswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl);
+        }
+    }
+
+    // apply pivots to A_Rl
+    LAPACK(slaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE);
+
+    // apply pivots to A_Rr columnwise
+    for (j = 0; j < n22; j++) {
+        float *const A_Rrj = A_Rr + *ldA * j;
+        for (i = j; i < mn1; i++) {
+            const int ip = ipiv_T[i] - 1;
+            if (ip != i) {
+                const float tmp = A_Rrj[i];
+                A_Rrj[i] = A_Rrj[ip];
+                A_Rrj[ip] = tmp;
+            }
+        }
+    }
+
+    // A_TRl = A_TL \ A_TRl
+    BLAS(strsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
+    // Worku = A_TRr
+    LAPACK(slacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku);
+    // Worku = A_TL \ Worku
+    BLAS(strsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku);
+    // A_TRr = Worku
+    LAPACK(slacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA);
+    // A_BRtl = A_BRtl - A_BLt * A_TRl
+    BLAS(sgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA);
+    // A_BRbl = A_BRbl - Workl * A_TRl
+    BLAS(sgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA);
+    // A_BRtr = A_BRtr - A_BLt * Worku
+    BLAS(sgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA);
+    // A_BRbr = A_BRbr - Workl * Worku
+    BLAS(sgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA);
+
+    // partially undo swaps in A_L
+    for (i = mn1 - 1; i >= 0; i--) {
+        const int ip = ipiv_T[i] - 1;
+        if (ip != i) {
+            if (ip < *kl)
+                BLAS(sswap)(&i, A_L + i, ldA, A_L + ip, ldA);
+            else
+                BLAS(sswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl);
+        }
+    }
+
+    // recursion(Ab_BR, ipiv_B)
+    RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb,
ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + if (*info) + *info += n1; + // shift pivots + for (i = 0; i < mn2; i++) + ipiv_B[i] += n1; +} diff --git a/relapack/src/sgemmt.c b/relapack/src/sgemmt.c new file mode 100644 index 000000000..75f78fabd --- /dev/null +++ b/relapack/src/sgemmt.c @@ -0,0 +1,165 @@ +#include "relapack.h" + +static void RELAPACK_sgemmt_rec(const char *, const char *, const char *, + const int *, const int *, const float *, const float *, const int *, + const float *, const int *, const float *, float *, const int *); + +static void RELAPACK_sgemmt_rec2(const char *, const char *, const char *, + const int *, const int *, const float *, const float *, const int *, + const float *, const int *, const float *, float *, const int *); + + +/** SGEMMT computes a matrix-matrix product with general matrices but updates + * only the upper or lower triangular part of the result matrix. + * + * This routine performs the same operation as the BLAS routine + * sgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC) + * but only updates the triangular part of C specified by uplo: + * If (*uplo == 'L'), only the lower triangular part of C is updated, + * otherwise the upper triangular part is updated. + * */ +void RELAPACK_sgemmt( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const float *alpha, const float *A, const int *ldA, + const float *B, const int *ldB, + const float *beta, float *C, const int *ldC +) { + +#if HAVE_XGEMMT + BLAS(sgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); + return; +#else + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + const int notransA = LAPACK(lsame)(transA, "N"); + const int tranA = LAPACK(lsame)(transA, "T"); + const int notransB = LAPACK(lsame)(transB, "N"); + const int tranB = LAPACK(lsame)(transB, "T"); + int info = 0; + if (!lower && !upper) + info = 1; + else if (!tranA && !notransA) + info = 2; + else if (!tranB && !notransB) + info = 3; + else if (*n < 0) + info = 4; + else if (*k < 0) + info = 5; + else if (*ldA < MAX(1, notransA ? *n : *k)) + info = 8; + else if (*ldB < MAX(1, notransB ? *k : *n)) + info = 10; + else if (*ldC < MAX(1, *n)) + info = 13; + if (info) { + LAPACK(xerbla)("SGEMMT", &info); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + const char cleantransA = notransA ? 'N' : 'T'; + const char cleantransB = notransB ? 'N' : 'T'; + + // Recursive kernel + RELAPACK_sgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); +#endif +} + + +/** sgemmt's recursive compute kernel */ +static void RELAPACK_sgemmt_rec( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const float *alpha, const float *A, const int *ldA, + const float *B, const int *ldB, + const float *beta, float *C, const int *ldC +) { + + if (*n <= MAX(CROSSOVER_SGEMMT, 1)) { + // Unblocked + RELAPACK_sgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); + return; + } + + // Splitting + const int n1 = SREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_T + // A_B + const float *const A_T = A; + const float *const A_B = A + ((*transA == 'N') ? n1 : *ldA * n1); + + // B_L B_R + const float *const B_L = B; + const float *const B_R = B + ((*transB == 'N') ? 
*ldB * n1 : n1);
+
+    // C_TL C_TR
+    // C_BL C_BR
+    float *const C_TL = C;
+    float *const C_TR = C + *ldC * n1;
+    float *const C_BL = C + n1;
+    float *const C_BR = C + *ldC * n1 + n1;
+
+    // recursion(C_TL)
+    RELAPACK_sgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC);
+
+    if (*uplo == 'L')
+        // C_BL = alpha A_B B_L + beta C_BL
+        BLAS(sgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC);
+    else
+        // C_TR = alpha A_T B_R + beta C_TR
+        BLAS(sgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC);
+
+    // recursion(C_BR)
+    RELAPACK_sgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC);
+}
+
+
+/** sgemmt's unblocked compute kernel */
+static void RELAPACK_sgemmt_rec2(
+    const char *uplo, const char *transA, const char *transB,
+    const int *n, const int *k,
+    const float *alpha, const float *A, const int *ldA,
+    const float *B, const int *ldB,
+    const float *beta, float *C, const int *ldC
+) {
+
+    const int incB = (*transB == 'N') ? 1 : *ldB;
+    const int incC = 1;
+
+    int i;
+    for (i = 0; i < *n; i++) {
+        // A_0
+        // A_i
+        const float *const A_0 = A;
+        const float *const A_i = A + ((*transA == 'N') ? i : *ldA * i);
+
+        // * B_i *
+        const float *const B_i = B + ((*transB == 'N') ? *ldB * i : i);
+
+        // * C_0i *
+        // * C_ii *
+        float *const C_0i = C + *ldC * i;
+        float *const C_ii = C + *ldC * i + i;
+
+        if (*uplo == 'L') {
+            const int nmi = *n - i;
+            if (*transA == 'N')
+                BLAS(sgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
+            else
+                BLAS(sgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
+        } else {
+            const int ip1 = i + 1;
+            if (*transA == 'N')
+                BLAS(sgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
+            else
+                BLAS(sgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
+        }
+    }
+}
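For reference, the semantics of sgemmt in the uplo = 'L', transA = transB = 'N' case can be written as a naive triple loop (a sketch assuming column-major storage, shown only to pin down which entries are touched):

void sgemmt_naive_lower(int n, int k, float alpha, const float *A, int ldA,
                        const float *B, int ldB, float beta, float *C, int ldC) {
    // Only C(i,j) with i >= j is read and written.
    for (int j = 0; j < n; j++)
        for (int i = j; i < n; i++) {
            float s = 0.f;
            for (int p = 0; p < k; p++)
                s += A[i + ldA * p] * B[p + ldB * j];
            C[i + ldC * j] = alpha * s + beta * C[i + ldC * j];
        }
}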
diff --git a/relapack/src/sgetrf.c b/relapack/src/sgetrf.c
new file mode 100644
index 000000000..284f8cff6
--- /dev/null
+++ b/relapack/src/sgetrf.c
@@ -0,0 +1,117 @@
+#include "relapack.h"
+
+static void RELAPACK_sgetrf_rec(const int *, const int *, float *, const int *,
+    int *, int *);
+
+
+/** SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges.
+ *
+ * This routine is functionally equivalent to LAPACK's sgetrf.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/de/de2/sgetrf_8f.html
+ * */
+void RELAPACK_sgetrf(
+    const int *m, const int *n,
+    float *A, const int *ldA, int *ipiv,
+    int *info
+) {
+
+    // Check arguments
+    *info = 0;
+    if (*m < 0)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*ldA < MAX(1, *m))
+        *info = -4;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("SGETRF", &minfo);
+        return;
+    }
+
+    const int sn = MIN(*m, *n);
+
+    RELAPACK_sgetrf_rec(m, &sn, A, ldA, ipiv, info);
+
+    // Right remainder
+    if (*m < *n) {
+        // Constants
+        const float ONE[] = { 1. };
+        const int iONE[] = { 1 };
+
+        // Splitting
+        const int rn = *n - *m;
+
+        // A_L A_R
+        const float *const A_L = A;
+        float *const A_R = A + *ldA * *m;
+
+        // A_R = apply(ipiv, A_R)
+        LAPACK(slaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE);
+        // A_R = A_L \ A_R
+        BLAS(strsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA);
+    }
+}
+
+
+/** sgetrf's recursive compute kernel */
+static void RELAPACK_sgetrf_rec(
+    const int *m, const int *n,
+    float *A, const int *ldA, int *ipiv,
+    int *info
+) {
+
+    if (*n <= MAX(CROSSOVER_SGETRF, 1)) {
+        // Unblocked
+        LAPACK(sgetf2)(m, n, A, ldA, ipiv, info);
+        return;
+    }
+
+    // Constants
+    const float ONE[] = { 1. };
+    const float MONE[] = { -1. };
+    const int iONE[] = { 1 };
+
+    // Splitting
+    const int n1 = SREC_SPLIT(*n);
+    const int n2 = *n - n1;
+    const int m2 = *m - n1;
+
+    // A_L A_R
+    float *const A_L = A;
+    float *const A_R = A + *ldA * n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    float *const A_TL = A;
+    float *const A_TR = A + *ldA * n1;
+    float *const A_BL = A + n1;
+    float *const A_BR = A + *ldA * n1 + n1;
+
+    // ipiv_T
+    // ipiv_B
+    int *const ipiv_T = ipiv;
+    int *const ipiv_B = ipiv + n1;
+
+    // recursion(A_L, ipiv_T)
+    RELAPACK_sgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info);
+    // apply pivots to A_R
+    LAPACK(slaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE);
+
+    // A_TR = A_TL \ A_TR
+    BLAS(strsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
+    // A_BR = A_BR - A_BL * A_TR
+    BLAS(sgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA);
+
+    // recursion(A_BR, ipiv_B)
+    RELAPACK_sgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info);
+    if (*info)
+        *info += n1;
+    // apply pivots to A_BL
+    LAPACK(slaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE);
+    // shift pivots
+    int i;
+    for (i = 0; i < n2; i++)
+        ipiv_B[i] += n1;
+}
diff --git a/relapack/src/slauum.c b/relapack/src/slauum.c
new file mode 100644
index 000000000..280f141b3
--- /dev/null
+++ b/relapack/src/slauum.c
@@ -0,0 +1,87 @@
+#include "relapack.h"
+
+static void RELAPACK_slauum_rec(const char *, const int *, float *,
+    const int *, int *);
+
+
+/** SLAUUM computes the product U * U**T or L**T * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A.
+ *
+ * This routine is functionally equivalent to LAPACK's slauum.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/dd/d5a/slauum_8f.html
+ * */
+void RELAPACK_slauum(
+    const char *uplo, const int *n,
+    float *A, const int *ldA,
+    int *info
+) {
+
+    // Check arguments
+    const int lower = LAPACK(lsame)(uplo, "L");
+    const int upper = LAPACK(lsame)(uplo, "U");
+    *info = 0;
+    if (!lower && !upper)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*ldA < MAX(1, *n))
+        *info = -4;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("SLAUUM", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleanuplo = lower ? 'L' : 'U';
+
+    // Recursive kernel
+    RELAPACK_slauum_rec(&cleanuplo, n, A, ldA, info);
+}
+
+
+/** slauum's recursive compute kernel */
+static void RELAPACK_slauum_rec(
+    const char *uplo, const int *n,
+    float *A, const int *ldA,
+    int *info
+) {
+
+    if (*n <= MAX(CROSSOVER_SLAUUM, 1)) {
+        // Unblocked
+        LAPACK(slauu2)(uplo, n, A, ldA, info);
+        return;
+    }
+
+    // Constants
+    const float ONE[] = { 1.
}; + + // Splitting + const int n1 = SREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + *ldA * n1; + float *const A_BL = A + n1; + float *const A_BR = A + *ldA * n1 + n1; + + // recursion(A_TL) + RELAPACK_slauum_rec(uplo, &n1, A_TL, ldA, info); + + if (*uplo == 'L') { + // A_TL = A_TL + A_BL' * A_BL + BLAS(ssyrk)("L", "T", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA); + // A_BL = A_BR' * A_BL + BLAS(strmm)("L", "L", "T", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); + } else { + // A_TL = A_TL + A_TR * A_TR' + BLAS(ssyrk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA); + // A_TR = A_TR * A_BR' + BLAS(strmm)("R", "U", "T", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); + } + + // recursion(A_BR) + RELAPACK_slauum_rec(uplo, &n2, A_BR, ldA, info); +} diff --git a/relapack/src/spbtrf.c b/relapack/src/spbtrf.c new file mode 100644 index 000000000..ee0a5546e --- /dev/null +++ b/relapack/src/spbtrf.c @@ -0,0 +1,157 @@ +#include "relapack.h" +#include "stdlib.h" + +static void RELAPACK_spbtrf_rec(const char *, const int *, const int *, + float *, const int *, float *, const int *, int *); + + +/** SPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A. + * + * This routine is functionally equivalent to LAPACK's spbtrf. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d1/d22/spbtrf_8f.html + * */ +void RELAPACK_spbtrf( + const char *uplo, const int *n, const int *kd, + float *Ab, const int *ldAb, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*kd < 0) + *info = -3; + else if (*ldAb < *kd + 1) + *info = -5; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("SPBTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Constant + const float ZERO[] = { 0. }; + + // Allocate work space + const int n1 = SREC_SPLIT(*n); + const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; + const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; + float *Work = malloc(mWork * nWork * sizeof(float)); + LAPACK(slaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); + + // Recursive kernel + RELAPACK_spbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info); + + // Free work space + free(Work); +} + + +/** spbtrf's recursive compute kernel */ +static void RELAPACK_spbtrf_rec( + const char *uplo, const int *n, const int *kd, + float *Ab, const int *ldAb, + float *Work, const int *ldWork, + int *info +){ + + if (*n <= MAX(CROSSOVER_SPBTRF, 1)) { + // Unblocked + LAPACK(spbtf2)(uplo, n, kd, Ab, ldAb, info); + return; + } + + // Constants + const float ONE[] = { 1. }; + const float MONE[] = { -1. }; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + float *const A = Ab + ((*uplo == 'L') ? 
0 : *kd); + + // Splitting + const int n1 = MIN(SREC_SPLIT(*n), *kd); + const int n2 = *n - n1; + + // * * + // * Ab_BR + float *const Ab_BR = Ab + *ldAb * n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + *ldA * n1; + float *const A_BL = A + n1; + float *const A_BR = A + *ldA * n1 + n1; + + // recursion(A_TL) + RELAPACK_spotrf(uplo, &n1, A_TL, ldA, info); + if (*info) + return; + + // Banded splitting + const int n21 = MIN(n2, *kd - n1); + const int n22 = MIN(n2 - n21, *kd); + + // n1 n21 n22 + // n1 * A_TRl A_TRr + // n21 A_BLt A_BRtl A_BRtr + // n22 A_BLb A_BRbl A_BRbr + float *const A_TRl = A_TR; + float *const A_TRr = A_TR + *ldA * n21; + float *const A_BLt = A_BL; + float *const A_BLb = A_BL + n21; + float *const A_BRtl = A_BR; + float *const A_BRtr = A_BR + *ldA * n21; + float *const A_BRbl = A_BR + n21; + float *const A_BRbr = A_BR + *ldA * n21 + n21; + + if (*uplo == 'L') { + // A_BLt = ABLt / A_TL' + BLAS(strsm)("R", "L", "T", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA); + // A_BRtl = A_BRtl - A_BLt * A_BLt' + BLAS(ssyrk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA); + // Work = A_BLb + LAPACK(slacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork); + // Work = Work / A_TL' + BLAS(strsm)("R", "L", "T", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork); + // A_BRbl = A_BRbl - Work * A_BLt' + BLAS(sgemm)("N", "T", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA); + // A_BRbr = A_BRbr - Work * Work' + BLAS(ssyrk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); + // A_BLb = Work + LAPACK(slacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA); + } else { + // A_TRl = A_TL' \ A_TRl + BLAS(strsm)("L", "U", "T", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA); + // A_BRtl = A_BRtl - A_TRl' * A_TRl + BLAS(ssyrk)("U", "T", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA); + // Work = A_TRr + LAPACK(slacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork); + // Work = A_TL' \ Work + BLAS(strsm)("L", "U", "T", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork); + // A_BRtr = A_BRtr - A_TRl' * Work + BLAS(sgemm)("T", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA); + // A_BRbr = A_BRbr - Work' * Work + BLAS(ssyrk)("U", "T", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA); + // A_TRr = Work + LAPACK(slacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA); + } + + // recursion(A_BR) + if (*kd > n1) + RELAPACK_spotrf(uplo, &n2, A_BR, ldA, info); + else + RELAPACK_spbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info); + if (*info) + *info += n1; +} diff --git a/relapack/src/spotrf.c b/relapack/src/spotrf.c new file mode 100644 index 000000000..2a609321b --- /dev/null +++ b/relapack/src/spotrf.c @@ -0,0 +1,92 @@ +#include "relapack.h" + +static void RELAPACK_spotrf_rec(const char *, const int *, float *, + const int *, int *); + + +/** SPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A. + * + * This routine is functionally equivalent to LAPACK's spotrf. 
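A note on the "Unskew A" step in the spbtrf kernel above: LAPACK band storage becomes legal full storage once the leading dimension is reduced by one, since for lower storage A(i,j) lives at Ab[(i-j) + j*ldAb] and (i-j) + j*ldAb == i + j*(ldAb-1); the upper case works the same way after the *kd offset. A minimal self-contained check of this index identity (illustrative sketch, not part of the commit):

    #include <assert.h>

    /* Verify the "unskew" identity used in spbtrf: lower band storage
     * with leading dimension ldAb = kd + 1 holds A(i,j) at offset
     * (i - j) + j * ldAb, which equals the full-storage offset
     * i + j * (ldAb - 1). */
    int main(void) {
        const int kd = 3, ldAb = kd + 1, n = 8;
        int i, j;
        for (j = 0; j < n; j++)
            for (i = j; i < n && i <= j + kd; i++)
                assert((i - j) + j * ldAb == i + j * (ldAb - 1));
        return 0;
    }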
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d0/da2/spotrf_8f.html + * */ +void RELAPACK_spotrf( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("SPOTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Recursive kernel + RELAPACK_spotrf_rec(&cleanuplo, n, A, ldA, info); +} + + +/** spotrf's recursive compute kernel */ +static void RELAPACK_spotrf_rec( + const char *uplo, const int *n, + float *A, const int *ldA, + int *info +) { + + if (*n <= MAX(CROSSOVER_SPOTRF, 1)) { + // Unblocked + LAPACK(spotf2)(uplo, n, A, ldA, info); + return; + } + + // Constants + const float ONE[] = { 1. }; + const float MONE[] = { -1. }; + + // Splitting + const int n1 = SREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + *ldA * n1; + float *const A_BL = A + n1; + float *const A_BR = A + *ldA * n1 + n1; + + // recursion(A_TL) + RELAPACK_spotrf_rec(uplo, &n1, A_TL, ldA, info); + if (*info) + return; + + if (*uplo == 'L') { + // A_BL = A_BL / A_TL' + BLAS(strsm)("R", "L", "T", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA); + // A_BR = A_BR - A_BL * A_BL' + BLAS(ssyrk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA); + } else { + // A_TR = A_TL' \ A_TR + BLAS(strsm)("L", "U", "T", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); + // A_BR = A_BR - A_TR' * A_TR + BLAS(ssyrk)("U", "T", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA); + } + + // recursion(A_BR) + RELAPACK_spotrf_rec(uplo, &n2, A_BR, ldA, info); + if (*info) + *info += n1; +} diff --git a/relapack/src/ssygst.c b/relapack/src/ssygst.c new file mode 100644 index 000000000..7f145cdec --- /dev/null +++ b/relapack/src/ssygst.c @@ -0,0 +1,212 @@ +#include "relapack.h" +#if XSYGST_ALLOW_MALLOC +#include "stdlib.h" +#endif + +static void RELAPACK_ssygst_rec(const int *, const char *, const int *, + float *, const int *, const float *, const int *, + float *, const int *, int *); + + +/** SSYGST reduces a real symmetric-definite generalized eigenproblem to standard form. + * + * This routine is functionally equivalent to LAPACK's ssygst. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d8/d78/ssygst_8f.html + * */ +void RELAPACK_ssygst( + const int *itype, const char *uplo, const int *n, + float *A, const int *ldA, const float *B, const int *ldB, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (*itype < 1 || *itype > 3) + *info = -1; + else if (!lower && !upper) + *info = -2; + else if (*n < 0) + *info = -3; + else if (*ldA < MAX(1, *n)) + *info = -5; + else if (*ldB < MAX(1, *n)) + *info = -7; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("SSYGST", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 
'L' : 'U'; + + // Allocate work space + float *Work = NULL; + int lWork = 0; +#if XSYGST_ALLOW_MALLOC + const int n1 = SREC_SPLIT(*n); + lWork = n1 * (*n - n1); + Work = malloc(lWork * sizeof(float)); + if (!Work) + lWork = 0; +#endif + + // Recursive kernel + RELAPACK_ssygst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info); + + // Free work space +#if XSYGST_ALLOW_MALLOC + if (Work) + free(Work); +#endif +} + + +/** ssygst's recursive compute kernel */ +static void RELAPACK_ssygst_rec( + const int *itype, const char *uplo, const int *n, + float *A, const int *ldA, const float *B, const int *ldB, + float *Work, const int *lWork, int *info +) { + + if (*n <= MAX(CROSSOVER_SSYGST, 1)) { + // Unblocked + LAPACK(ssygs2)(itype, uplo, n, A, ldA, B, ldB, info); + return; + } + + // Constants + const float ZERO[] = { 0. }; + const float ONE[] = { 1. }; + const float MONE[] = { -1. }; + const float HALF[] = { .5 }; + const float MHALF[] = { -.5 }; + const int iONE[] = { 1 }; + + // Loop iterator + int i; + + // Splitting + const int n1 = SREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + float *const A_TL = A; + float *const A_TR = A + *ldA * n1; + float *const A_BL = A + n1; + float *const A_BR = A + *ldA * n1 + n1; + + // B_TL B_TR + // B_BL B_BR + const float *const B_TL = B; + const float *const B_TR = B + *ldB * n1; + const float *const B_BL = B + n1; + const float *const B_BR = B + *ldB * n1 + n1; + + // recursion(A_TL, B_TL) + RELAPACK_ssygst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info); + + if (*itype == 1) + if (*uplo == 'L') { + // A_BL = A_BL / B_TL' + BLAS(strsm)("R", "L", "T", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA); + if (*lWork > n2 * n1) { + // T = -1/2 * B_BL * A_TL + BLAS(ssymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2); + // A_BL = A_BL + T + for (i = 0; i < n1; i++) + BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE); + } else + // A_BL = A_BL - 1/2 B_BL * A_TL + BLAS(ssymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA); + // A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL' + BLAS(ssyr2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA); + if (*lWork > n2 * n1) + // A_BL = A_BL + T + for (i = 0; i < n1; i++) + BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE); + else + // A_BL = A_BL - 1/2 B_BL * A_TL + BLAS(ssymm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA); + // A_BL = B_BR \ A_BL + BLAS(strsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA); + } else { + // A_TR = B_TL' \ A_TR + BLAS(strsm)("L", "U", "T", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA); + if (*lWork > n2 * n1) { + // T = -1/2 * A_TL * B_TR + BLAS(ssymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1); + // A_TR = A_BL + T + for (i = 0; i < n2; i++) + BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE); + } else + // A_TR = A_TR - 1/2 A_TL * B_TR + BLAS(ssymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA); + // A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR + BLAS(ssyr2k)("U", "T", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA); + if (*lWork > n2 * n1) + // A_TR = A_BL + T + for (i = 0; i < n2; i++) + BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE); + else + // A_TR = A_TR - 1/2 A_TL * B_TR + BLAS(ssymm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA); + // A_TR = A_TR / B_BR + BLAS(strsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA); + } + else + if 
(*uplo == 'L') { + // A_BL = A_BL * B_TL + BLAS(strmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA); + if (*lWork > n2 * n1) { + // T = 1/2 * A_BR * B_BL + BLAS(ssymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2); + // A_BL = A_BL + T + for (i = 0; i < n1; i++) + BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE); + } else + // A_BL = A_BL + 1/2 A_BR * B_BL + BLAS(ssymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA); + // A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL + BLAS(ssyr2k)("L", "T", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA); + if (*lWork > n2 * n1) + // A_BL = A_BL + T + for (i = 0; i < n1; i++) + BLAS(saxpy)(&n2, ONE, Work + n2 * i, iONE, A_BL + *ldA * i, iONE); + else + // A_BL = A_BL + 1/2 A_BR * B_BL + BLAS(ssymm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA); + // A_BL = B_BR * A_BL + BLAS(strmm)("L", "L", "T", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA); + } else { + // A_TR = B_TL * A_TR + BLAS(strmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA); + if (*lWork > n2 * n1) { + // T = 1/2 * B_TR * A_BR + BLAS(ssymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1); + // A_TR = A_TR + T + for (i = 0; i < n2; i++) + BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE); + } else + // A_TR = A_TR + 1/2 B_TR A_BR + BLAS(ssymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA); + // A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR' + BLAS(ssyr2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA); + if (*lWork > n2 * n1) + // A_TR = A_TR + T + for (i = 0; i < n2; i++) + BLAS(saxpy)(&n1, ONE, Work + n1 * i, iONE, A_TR + *ldA * i, iONE); + else + // A_TR = A_TR + 1/2 B_TR * A_BR + BLAS(ssymm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA); + // A_TR = A_TR * B_BR + BLAS(strmm)("R", "U", "T", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA); + } + + // recursion(A_BR, B_BR) + RELAPACK_ssygst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info); +} diff --git a/relapack/src/ssytrf.c b/relapack/src/ssytrf.c new file mode 100644 index 000000000..8a4fad9f2 --- /dev/null +++ b/relapack/src/ssytrf.c @@ -0,0 +1,238 @@ +#include "relapack.h" +#if XSYTRF_ALLOW_MALLOC +#include <stdlib.h> +#endif + +static void RELAPACK_ssytrf_rec(const char *, const int *, const int *, int *, + float *, const int *, int *, float *, const int *, int *); + + +/** SSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method. + * + * This routine is functionally equivalent to LAPACK's ssytrf. 
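The SSYMM calls with coefficient -1/2 that bracket each SSYR2K in the ssygst kernel above implement the classical two-sided update without an extra symmetric temporary (the optional Work buffer merely caches the half-update). For itype = 1 and uplo = 'L', with B = L L^T partitioned conformally, the block steps amount to (a sketch of the underlying algebra):

    \begin{aligned}
    A_{21} &\leftarrow A_{21} L_{11}^{-T}\\
    A_{21} &\leftarrow A_{21} - \tfrac{1}{2} B_{21} A_{11}\\
    A_{22} &\leftarrow A_{22} - A_{21} B_{21}^{T} - B_{21} A_{21}^{T}\\
    A_{21} &\leftarrow A_{21} - \tfrac{1}{2} B_{21} A_{11}\\
    A_{21} &\leftarrow L_{22}^{-1} A_{21}
    \end{aligned}

The two half corrections sum to the full -B_{21} A_{11} term while the SSYR2K sees the symmetrically half-updated A_{21}; the itype >= 2 branch mirrors these steps with +1/2 corrections and TRMM in place of TRSM.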
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/da/de9/ssytrf_8f.html + * */ +void RELAPACK_ssytrf( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + float *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * sizeof(float)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("SSYTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy arguments + int nout; + + // Recursive kernel + RELAPACK_ssytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** ssytrf's recursive compute kernel */ +static void RELAPACK_ssytrf_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + float *A, const int *ldA, int *ipiv, + float *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_SSYTRF, 3)) { + // Unblocked + if (top) { + LAPACK(ssytf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_ssytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const float ONE[] = { 1. }; + const float MONE[] = { -1. }; + const int iONE[] = { 1 }; + + // Loop iterator + int i; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = SREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + float *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_ssytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + float *const A_BL = A + n1; + float *const A_BR = A + *ldA * n1 + n1; + float *const A_BL_B = A + *n; + float *const A_BR_B = A + *ldA * n1 + *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + float *const Work_BL = Work + n1; + float *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; + const int ldWork_BR = top ? 
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_sgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(sgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_ssytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + float *const A_BR_r = A_BR + *ldA * n2_out + n2_out; + + // last row of A_BL + float *const A_BL_b = A_BL + n2_out; + + // last row of Work_BL + float *const Work_BL_b = Work_BL + n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(sgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = SREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + float *const Work_R = top ? Work : Work + *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_ssytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + float *const A_TL_T = A + *ldA * n_rest; + float *const A_TR_T = A + *ldA * (n_rest + n1); + float *const A_TL = A + *ldA * n_rest + n_rest; + float *const A_TR = A + *ldA * (n_rest + n1) + n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + float *const Work_L = Work; + float *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_sgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(sgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_ssytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(sgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/ssytrf_rec2.c b/relapack/src/ssytrf_rec2.c new file mode 100644 index 000000000..edc9269ec --- /dev/null +++ b/relapack/src/ssytrf_rec2.c @@ -0,0 +1,351 @@ +/* -- translated by f2c (version 20100827). 
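After each recursive call in RELAPACK_ssytrf_rec above, the pivot indices returned for the trailing block are block-local; the "shift pivots" loop adds the block offset while preserving the sign that marks a 2-by-2 pivot. A self-contained illustration of that convention (hypothetical values, not from the commit):

    #include <stdio.h>

    /* LAPACK xSYTRF pivot convention: ipiv[i] > 0 records a 1x1 pivot
     * (row interchange with row ipiv[i]); a negative entry marks one
     * half of a 2x2 pivot block. Shifting block-local indices by n1
     * must therefore keep the sign. */
    int main(void) {
        int ipiv_B[4] = { 1, -3, -3, 4 };  /* pivots local to the trailing block */
        const int n1 = 5;                  /* size of the leading block */
        int i;
        for (i = 0; i < 4; i++)
            ipiv_B[i] += (ipiv_B[i] > 0) ? n1 : -n1;
        for (i = 0; i < 4; i++)
            printf("%d ", ipiv_B[i]);      /* prints: 6 -8 -8 9 */
        printf("\n");
        return 0;
    }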
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "f2c.h" + +/* Table of constant values */ + +static int c__1 = 1; +static float c_b8 = -1.f; +static float c_b9 = 1.f; + +/** SSYTRF_REC2 computes a partial factorization of a real symmetric matrix using the Bunch-Kaufman diagon al pivoting method. + * + * This routine is a minor modification of LAPACK's slasyf. + * It serves as an unblocked kernel in the recursive algorithms. + * The blocked BLAS Level 3 updates were removed and moved to the + * recursive algorithm. + * */ +/* Subroutine */ void RELAPACK_ssytrf_rec2(char *uplo, int *n, int * + nb, int *kb, float *a, int *lda, int *ipiv, float *w, + int *ldw, int *info, ftnlen uplo_len) +{ + /* System generated locals */ + int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; + float r__1, r__2, r__3; + + /* Builtin functions */ + double sqrt(double); + + /* Local variables */ + static int j, k; + static float t, r1, d11, d21, d22; + static int jj, kk, jp, kp, kw, kkw, imax, jmax; + static float alpha; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int sscal_(int *, float *, float *, int *), + sgemv_(char *, int *, int *, float *, float *, int *, + float *, int *, float *, float *, int *, ftnlen); + static int kstep; + extern /* Subroutine */ int scopy_(int *, float *, int *, float *, + int *), sswap_(int *, float *, int *, float *, int * + ); + static float absakk; + extern int isamax_(int *, float *, int *); + static float colmax, rowmax; + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --ipiv; + w_dim1 = *ldw; + w_offset = 1 + w_dim1; + w -= w_offset; + + /* Function Body */ + *info = 0; + alpha = (sqrt(17.f) + 1.f) / 8.f; + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + k = *n; +L10: + kw = *nb + k - *n; + if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { + goto L30; + } + scopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); + if (k < *n) { + i__1 = *n - k; + sgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) * a_dim1 + 1], + lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b9, &w[kw * + w_dim1 + 1], &c__1, (ftnlen)12); + } + kstep = 1; + absakk = (r__1 = w[k + kw * w_dim1], dabs(r__1)); + if (k > 1) { + i__1 = k - 1; + imax = isamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); + colmax = (r__1 = w[imax + kw * w_dim1], dabs(r__1)); + } else { + colmax = 0.f; + } + if (dmax(absakk,colmax) == 0.f) { + if (*info == 0) { + *info = k; + } + kp = k; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + scopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + i__1 = k - imax; + scopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + sgemv_("No transpose", &k, &i__1, &c_b8, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b9, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + } + i__1 = k - imax; + jmax = imax + isamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], + &c__1); + rowmax = (r__1 = w[jmax + (kw - 1) * w_dim1], dabs(r__1)); + if (imax > 1) { + i__1 = imax - 1; + jmax = isamax_(&i__1, &w[(kw - 1) * w_dim1 + 
1], &c__1); +/* Computing MAX */ + r__2 = rowmax, r__3 = (r__1 = w[jmax + (kw - 1) * w_dim1], + dabs(r__1)); + rowmax = dmax(r__2,r__3); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else if ((r__1 = w[imax + (kw - 1) * w_dim1], dabs(r__1)) >= + alpha * rowmax) { + kp = imax; + scopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kp != kk) { + a[kp + kp * a_dim1] = a[kk + kk * a_dim1]; + i__1 = kk - 1 - kp; + scopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + if (kp > 1) { + i__1 = kp - 1; + scopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + + 1], &c__1); + } + if (k < *n) { + i__1 = *n - k; + sswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k + + 1) * a_dim1], lda); + } + i__1 = *n - kk + 1; + sswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + scopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + r1 = 1.f / a[k + k * a_dim1]; + i__1 = k - 1; + sscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + } else { + if (k > 2) { + d21 = w[k - 1 + kw * w_dim1]; + d11 = w[k + kw * w_dim1] / d21; + d22 = w[k - 1 + (kw - 1) * w_dim1] / d21; + t = 1.f / (d11 * d22 - 1.f); + d21 = t / d21; + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + a[j + (k - 1) * a_dim1] = d21 * (d11 * w[j + (kw - 1) + * w_dim1] - w[j + kw * w_dim1]); + a[j + k * a_dim1] = d21 * (d22 * w[j + kw * w_dim1] - + w[j + (kw - 1) * w_dim1]); +/* L20: */ + } + } + a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1]; + a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1]; + a[k + k * a_dim1] = w[k + kw * w_dim1]; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -kp; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + jj = j; + jp = ipiv[j]; + if (jp < 0) { + jp = -jp; + ++j; + } + ++j; + if (jp != jj && j <= *n) { + i__1 = *n - j + 1; + sswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda); + } + if (j < *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + i__1 = *n - k + 1; + scopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); + i__1 = *n - k + 1; + i__2 = k - 1; + sgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1], lda, &w[k + + w_dim1], ldw, &c_b9, &w[k + k * w_dim1], &c__1, (ftnlen)12); + kstep = 1; + absakk = (r__1 = w[k + k * w_dim1], dabs(r__1)); + if (k < *n) { + i__1 = *n - k; + imax = k + isamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + colmax = (r__1 = w[imax + k * w_dim1], dabs(r__1)); + } else { + colmax = 0.f; + } + if (dmax(absakk,colmax) == 0.f) { + if (*info == 0) { + *info = k; + } + kp = k; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + i__1 = imax - k; + scopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = *n - imax + 1; + scopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + + 1) * w_dim1], &c__1); + i__1 = *n - k + 1; + i__2 = k - 1; + sgemv_("No transpose", &i__1, &i__2, &c_b8, &a[k + a_dim1], + lda, &w[imax + w_dim1], ldw, &c_b9, &w[k + (k + 1) * + w_dim1], &c__1, (ftnlen)12); + i__1 = imax - k; + jmax = k - 1 + isamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1) + ; + rowmax = (r__1 = w[jmax + (k + 1) * w_dim1], dabs(r__1)); + if (imax < *n) { + i__1 = *n - imax; + jmax = imax + isamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); +/* Computing MAX */ 
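/* Note: f2c expands Fortran's MAX intrinsic into the comma expression
   that follows: the two candidates are stored in the temporaries r__2
   and r__3, and dmax then keeps the larger one in rowmax. */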
+ r__2 = rowmax, r__3 = (r__1 = w[jmax + (k + 1) * w_dim1], + dabs(r__1)); + rowmax = dmax(r__2,r__3); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else if ((r__1 = w[imax + (k + 1) * w_dim1], dabs(r__1)) >= + alpha * rowmax) { + kp = imax; + i__1 = *n - k + 1; + scopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + kk = k + kstep - 1; + if (kp != kk) { + a[kp + kp * a_dim1] = a[kk + kk * a_dim1]; + i__1 = kp - kk - 1; + scopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + + 1) * a_dim1], lda); + if (kp < *n) { + i__1 = *n - kp; + scopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 + + kp * a_dim1], &c__1); + } + if (k > 1) { + i__1 = k - 1; + sswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); + } + sswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); + } + if (kstep == 1) { + i__1 = *n - k + 1; + scopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + if (k < *n) { + r1 = 1.f / a[k + k * a_dim1]; + i__1 = *n - k; + sscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); + } + } else { + if (k < *n - 1) { + d21 = w[k + 1 + k * w_dim1]; + d11 = w[k + 1 + (k + 1) * w_dim1] / d21; + d22 = w[k + k * w_dim1] / d21; + t = 1.f / (d11 * d22 - 1.f); + d21 = t / d21; + i__1 = *n; + for (j = k + 2; j <= i__1; ++j) { + a[j + k * a_dim1] = d21 * (d11 * w[j + k * w_dim1] - + w[j + (k + 1) * w_dim1]); + a[j + (k + 1) * a_dim1] = d21 * (d22 * w[j + (k + 1) * + w_dim1] - w[j + k * w_dim1]); +/* L80: */ + } + } + a[k + k * a_dim1] = w[k + k * w_dim1]; + a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1]; + a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1]; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -kp; + ipiv[k + 1] = -kp; + } + k += kstep; + goto L70; +L90: + j = k - 1; +L120: + jj = j; + jp = ipiv[j]; + if (jp < 0) { + jp = -jp; + --j; + } + --j; + if (jp != jj && j >= 1) { + sswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda); + } + if (j > 1) { + goto L120; + } + *kb = k - 1; + } + return; +} diff --git a/relapack/src/ssytrf_rook.c b/relapack/src/ssytrf_rook.c new file mode 100644 index 000000000..040df2484 --- /dev/null +++ b/relapack/src/ssytrf_rook.c @@ -0,0 +1,236 @@ +#include "relapack.h" +#if XSYTRF_ALLOW_MALLOC +#include <stdlib.h> +#endif + +static void RELAPACK_ssytrf_rook_rec(const char *, const int *, const int *, int *, + float *, const int *, int *, float *, const int *, int *); + + +/** SSYTRF_ROOK computes the factorization of a real symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. + * + * This routine is functionally equivalent to LAPACK's ssytrf_rook. 
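The searches in ssytrf_rec2 above implement the classical Bunch-Kaufman test with alpha = (1 + sqrt(17))/8, the constant that bounds element growth. A stripped-down sketch of the decision logic (hypothetical helper; the real kernel also handles zero columns and forms the candidate columns in W on the fly):

    #include <math.h>

    /* Bunch-Kaufman pivot choice as in ssytrf_rec2/slasyf.
     * absakk: magnitude of the diagonal entry at k; colmax: largest
     * off-diagonal magnitude in column k (attained at row imax);
     * rowmax: largest magnitude in row imax; w_imax: magnitude of the
     * (updated) diagonal entry at imax.
     * Returns 1 for a 1x1 pivot at k, 2 for a 1x1 pivot after swapping
     * in row imax, 3 for a 2x2 pivot pairing k and imax. */
    int bk_pivot(float absakk, float colmax, float rowmax, float w_imax) {
        const float alpha = (sqrtf(17.f) + 1.f) / 8.f;
        if (absakk >= alpha * colmax)
            return 1;                          /* diagonal large enough */
        if (absakk >= alpha * colmax * (colmax / rowmax))
            return 1;                          /* growth still bounded */
        if (w_imax >= alpha * rowmax)
            return 2;                          /* interchange, then 1x1 */
        return 3;                              /* 2x2 pivot */
    }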
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/de/da4/ssytrf__rook_8f.html + * */ +void RELAPACK_ssytrf_rook( + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + float *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * sizeof(float)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("SSYTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy argument + int nout; + + // Recursive kernel + RELAPACK_ssytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** ssytrf_rook's recursive compute kernel */ +static void RELAPACK_ssytrf_rook_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + float *A, const int *ldA, int *ipiv, + float *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_SSYTRF_ROOK, 3)) { + // Unblocked + if (top) { + LAPACK(ssytf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_ssytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const float ONE[] = { 1. }; + const float MONE[] = { -1. }; + const int iONE[] = { 1 }; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = SREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + float *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_ssytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + float *const A_BL = A + n1; + float *const A_BR = A + *ldA * n1 + n1; + float *const A_BL_B = A + *n; + float *const A_BR_B = A + *ldA * n1 + *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + float *const Work_BL = Work + n1; + float *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; + const int ldWork_BR = top ? 
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_sgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(sgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_ssytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + float *const A_BR_r = A_BR + *ldA * n2_out + n2_out; + + // last row of A_BL + float *const A_BL_b = A_BL + n2_out; + + // last row of Work_BL + float *const Work_BL_b = Work_BL + n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(sgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + int i; + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = SREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + float *const Work_R = top ? Work : Work + *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_ssytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + float *const A_TL_T = A + *ldA * n_rest; + float *const A_TR_T = A + *ldA * (n_rest + n1); + float *const A_TL = A + *ldA * n_rest + n_rest; + float *const A_TR = A + *ldA * (n_rest + n1) + n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + float *const Work_L = Work; + float *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_sgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(sgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_ssytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(sgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/ssytrf_rook_rec2.c b/relapack/src/ssytrf_rook_rec2.c new file mode 100644 index 000000000..3308826d7 --- /dev/null +++ b/relapack/src/ssytrf_rook_rec2.c @@ -0,0 +1,451 @@ +/* -- translated by f2c (version 20100827). 
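RELAPACK_ssytrf_rook above follows LAPACK's workspace-query convention: a call with *lWork == -1 only stores the required size (n * (n/2) floats) in Work[0] and returns. A usage sketch under those assumptions (relapack.h is taken to declare the routine; error handling elided):

    #include <stdlib.h>
    #include "relapack.h"

    int main(void) {
        int n = 4, ldA = 4, info, ipiv[4];
        float A[16] = { 4, 1, 2, 0,   1, 5, 0, 3,
                        2, 0, 6, 1,   0, 3, 1, 7 };

        /* 1) workspace query: lWork == -1 returns the size in Work[0] */
        float query;
        int lWork = -1;
        RELAPACK_ssytrf_rook("L", &n, A, &ldA, ipiv, &query, &lWork, &info);

        /* 2) factorization call with the requested workspace */
        lWork = (int)query;
        float *Work = malloc(lWork * sizeof(float));
        RELAPACK_ssytrf_rook("L", &n, A, &ldA, ipiv, Work, &lWork, &info);
        free(Work);
        return info;
    }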
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "f2c.h" + +/* Table of constant values */ + +static int c__1 = 1; +static float c_b9 = -1.f; +static float c_b10 = 1.f; + +/** SSYTRF_ROOK_REC2 computes a partial factorization of a real symmetric matrix using the bounded Bunch-Kaufma n ("rook") diagonal pivoting method. + * + * This routine is a minor modification of LAPACK's slasyf_rook. + * It serves as an unblocked kernel in the recursive algorithms. + * The blocked BLAS Level 3 updates were removed and moved to the + * recursive algorithm. + * */ +/* Subroutine */ void RELAPACK_ssytrf_rook_rec2(char *uplo, int *n, + int *nb, int *kb, float *a, int *lda, int *ipiv, float * + w, int *ldw, int *info, ftnlen uplo_len) +{ + /* System generated locals */ + int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; + float r__1; + + /* Builtin functions */ + double sqrt(double); + + /* Local variables */ + static int j, k, p; + static float t, r1, d11, d12, d21, d22; + static int ii, jj, kk, kp, kw, jp1, jp2, kkw; + static logical done; + static int imax, jmax; + static float alpha; + extern logical lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int sscal_(int *, float *, float *, int *); + static float sfmin; + static int itemp; + extern /* Subroutine */ int sgemv_(char *, int *, int *, float *, + float *, int *, float *, int *, float *, float *, int *, + ftnlen); + static int kstep; + static float stemp; + extern /* Subroutine */ int scopy_(int *, float *, int *, float *, + int *), sswap_(int *, float *, int *, float *, int * + ); + static float absakk; + extern double slamch_(char *, ftnlen); + extern int isamax_(int *, float *, int *); + static float colmax, rowmax; + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --ipiv; + w_dim1 = *ldw; + w_offset = 1 + w_dim1; + w -= w_offset; + + /* Function Body */ + *info = 0; + alpha = (sqrt(17.f) + 1.f) / 8.f; + sfmin = slamch_("S", (ftnlen)1); + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + k = *n; +L10: + kw = *nb + k - *n; + if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { + goto L30; + } + kstep = 1; + p = k; + scopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); + if (k < *n) { + i__1 = *n - k; + sgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) * a_dim1 + 1], + lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b10, &w[kw * + w_dim1 + 1], &c__1, (ftnlen)12); + } + absakk = (r__1 = w[k + kw * w_dim1], dabs(r__1)); + if (k > 1) { + i__1 = k - 1; + imax = isamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); + colmax = (r__1 = w[imax + kw * w_dim1], dabs(r__1)); + } else { + colmax = 0.f; + } + if (dmax(absakk,colmax) == 0.f) { + if (*info == 0) { + *info = k; + } + kp = k; + scopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); + } else { + if (! 
(absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L12: + scopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + i__1 = k - imax; + scopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + sgemv_("No transpose", &k, &i__1, &c_b9, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b10, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + } + if (imax != k) { + i__1 = k - imax; + jmax = imax + isamax_(&i__1, &w[imax + 1 + (kw - 1) * + w_dim1], &c__1); + rowmax = (r__1 = w[jmax + (kw - 1) * w_dim1], dabs(r__1)); + } else { + rowmax = 0.f; + } + if (imax > 1) { + i__1 = imax - 1; + itemp = isamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); + stemp = (r__1 = w[itemp + (kw - 1) * w_dim1], dabs(r__1)); + if (stemp > rowmax) { + rowmax = stemp; + jmax = itemp; + } + } + if (! ((r__1 = w[imax + (kw - 1) * w_dim1], dabs(r__1)) < + alpha * rowmax)) { + kp = imax; + scopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + scopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } + if (! done) { + goto L12; + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kstep == 2 && p != k) { + i__1 = k - p; + scopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * + a_dim1], lda); + scopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], & + c__1); + i__1 = *n - k + 1; + sswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1], + lda); + i__1 = *n - kk + 1; + sswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], + ldw); + } + if (kp != kk) { + a[kp + k * a_dim1] = a[kk + k * a_dim1]; + i__1 = k - 1 - kp; + scopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + scopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], & + c__1); + i__1 = *n - kk + 1; + sswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1], + lda); + i__1 = *n - kk + 1; + sswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + scopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + if (k > 1) { + if ((r__1 = a[k + k * a_dim1], dabs(r__1)) >= sfmin) { + r1 = 1.f / a[k + k * a_dim1]; + i__1 = k - 1; + sscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + } else if (a[k + k * a_dim1] != 0.f) { + i__1 = k - 1; + for (ii = 1; ii <= i__1; ++ii) { + a[ii + k * a_dim1] /= a[k + k * a_dim1]; +/* L14: */ + } + } + } + } else { + if (k > 2) { + d12 = w[k - 1 + kw * w_dim1]; + d11 = w[k + kw * w_dim1] / d12; + d22 = w[k - 1 + (kw - 1) * w_dim1] / d12; + t = 1.f / (d11 * d22 - 1.f); + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + a[j + (k - 1) * a_dim1] = t * ((d11 * w[j + (kw - 1) * + w_dim1] - w[j + kw * w_dim1]) / d12); + a[j + k * a_dim1] = t * ((d22 * w[j + kw * w_dim1] - + w[j + (kw - 1) * w_dim1]) / d12); +/* L20: */ + } + } + a[k - 1 + (k - 1) * a_dim1] = w[k - 1 + (kw - 1) * w_dim1]; + a[k - 1 + k * a_dim1] = w[k - 1 + kw * w_dim1]; + a[k + k * a_dim1] = w[k + kw * w_dim1]; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -p; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + kstep = 1; + jp1 = 1; + jj = j; + jp2 = ipiv[j]; + if (jp2 < 0) { + jp2 = -jp2; + ++j; + jp1 = -ipiv[j]; + kstep = 2; + } + ++j; + if (jp2 != jj && j <= *n) { + i__1 = *n - j + 1; 
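/* Note: this replay loop re-applies the recorded interchanges in
   forward order: a positive ipiv[j] is a single row swap (jp2 below),
   while a negative pair ipiv[j] = -p, ipiv[j+1] = -kp encodes the two
   swaps of one 2-by-2 pivot (jp2, then jp1), advancing j by two. */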
+ sswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + jj = j - 1; + if (jp1 != jj && kstep == 2) { + i__1 = *n - j + 1; + sswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + if (j <= *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + kstep = 1; + p = k; + i__1 = *n - k + 1; + scopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + sgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1], lda, & + w[k + w_dim1], ldw, &c_b10, &w[k + k * w_dim1], &c__1, ( + ftnlen)12); + } + absakk = (r__1 = w[k + k * w_dim1], dabs(r__1)); + if (k < *n) { + i__1 = *n - k; + imax = k + isamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + colmax = (r__1 = w[imax + k * w_dim1], dabs(r__1)); + } else { + colmax = 0.f; + } + if (dmax(absakk,colmax) == 0.f) { + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = *n - k + 1; + scopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + } else { + if (! (absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L72: + i__1 = imax - k; + scopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = *n - imax + 1; + scopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + + 1) * w_dim1], &c__1); + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + sgemv_("No transpose", &i__1, &i__2, &c_b9, &a[k + a_dim1] + , lda, &w[imax + w_dim1], ldw, &c_b10, &w[k + (k + + 1) * w_dim1], &c__1, (ftnlen)12); + } + if (imax != k) { + i__1 = imax - k; + jmax = k - 1 + isamax_(&i__1, &w[k + (k + 1) * w_dim1], & + c__1); + rowmax = (r__1 = w[jmax + (k + 1) * w_dim1], dabs(r__1)); + } else { + rowmax = 0.f; + } + if (imax < *n) { + i__1 = *n - imax; + itemp = imax + isamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); + stemp = (r__1 = w[itemp + (k + 1) * w_dim1], dabs(r__1)); + if (stemp > rowmax) { + rowmax = stemp; + jmax = itemp; + } + } + if (! ((r__1 = w[imax + (k + 1) * w_dim1], dabs(r__1)) < + alpha * rowmax)) { + kp = imax; + i__1 = *n - k + 1; + scopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + i__1 = *n - k + 1; + scopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + } + if (! 
done) { + goto L72; + } + } + kk = k + kstep - 1; + if (kstep == 2 && p != k) { + i__1 = p - k; + scopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1], + lda); + i__1 = *n - p + 1; + scopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], & + c__1); + sswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda); + sswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); + } + if (kp != kk) { + a[kp + k * a_dim1] = a[kk + k * a_dim1]; + i__1 = kp - k - 1; + scopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1) + * a_dim1], lda); + i__1 = *n - kp + 1; + scopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp * + a_dim1], &c__1); + sswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); + sswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); + } + if (kstep == 1) { + i__1 = *n - k + 1; + scopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + if (k < *n) { + if ((r__1 = a[k + k * a_dim1], dabs(r__1)) >= sfmin) { + r1 = 1.f / a[k + k * a_dim1]; + i__1 = *n - k; + sscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); + } else if (a[k + k * a_dim1] != 0.f) { + i__1 = *n; + for (ii = k + 1; ii <= i__1; ++ii) { + a[ii + k * a_dim1] /= a[k + k * a_dim1]; +/* L74: */ + } + } + } + } else { + if (k < *n - 1) { + d21 = w[k + 1 + k * w_dim1]; + d11 = w[k + 1 + (k + 1) * w_dim1] / d21; + d22 = w[k + k * w_dim1] / d21; + t = 1.f / (d11 * d22 - 1.f); + i__1 = *n; + for (j = k + 2; j <= i__1; ++j) { + a[j + k * a_dim1] = t * ((d11 * w[j + k * w_dim1] - w[ + j + (k + 1) * w_dim1]) / d21); + a[j + (k + 1) * a_dim1] = t * ((d22 * w[j + (k + 1) * + w_dim1] - w[j + k * w_dim1]) / d21); +/* L80: */ + } + } + a[k + k * a_dim1] = w[k + k * w_dim1]; + a[k + 1 + k * a_dim1] = w[k + 1 + k * w_dim1]; + a[k + 1 + (k + 1) * a_dim1] = w[k + 1 + (k + 1) * w_dim1]; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -p; + ipiv[k + 1] = -kp; + } + k += kstep; + goto L70; +L90: + j = k - 1; +L120: + kstep = 1; + jp1 = 1; + jj = j; + jp2 = ipiv[j]; + if (jp2 < 0) { + jp2 = -jp2; + --j; + jp1 = -ipiv[j]; + kstep = 2; + } + --j; + if (jp2 != jj && j >= 1) { + sswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda); + } + jj = j + 1; + if (jp1 != jj && kstep == 2) { + sswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda); + } + if (j >= 1) { + goto L120; + } + *kb = k - 1; + } + return; +} diff --git a/relapack/src/stgsyl.c b/relapack/src/stgsyl.c new file mode 100644 index 000000000..1870fb928 --- /dev/null +++ b/relapack/src/stgsyl.c @@ -0,0 +1,274 @@ +#include "relapack.h" +#include <math.h> + +static void RELAPACK_stgsyl_rec(const char *, const int *, const int *, + const int *, const float *, const int *, const float *, const int *, + float *, const int *, const float *, const int *, const float *, + const int *, float *, const int *, float *, float *, float *, int *, int *, + int *); + + +/** STGSYL solves the generalized Sylvester equation. + * + * This routine is functionally equivalent to LAPACK's stgsyl. 
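For reference, STGSYL solves the coupled generalized Sylvester equations for R and L (which overwrite C and F); in the notation of the netlib documentation cited below:

    \begin{aligned}
    \texttt{trans} = \texttt{'N'}: \quad & A R - L B = \sigma C, \quad & D R - L E &= \sigma F,\\
    \texttt{trans} = \texttt{'T'}: \quad & A^{T} R + D^{T} L = \sigma C, \quad & R B^{T} + L E^{T} &= -\sigma F,
    \end{aligned}

where \sigma is the returned scale factor and ijob selects whether a Dif estimate is computed alongside the solve.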
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/dc/d67/stgsyl_8f.html
+ * */
+void RELAPACK_stgsyl(
+    const char *trans, const int *ijob, const int *m, const int *n,
+    const float *A, const int *ldA, const float *B, const int *ldB,
+    float *C, const int *ldC,
+    const float *D, const int *ldD, const float *E, const int *ldE,
+    float *F, const int *ldF,
+    float *scale, float *dif,
+    float *Work, const int *lWork, int *iWork, int *info
+) {
+
+    // Parse arguments
+    const int notran = LAPACK(lsame)(trans, "N");
+    const int tran = LAPACK(lsame)(trans, "T");
+
+    // Compute work buffer size
+    int lwmin = 1;
+    if (notran && (*ijob == 1 || *ijob == 2))
+        lwmin = MAX(1, 2 * *m * *n);
+    *info = 0;
+
+    // Check arguments
+    if (!tran && !notran)
+        *info = -1;
+    else if (notran && (*ijob < 0 || *ijob > 4))
+        *info = -2;
+    else if (*m <= 0)
+        *info = -3;
+    else if (*n <= 0)
+        *info = -4;
+    else if (*ldA < MAX(1, *m))
+        *info = -6;
+    else if (*ldB < MAX(1, *n))
+        *info = -8;
+    else if (*ldC < MAX(1, *m))
+        *info = -10;
+    else if (*ldD < MAX(1, *m))
+        *info = -12;
+    else if (*ldE < MAX(1, *n))
+        *info = -14;
+    else if (*ldF < MAX(1, *m))
+        *info = -16;
+    else if (*lWork < lwmin && *lWork != -1)
+        *info = -20;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("STGSYL", &minfo);
+        return;
+    }
+
+    if (*lWork == -1) {
+        // Work size query
+        *Work = lwmin;
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleantrans = notran ? 'N' : 'T';
+
+    // Constant
+    const float ZERO[] = { 0. };
+
+    int isolve = 1;
+    int ifunc = 0;
+    if (notran) {
+        if (*ijob >= 3) {
+            ifunc = *ijob - 2;
+            LAPACK(slaset)("F", m, n, ZERO, ZERO, C, ldC);
+            LAPACK(slaset)("F", m, n, ZERO, ZERO, F, ldF);
+        } else if (*ijob >= 1)
+            isolve = 2;
+    }
+
+    float scale2;
+    int iround;
+    for (iround = 1; iround <= isolve; iround++) {
+        *scale = 1;
+        float dscale = 0;
+        float dsum = 1;
+        int pq;
+        RELAPACK_stgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, iWork, &pq, info);
+        if (dscale != 0) {
+            if (*ijob == 1 || *ijob == 3)
+                *dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum));
+            else
+                *dif = sqrt(pq) / (dscale * sqrt(dsum));
+        }
+        if (isolve == 2) {
+            if (iround == 1) {
+                if (notran)
+                    ifunc = *ijob;
+                scale2 = *scale;
+                LAPACK(slacpy)("F", m, n, C, ldC, Work, m);
+                LAPACK(slacpy)("F", m, n, F, ldF, Work + *m * *n, m);
+                LAPACK(slaset)("F", m, n, ZERO, ZERO, C, ldC);
+                LAPACK(slaset)("F", m, n, ZERO, ZERO, F, ldF);
+            } else {
+                LAPACK(slacpy)("F", m, n, Work, m, C, ldC);
+                LAPACK(slacpy)("F", m, n, Work + *m * *n, m, F, ldF);
+                *scale = scale2;
+            }
+        }
+    }
+}
+
+
+/** stgsyl's recursive compute kernel */
+static void RELAPACK_stgsyl_rec(
+    const char *trans, const int *ifunc, const int *m, const int *n,
+    const float *A, const int *ldA, const float *B, const int *ldB,
+    float *C, const int *ldC,
+    const float *D, const int *ldD, const float *E, const int *ldE,
+    float *F, const int *ldF,
+    float *scale, float *dsum, float *dscale,
+    int *iWork, int *pq, int *info
+) {
+
+    if (*m <= MAX(CROSSOVER_STGSYL, 1) && *n <= MAX(CROSSOVER_STGSYL, 1)) {
+        // Unblocked
+        LAPACK(stgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, iWork, pq, info);
+        return;
+    }
+
+    // Constants
+    const float ONE[] = { 1. };
+    const float MONE[] = { -1. };
+    const int iONE[] = { 1 };
+
+    // Outputs
+    float scale1[] = { 1. };
+    float scale2[] = { 1.
}; + int info1[] = { 0 }; + int info2[] = { 0 }; + + if (*m > *n) { + // Splitting + int m1 = SREC_SPLIT(*m); + if (A[m1 + *ldA * (m1 - 1)]) + m1++; + const int m2 = *m - m1; + + // A_TL A_TR + // 0 A_BR + const float *const A_TL = A; + const float *const A_TR = A + *ldA * m1; + const float *const A_BR = A + *ldA * m1 + m1; + + // C_T + // C_B + float *const C_T = C; + float *const C_B = C + m1; + + // D_TL D_TR + // 0 D_BR + const float *const D_TL = D; + const float *const D_TR = D + *ldD * m1; + const float *const D_BR = D + *ldD * m1 + m1; + + // F_T + // F_B + float *const F_T = F; + float *const F_B = F + m1; + + if (*trans == 'N') { + // recursion(A_BR, B, C_B, D_BR, E, F_B) + RELAPACK_stgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, iWork, pq, info1); + // C_T = C_T - A_TR * C_B + BLAS(sgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC); + // F_T = F_T - D_TR * C_B + BLAS(sgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF); + // recursion(A_TL, B, C_T, D_TL, E, F_T) + RELAPACK_stgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, iWork, pq, info2); + // apply scale + if (scale2[0] != 1) { + LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info); + LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info); + } + } else { + // recursion(A_TL, B, C_T, D_TL, E, F_T) + RELAPACK_stgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, iWork, pq, info1); + // apply scale + if (scale1[0] != 1) + LAPACK(slascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info); + // C_B = C_B - A_TR^H * C_T + BLAS(sgemm)("T", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC); + // C_B = C_B - D_TR^H * F_T + BLAS(sgemm)("T", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC); + // recursion(A_BR, B, C_B, D_BR, E, F_B) + RELAPACK_stgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, iWork, pq, info2); + // apply scale + if (scale2[0] != 1) { + LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info); + LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info); + } + } + } else { + // Splitting + int n1 = SREC_SPLIT(*n); + if (B[n1 + *ldB * (n1 - 1)]) + n1++; + const int n2 = *n - n1; + + // B_TL B_TR + // 0 B_BR + const float *const B_TL = B; + const float *const B_TR = B + *ldB * n1; + const float *const B_BR = B + *ldB * n1 + n1; + + // C_L C_R + float *const C_L = C; + float *const C_R = C + *ldC * n1; + + // E_TL E_TR + // 0 E_BR + const float *const E_TL = E; + const float *const E_TR = E + *ldE * n1; + const float *const E_BR = E + *ldE * n1 + n1; + + // F_L F_R + float *const F_L = F; + float *const F_R = F + *ldF * n1; + + if (*trans == 'N') { + // recursion(A, B_TL, C_L, D, E_TL, F_L) + RELAPACK_stgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, iWork, pq, info1); + // C_R = C_R + F_L * B_TR + BLAS(sgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC); + // F_R = F_R + F_L * E_TR + BLAS(sgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF); + // recursion(A, B_BR, C_R, D, E_BR, F_R) + RELAPACK_stgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, iWork, pq, info2); + // apply scale + if 
(scale2[0] != 1) { + LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info); + LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info); + } + } else { + // recursion(A, B_BR, C_R, D, E_BR, F_R) + RELAPACK_stgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, iWork, pq, info1); + // apply scale + if (scale1[0] != 1) + LAPACK(slascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info); + // F_L = F_L + C_R * B_TR + BLAS(sgemm)("N", "T", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF); + // F_L = F_L + F_R * E_TR + BLAS(sgemm)("N", "T", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF); + // recursion(A, B_TL, C_L, D, E_TL, F_L) + RELAPACK_stgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, iWork, pq, info2); + // apply scale + if (scale2[0] != 1) { + LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info); + LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info); + } + } + } + + *scale = scale1[0] * scale2[0]; + *info = info1[0] || info2[0]; +} diff --git a/relapack/src/strsyl.c b/relapack/src/strsyl.c new file mode 100644 index 000000000..83947ef1a --- /dev/null +++ b/relapack/src/strsyl.c @@ -0,0 +1,169 @@ +#include "relapack.h" + +static void RELAPACK_strsyl_rec(const char *, const char *, const int *, + const int *, const int *, const float *, const int *, const float *, + const int *, float *, const int *, float *, int *); + + +/** STRSYL solves the real Sylvester matrix equation. + * + * This routine is functionally equivalent to LAPACK's strsyl. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d4/d7d/strsyl_8f.html + * */ +void RELAPACK_strsyl( + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, float *scale, + int *info +) { + + // Check arguments + const int notransA = LAPACK(lsame)(tranA, "N"); + const int transA = LAPACK(lsame)(tranA, "T"); + const int ctransA = LAPACK(lsame)(tranA, "C"); + const int notransB = LAPACK(lsame)(tranB, "N"); + const int transB = LAPACK(lsame)(tranB, "T"); + const int ctransB = LAPACK(lsame)(tranB, "C"); + *info = 0; + if (!transA && !ctransA && !notransA) + *info = -1; + else if (!transB && !ctransB && !notransB) + *info = -2; + else if (*isgn != 1 && *isgn != -1) + *info = -3; + else if (*m < 0) + *info = -4; + else if (*n < 0) + *info = -5; + else if (*ldA < MAX(1, *m)) + *info = -7; + else if (*ldB < MAX(1, *n)) + *info = -9; + else if (*ldC < MAX(1, *m)) + *info = -11; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("STRSYL", &minfo); + return; + } + + // Clean char * arguments + const char cleantranA = notransA ? 'N' : (transA ? 'T' : 'C'); + const char cleantranB = notransB ? 'N' : (transB ? 
'T' : 'C'); + + // Recursive kernel + RELAPACK_strsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); +} + + +/** strsyl's recursive compute kernel */ +static void RELAPACK_strsyl_rec( + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, float *scale, + int *info +) { + + if (*m <= MAX(CROSSOVER_STRSYL, 1) && *n <= MAX(CROSSOVER_STRSYL, 1)) { + // Unblocked + RELAPACK_strsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); + return; + } + + // Constants + const float ONE[] = { 1. }; + const float MONE[] = { -1. }; + const float MSGN[] = { -*isgn }; + const int iONE[] = { 1 }; + + // Outputs + float scale1[] = { 1. }; + float scale2[] = { 1. }; + int info1[] = { 0 }; + int info2[] = { 0 }; + + if (*m > *n) { + // Splitting + int m1 = SREC_SPLIT(*m); + if (A[m1 + *ldA * (m1 - 1)]) + m1++; + const int m2 = *m - m1; + + // A_TL A_TR + // 0 A_BR + const float *const A_TL = A; + const float *const A_TR = A + *ldA * m1; + const float *const A_BR = A + *ldA * m1 + m1; + + // C_T + // C_B + float *const C_T = C; + float *const C_B = C + m1; + + if (*tranA == 'N') { + // recusion(A_BR, B, C_B) + RELAPACK_strsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1); + // C_T = C_T - A_TR * C_B + BLAS(sgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC); + // recusion(A_TL, B, C_T) + RELAPACK_strsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2); + // apply scale + if (scale2[0] != 1) + LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info); + } else { + // recusion(A_TL, B, C_T) + RELAPACK_strsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1); + // C_B = C_B - A_TR' * C_T + BLAS(sgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC); + // recusion(A_BR, B, C_B) + RELAPACK_strsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2); + // apply scale + if (scale2[0] != 1) + LAPACK(slascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_B, ldC, info); + } + } else { + // Splitting + int n1 = SREC_SPLIT(*n); + if (B[n1 + *ldB * (n1 - 1)]) + n1++; + const int n2 = *n - n1; + + // B_TL B_TR + // 0 B_BR + const float *const B_TL = B; + const float *const B_TR = B + *ldB * n1; + const float *const B_BR = B + *ldB * n1 + n1; + + // C_L C_R + float *const C_L = C; + float *const C_R = C + *ldC * n1; + + if (*tranB == 'N') { + // recusion(A, B_TL, C_L) + RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1); + // C_R = C_R -/+ C_L * B_TR + BLAS(sgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC); + // recusion(A, B_BR, C_R) + RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2); + // apply scale + if (scale2[0] != 1) + LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info); + } else { + // recusion(A, B_BR, C_R) + RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1); + // C_L = C_L -/+ C_R * B_TR' + BLAS(sgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC); + // recusion(A, B_TL, C_L) + RELAPACK_strsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2); + // apply scale + if (scale2[0] != 1) + LAPACK(slascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info); + } + } + + *scale = 
scale1[0] * scale2[0]; + *info = info1[0] || info2[0]; +} diff --git a/relapack/src/strsyl_rec2.c b/relapack/src/strsyl_rec2.c new file mode 100644 index 000000000..6d40a475d --- /dev/null +++ b/relapack/src/strsyl_rec2.c @@ -0,0 +1,1029 @@ +/* -- translated by f2c (version 20100827). + You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "f2c.h" + +/* Table of constant values */ + +static int c__1 = 1; +static int c_false = FALSE_; +static int c__2 = 2; +static float c_b26 = 1.f; +static float c_b30 = 0.f; +static int c_true = TRUE_; + +void RELAPACK_strsyl_rec2(char *trana, char *tranb, int *isgn, int + *m, int *n, float *a, int *lda, float *b, int *ldb, float * + c__, int *ldc, float *scale, int *info, ftnlen trana_len, + ftnlen tranb_len) +{ + /* System generated locals */ + int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4; + float r__1, r__2; + + /* Local variables */ + static int j, k, l; + static float x[4] /* was [2][2] */; + static int k1, k2, l1, l2; + static float a11, db, da11, vec[4] /* was [2][2] */, dum[1], eps, sgn; + static int ierr; + static float smin; + extern float sdot_(int *, float *, int *, float *, int *); + static float suml, sumr; + extern int lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int sscal_(int *, float *, float *, int *); + static int knext, lnext; + static float xnorm; + extern /* Subroutine */ int slaln2_(int *, int *, int *, float + *, float *, float *, int *, float *, float *, float *, int *, + float *, float *, float *, int *, float *, float *, int *), + slasy2_(int *, int *, int *, int *, int *, + float *, int *, float *, int *, float *, int *, float *, + float *, int *, float *, int *), slabad_(float *, float *); + static float scaloc; + extern float slamch_(char *, ftnlen), slange_(char *, int *, + int *, float *, int *, float *, ftnlen); + extern /* Subroutine */ int xerbla_(char *, int *, ftnlen); + static float bignum; + static int notrna, notrnb; + static float smlnum; + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1; + c__ -= c_offset; + + /* Function Body */ + notrna = lsame_(trana, "N", (ftnlen)1, (ftnlen)1); + notrnb = lsame_(tranb, "N", (ftnlen)1, (ftnlen)1); + *info = 0; + if (! notrna && ! lsame_(trana, "T", (ftnlen)1, (ftnlen)1) && ! lsame_( + trana, "C", (ftnlen)1, (ftnlen)1)) { + *info = -1; + } else if (! notrnb && ! lsame_(tranb, "T", (ftnlen)1, (ftnlen)1) && ! 
+ lsame_(tranb, "C", (ftnlen)1, (ftnlen)1)) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < max(1,*m)) { + *info = -7; + } else if (*ldb < max(1,*n)) { + *info = -9; + } else if (*ldc < max(1,*m)) { + *info = -11; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("STRSYL", &i__1, (ftnlen)6); + return; + } + *scale = 1.f; + if (*m == 0 || *n == 0) { + return; + } + eps = slamch_("P", (ftnlen)1); + smlnum = slamch_("S", (ftnlen)1); + bignum = 1.f / smlnum; + slabad_(&smlnum, &bignum); + smlnum = smlnum * (float) (*m * *n) / eps; + bignum = 1.f / smlnum; +/* Computing MAX */ + r__1 = smlnum, r__2 = eps * slange_("M", m, m, &a[a_offset], lda, dum, ( + ftnlen)1), r__1 = max(r__1,r__2), r__2 = eps * slange_("M", n, n, + &b[b_offset], ldb, dum, (ftnlen)1); + smin = dmax(r__1,r__2); + sgn = (float) (*isgn); + if (notrna && notrnb) { + lnext = 1; + i__1 = *n; + for (l = 1; l <= i__1; ++l) { + if (l < lnext) { + goto L70; + } + if (l == *n) { + l1 = l; + l2 = l; + } else { + if (b[l + 1 + l * b_dim1] != 0.f) { + l1 = l; + l2 = l + 1; + lnext = l + 2; + } else { + l1 = l; + l2 = l; + lnext = l + 1; + } + } + knext = *m; + for (k = *m; k >= 1; --k) { + if (k > knext) { + goto L60; + } + if (k == 1) { + k1 = k; + k2 = k; + } else { + if (a[k + (k - 1) * a_dim1] != 0.f) { + k1 = k - 1; + k2 = k; + knext = k - 2; + } else { + k1 = k; + k2 = k; + knext = k - 1; + } + } + if (l1 == l2 && k1 == k2) { + i__2 = *m - k1; +/* Computing MIN */ + i__3 = k1 + 1; +/* Computing MIN */ + i__4 = k1 + 1; + suml = sdot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = sdot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + scaloc = 1.f; + a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1]; + da11 = dabs(a11); + if (da11 <= smin) { + a11 = smin; + da11 = smin; + *info = 1; + } + db = dabs(vec[0]); + if (da11 < 1.f && db > 1.f) { + if (db > bignum * da11) { + scaloc = 1.f / db; + } + } + x[0] = vec[0] * scaloc / a11; + if (scaloc != 1.f) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L10: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + } else if (l1 == l2 && k1 != k2) { + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = sdot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = sdot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = sdot_(&i__2, &a[k2 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = sdot_(&i__2, &c__[k2 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + r__1 = -sgn * b[l1 + l1 * b_dim1]; + slaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1 + * a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &r__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L20: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k2 + l1 * c_dim1] = 
x[1]; + } else if (l1 != l2 && k1 == k2) { + i__2 = *m - k1; +/* Computing MIN */ + i__3 = k1 + 1; +/* Computing MIN */ + i__4 = k1 + 1; + suml = sdot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = sdot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn * + sumr)); + i__2 = *m - k1; +/* Computing MIN */ + i__3 = k1 + 1; +/* Computing MIN */ + i__4 = k1 + 1; + suml = sdot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l2 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = sdot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn * + sumr)); + r__1 = -sgn * a[k1 + k1 * a_dim1]; + slaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1 * + b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &r__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L40: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 != k2) { + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = sdot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = sdot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = sdot_(&i__2, &a[k1 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l2 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = sdot_(&i__2, &c__[k1 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr); + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = sdot_(&i__2, &a[k2 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l1 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = sdot_(&i__2, &c__[k2 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = *m - k2; +/* Computing MIN */ + i__3 = k2 + 1; +/* Computing MIN */ + i__4 = k2 + 1; + suml = sdot_(&i__2, &a[k2 + min(i__3,*m) * a_dim1], lda, & + c__[min(i__4,*m) + l2 * c_dim1], &c__1); + i__2 = l1 - 1; + sumr = sdot_(&i__2, &c__[k2 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr); + slasy2_(&c_false, &c_false, isgn, &c__2, &c__2, &a[k1 + + k1 * a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec, + &c__2, &scaloc, x, &c__2, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L50: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[2]; + c__[k2 + l1 * c_dim1] = x[1]; + c__[k2 + l2 * c_dim1] = x[3]; + } +L60: + ; + } +L70: + ; + } + } else if (! 
notrna && notrnb) { + lnext = 1; + i__1 = *n; + for (l = 1; l <= i__1; ++l) { + if (l < lnext) { + goto L130; + } + if (l == *n) { + l1 = l; + l2 = l; + } else { + if (b[l + 1 + l * b_dim1] != 0.f) { + l1 = l; + l2 = l + 1; + lnext = l + 2; + } else { + l1 = l; + l2 = l; + lnext = l + 1; + } + } + knext = 1; + i__2 = *m; + for (k = 1; k <= i__2; ++k) { + if (k < knext) { + goto L120; + } + if (k == *m) { + k1 = k; + k2 = k; + } else { + if (a[k + 1 + k * a_dim1] != 0.f) { + k1 = k; + k2 = k + 1; + knext = k + 2; + } else { + k1 = k; + k2 = k; + knext = k + 1; + } + } + if (l1 == l2 && k1 == k2) { + i__3 = k1 - 1; + suml = sdot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = sdot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + scaloc = 1.f; + a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1]; + da11 = dabs(a11); + if (da11 <= smin) { + a11 = smin; + da11 = smin; + *info = 1; + } + db = dabs(vec[0]); + if (da11 < 1.f && db > 1.f) { + if (db > bignum * da11) { + scaloc = 1.f / db; + } + } + x[0] = vec[0] * scaloc / a11; + if (scaloc != 1.f) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L80: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + } else if (l1 == l2 && k1 != k2) { + i__3 = k1 - 1; + suml = sdot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = sdot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__3 = k1 - 1; + suml = sdot_(&i__3, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = sdot_(&i__3, &c__[k2 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + r__1 = -sgn * b[l1 + l1 * b_dim1]; + slaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1 * + a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &r__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L90: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k2 + l1 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 == k2) { + i__3 = k1 - 1; + suml = sdot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = sdot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn * + sumr)); + i__3 = k1 - 1; + suml = sdot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = sdot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn * + sumr)); + r__1 = -sgn * a[k1 + k1 * a_dim1]; + slaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1 * + b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &r__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L100: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 != k2) { + i__3 = k1 - 1; + suml = sdot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = sdot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l1 
* + b_dim1 + 1], &c__1); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__3 = k1 - 1; + suml = sdot_(&i__3, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = sdot_(&i__3, &c__[k1 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr); + i__3 = k1 - 1; + suml = sdot_(&i__3, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = sdot_(&i__3, &c__[k2 + c_dim1], ldc, &b[l1 * + b_dim1 + 1], &c__1); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + i__3 = k1 - 1; + suml = sdot_(&i__3, &a[k2 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__3 = l1 - 1; + sumr = sdot_(&i__3, &c__[k2 + c_dim1], ldc, &b[l2 * + b_dim1 + 1], &c__1); + vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr); + slasy2_(&c_true, &c_false, isgn, &c__2, &c__2, &a[k1 + k1 + * a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec, & + c__2, &scaloc, x, &c__2, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L110: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[2]; + c__[k2 + l1 * c_dim1] = x[1]; + c__[k2 + l2 * c_dim1] = x[3]; + } +L120: + ; + } +L130: + ; + } + } else if (! notrna && ! notrnb) { + lnext = *n; + for (l = *n; l >= 1; --l) { + if (l > lnext) { + goto L190; + } + if (l == 1) { + l1 = l; + l2 = l; + } else { + if (b[l + (l - 1) * b_dim1] != 0.f) { + l1 = l - 1; + l2 = l; + lnext = l - 2; + } else { + l1 = l; + l2 = l; + lnext = l - 1; + } + } + knext = 1; + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + if (k < knext) { + goto L180; + } + if (k == *m) { + k1 = k; + k2 = k; + } else { + if (a[k + 1 + k * a_dim1] != 0.f) { + k1 = k; + k2 = k + 1; + knext = k + 2; + } else { + k1 = k; + k2 = k; + knext = k + 1; + } + } + if (l1 == l2 && k1 == k2) { + i__2 = k1 - 1; + suml = sdot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l1; +/* Computing MIN */ + i__3 = l1 + 1; +/* Computing MIN */ + i__4 = l1 + 1; + sumr = sdot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + scaloc = 1.f; + a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1]; + da11 = dabs(a11); + if (da11 <= smin) { + a11 = smin; + da11 = smin; + *info = 1; + } + db = dabs(vec[0]); + if (da11 < 1.f && db > 1.f) { + if (db > bignum * da11) { + scaloc = 1.f / db; + } + } + x[0] = vec[0] * scaloc / a11; + if (scaloc != 1.f) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L140: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + } else if (l1 == l2 && k1 != k2) { + i__2 = k1 - 1; + suml = sdot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = sdot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = k1 - 1; + suml = sdot_(&i__2, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = sdot_(&i__2, &c__[k2 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + r__1 = -sgn * 
b[l1 + l1 * b_dim1]; + slaln2_(&c_true, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1 * + a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &r__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L150: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k2 + l1 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 == k2) { + i__2 = k1 - 1; + suml = sdot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = sdot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn * + sumr)); + i__2 = k1 - 1; + suml = sdot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = sdot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l2 + min(i__4,*n) * b_dim1], ldb); + vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn * + sumr)); + r__1 = -sgn * a[k1 + k1 * a_dim1]; + slaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1 + * b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &r__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L160: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 != k2) { + i__2 = k1 - 1; + suml = sdot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = sdot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = k1 - 1; + suml = sdot_(&i__2, &a[k1 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = sdot_(&i__2, &c__[k1 + min(i__3,*n) * c_dim1], ldc, + &b[l2 + min(i__4,*n) * b_dim1], ldb); + vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr); + i__2 = k1 - 1; + suml = sdot_(&i__2, &a[k2 * a_dim1 + 1], &c__1, &c__[l1 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = sdot_(&i__2, &c__[k2 + min(i__3,*n) * c_dim1], ldc, + &b[l1 + min(i__4,*n) * b_dim1], ldb); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + i__2 = k1 - 1; + suml = sdot_(&i__2, &a[k2 * a_dim1 + 1], &c__1, &c__[l2 * + c_dim1 + 1], &c__1); + i__2 = *n - l2; +/* Computing MIN */ + i__3 = l2 + 1; +/* Computing MIN */ + i__4 = l2 + 1; + sumr = sdot_(&i__2, &c__[k2 + min(i__3,*n) * c_dim1], ldc, + &b[l2 + min(i__4,*n) * b_dim1], ldb); + vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr); + slasy2_(&c_true, &c_true, isgn, &c__2, &c__2, &a[k1 + k1 * + a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec, & + c__2, &scaloc, x, &c__2, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L170: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[2]; + c__[k2 + l1 * c_dim1] = x[1]; + c__[k2 + l2 * 
c_dim1] = x[3]; + } +L180: + ; + } +L190: + ; + } + } else if (notrna && ! notrnb) { + lnext = *n; + for (l = *n; l >= 1; --l) { + if (l > lnext) { + goto L250; + } + if (l == 1) { + l1 = l; + l2 = l; + } else { + if (b[l + (l - 1) * b_dim1] != 0.f) { + l1 = l - 1; + l2 = l; + lnext = l - 2; + } else { + l1 = l; + l2 = l; + lnext = l - 1; + } + } + knext = *m; + for (k = *m; k >= 1; --k) { + if (k > knext) { + goto L240; + } + if (k == 1) { + k1 = k; + k2 = k; + } else { + if (a[k + (k - 1) * a_dim1] != 0.f) { + k1 = k - 1; + k2 = k; + knext = k - 2; + } else { + k1 = k; + k2 = k; + knext = k - 1; + } + } + if (l1 == l2 && k1 == k2) { + i__1 = *m - k1; +/* Computing MIN */ + i__2 = k1 + 1; +/* Computing MIN */ + i__3 = k1 + 1; + suml = sdot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l1; +/* Computing MIN */ + i__2 = l1 + 1; +/* Computing MIN */ + i__3 = l1 + 1; + sumr = sdot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + scaloc = 1.f; + a11 = a[k1 + k1 * a_dim1] + sgn * b[l1 + l1 * b_dim1]; + da11 = dabs(a11); + if (da11 <= smin) { + a11 = smin; + da11 = smin; + *info = 1; + } + db = dabs(vec[0]); + if (da11 < 1.f && db > 1.f) { + if (db > bignum * da11) { + scaloc = 1.f / db; + } + } + x[0] = vec[0] * scaloc / a11; + if (scaloc != 1.f) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L200: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + } else if (l1 == l2 && k1 != k2) { + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = sdot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = sdot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = sdot_(&i__1, &a[k2 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = sdot_(&i__1, &c__[k2 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + r__1 = -sgn * b[l1 + l1 * b_dim1]; + slaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &a[k1 + k1 + * a_dim1], lda, &c_b26, &c_b26, vec, &c__2, &r__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L210: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k2 + l1 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 == k2) { + i__1 = *m - k1; +/* Computing MIN */ + i__2 = k1 + 1; +/* Computing MIN */ + i__3 = k1 + 1; + suml = sdot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = sdot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[0] = sgn * (c__[k1 + l1 * c_dim1] - (suml + sgn * + sumr)); + i__1 = *m - k1; +/* Computing MIN */ + i__2 = k1 + 1; +/* Computing MIN */ + 
i__3 = k1 + 1; + suml = sdot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l2 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = sdot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l2 + min(i__3,*n) * b_dim1], ldb); + vec[1] = sgn * (c__[k1 + l2 * c_dim1] - (suml + sgn * + sumr)); + r__1 = -sgn * a[k1 + k1 * a_dim1]; + slaln2_(&c_false, &c__2, &c__1, &smin, &c_b26, &b[l1 + l1 + * b_dim1], ldb, &c_b26, &c_b26, vec, &c__2, &r__1, + &c_b30, x, &c__2, &scaloc, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L220: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[1]; + } else if (l1 != l2 && k1 != k2) { + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = sdot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = sdot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[0] = c__[k1 + l1 * c_dim1] - (suml + sgn * sumr); + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = sdot_(&i__1, &a[k1 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l2 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = sdot_(&i__1, &c__[k1 + min(i__2,*n) * c_dim1], ldc, + &b[l2 + min(i__3,*n) * b_dim1], ldb); + vec[2] = c__[k1 + l2 * c_dim1] - (suml + sgn * sumr); + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = sdot_(&i__1, &a[k2 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l1 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = sdot_(&i__1, &c__[k2 + min(i__2,*n) * c_dim1], ldc, + &b[l1 + min(i__3,*n) * b_dim1], ldb); + vec[1] = c__[k2 + l1 * c_dim1] - (suml + sgn * sumr); + i__1 = *m - k2; +/* Computing MIN */ + i__2 = k2 + 1; +/* Computing MIN */ + i__3 = k2 + 1; + suml = sdot_(&i__1, &a[k2 + min(i__2,*m) * a_dim1], lda, & + c__[min(i__3,*m) + l2 * c_dim1], &c__1); + i__1 = *n - l2; +/* Computing MIN */ + i__2 = l2 + 1; +/* Computing MIN */ + i__3 = l2 + 1; + sumr = sdot_(&i__1, &c__[k2 + min(i__2,*n) * c_dim1], ldc, + &b[l2 + min(i__3,*n) * b_dim1], ldb); + vec[3] = c__[k2 + l2 * c_dim1] - (suml + sgn * sumr); + slasy2_(&c_false, &c_true, isgn, &c__2, &c__2, &a[k1 + k1 + * a_dim1], lda, &b[l1 + l1 * b_dim1], ldb, vec, & + c__2, &scaloc, x, &c__2, &xnorm, &ierr); + if (ierr != 0) { + *info = 1; + } + if (scaloc != 1.f) { + i__1 = *n; + for (j = 1; j <= i__1; ++j) { + sscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L230: */ + } + *scale *= scaloc; + } + c__[k1 + l1 * c_dim1] = x[0]; + c__[k1 + l2 * c_dim1] = x[2]; + c__[k2 + l1 * c_dim1] = x[1]; + c__[k2 + l2 * c_dim1] = x[3]; + } +L240: + ; + } +L250: + ; + } + } +} diff --git a/relapack/src/strtri.c b/relapack/src/strtri.c new file mode 100644 index 000000000..d35bbd49f --- /dev/null +++ b/relapack/src/strtri.c @@ -0,0 +1,107 @@ +#include "relapack.h" + +static void RELAPACK_strtri_rec(const char *, const char *, const int *, + float *, const int *, int *); + + +/** CTRTRI computes the inverse of a real upper or lower 
triangular matrix A.
+ *
+ * This routine is functionally equivalent to LAPACK's strtri.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/de/d76/strtri_8f.html
+ * */
+void RELAPACK_strtri(
+    const char *uplo, const char *diag, const int *n,
+    float *A, const int *ldA,
+    int *info
+) {
+
+    // Check arguments
+    const int lower = LAPACK(lsame)(uplo, "L");
+    const int upper = LAPACK(lsame)(uplo, "U");
+    const int nounit = LAPACK(lsame)(diag, "N");
+    const int unit = LAPACK(lsame)(diag, "U");
+    *info = 0;
+    if (!lower && !upper)
+        *info = -1;
+    else if (!nounit && !unit)
+        *info = -2;
+    else if (*n < 0)
+        *info = -3;
+    else if (*ldA < MAX(1, *n))
+        *info = -5;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("STRTRI", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleanuplo = lower ? 'L' : 'U';
+    const char cleandiag = nounit ? 'N' : 'U';
+
+    // check for singularity (LAPACK's info convention is 1-based)
+    if (nounit) {
+        int i;
+        for (i = 0; i < *n; i++)
+            if (A[i + *ldA * i] == 0) {
+                *info = i + 1;
+                return;
+            }
+    }
+
+    // Recursive kernel
+    RELAPACK_strtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info);
+}
+
+
+/** strtri's recursive compute kernel */
+static void RELAPACK_strtri_rec(
+    const char *uplo, const char *diag, const int *n,
+    float *A, const int *ldA,
+    int *info
+){
+
+    if (*n <= MAX(CROSSOVER_STRTRI, 1)) {
+        // Unblocked
+        LAPACK(strti2)(uplo, diag, n, A, ldA, info);
+        return;
+    }
+
+    // Constants
+    const float ONE[] = { 1. };
+    const float MONE[] = { -1. };
+
+    // Splitting
+    const int n1 = SREC_SPLIT(*n);
+    const int n2 = *n - n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    float *const A_TL = A;
+    float *const A_TR = A + *ldA * n1;
+    float *const A_BL = A + n1;
+    float *const A_BR = A + *ldA * n1 + n1;
+
+    // recursion(A_TL)
+    RELAPACK_strtri_rec(uplo, diag, &n1, A_TL, ldA, info);
+    if (*info)
+        return;
+
+    if (*uplo == 'L') {
+        // A_BL = - A_BL * A_TL
+        BLAS(strmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA);
+        // A_BL = A_BR \ A_BL
+        BLAS(strsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
+    } else {
+        // A_TR = - A_TL * A_TR
+        BLAS(strmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA);
+        // A_TR = A_TR / A_BR
+        BLAS(strsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
+    }
+
+    // recursion(A_BR)
+    RELAPACK_strtri_rec(uplo, diag, &n2, A_BR, ldA, info);
+    if (*info)
+        *info += n1;
+}
diff --git a/relapack/src/zgbtrf.c b/relapack/src/zgbtrf.c
new file mode 100644
index 000000000..3aa6bf531
--- /dev/null
+++ b/relapack/src/zgbtrf.c
@@ -0,0 +1,230 @@
+#include "relapack.h"
+#include "stdlib.h"
+
+static void RELAPACK_zgbtrf_rec(const int *, const int *, const int *,
+    const int *, double *, const int *, int *, double *, const int *, double *,
+    const int *, int *);
+
+
+/** ZGBTRF computes an LU factorization of a complex m-by-n band matrix A using partial pivoting with row interchanges.
+ *
+ * This routine is functionally equivalent to LAPACK's zgbtrf.
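+ *
+ * A hedged usage sketch (the values below are illustrative and not part
+ * of the original source): LAPACK band storage packs entry A(i,j) into
+ * row kl + ku + i - j of column j of Ab (0-based), and ldAb must be at
+ * least 2*kl + ku + 1 so that kl extra rows are left for pivoting
+ * fill-in; complex entries are interleaved re/im double pairs:
+ *
+ *     int m = 4, n = 4, kl = 1, ku = 1, info;
+ *     const int ldAb = 2 * kl + ku + 1;   // = 4 packed rows per column
+ *     double Ab[2 * 4 * 4] = { 0. };      // ldAb x n, re/im interleaved
+ *     int ipiv[4];
+ *     RELAPACK_zgbtrf(&m, &n, &kl, &ku, Ab, &ldAb, ipiv, &info);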
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/dc/dcb/zgbtrf_8f.html + * */ +void RELAPACK_zgbtrf( + const int *m, const int *n, const int *kl, const int *ku, + double *Ab, const int *ldAb, int *ipiv, + int *info +) { + + // Check arguments + *info = 0; + if (*m < 0) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*kl < 0) + *info = -3; + else if (*ku < 0) + *info = -4; + else if (*ldAb < 2 * *kl + *ku + 1) + *info = -6; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("ZGBTRF", &minfo); + return; + } + + // Constant + const double ZERO[] = { 0., 0. }; + + // Result upper band width + const int kv = *ku + *kl; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + double *const A = Ab + 2 * kv; + + // Zero upper diagonal fill-in elements + int i, j; + for (j = 0; j < *n; j++) { + double *const A_j = A + 2 * *ldA * j; + for (i = MAX(0, j - kv); i < j - *ku; i++) + A_j[2 * i] = A_j[2 * i + 1] = 0.; + } + + // Allocate work space + const int n1 = ZREC_SPLIT(*n); + const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; + const int nWorkl = (kv > n1) ? n1 : kv; + const int mWorku = (*kl > n1) ? n1 : *kl; + const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; + double *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(double)); + double *Worku = malloc(mWorku * nWorku * 2 * sizeof(double)); + LAPACK(zlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); + LAPACK(zlaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku); + + // Recursive kernel + RELAPACK_zgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info); + + // Free work space + free(Workl); + free(Worku); +} + + +/** zgbtrf's recursive compute kernel */ +static void RELAPACK_zgbtrf_rec( + const int *m, const int *n, const int *kl, const int *ku, + double *Ab, const int *ldAb, int *ipiv, + double *Workl, const int *ldWorkl, double *Worku, const int *ldWorku, + int *info +) { + + if (*n <= MAX(CROSSOVER_ZGBTRF, 1)) { + // Unblocked + LAPACK(zgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); + return; + } + + // Constants + const double ONE[] = { 1., 0. }; + const double MONE[] = { -1., 0. 
}; + const int iONE[] = { 1 }; + + // Loop iterators + int i, j; + + // Output upper band width + const int kv = *ku + *kl; + + // Unskew A + const int ldA[] = { *ldAb - 1 }; + double *const A = Ab + 2 * kv; + + // Splitting + const int n1 = MIN(ZREC_SPLIT(*n), *kl); + const int n2 = *n - n1; + const int m1 = MIN(n1, *m); + const int m2 = *m - m1; + const int mn1 = MIN(m1, n1); + const int mn2 = MIN(m2, n2); + + // Ab_L * + // Ab_BR + double *const Ab_L = Ab; + double *const Ab_BR = Ab + 2 * *ldAb * n1; + + // A_L A_R + double *const A_L = A; + double *const A_R = A + 2 * *ldA * n1; + + // A_TL A_TR + // A_BL A_BR + double *const A_TL = A; + double *const A_TR = A + 2 * *ldA * n1; + double *const A_BL = A + 2 * m1; + double *const A_BR = A + 2 * *ldA * n1 + 2 * m1; + + // ipiv_T + // ipiv_B + int *const ipiv_T = ipiv; + int *const ipiv_B = ipiv + n1; + + // Banded splitting + const int n21 = MIN(n2, kv - n1); + const int n22 = MIN(n2 - n21, n1); + const int m21 = MIN(m2, *kl - m1); + const int m22 = MIN(m2 - m21, m1); + + // n1 n21 n22 + // m * A_Rl ARr + double *const A_Rl = A_R; + double *const A_Rr = A_R + 2 * *ldA * n21; + + // n1 n21 n22 + // m1 * A_TRl A_TRr + // m21 A_BLt A_BRtl A_BRtr + // m22 A_BLb A_BRbl A_BRbr + double *const A_TRl = A_TR; + double *const A_TRr = A_TR + 2 * *ldA * n21; + double *const A_BLt = A_BL; + double *const A_BLb = A_BL + 2 * m21; + double *const A_BRtl = A_BR; + double *const A_BRtr = A_BR + 2 * *ldA * n21; + double *const A_BRbl = A_BR + 2 * m21; + double *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * m21; + + // recursion(Ab_L, ipiv_T) + RELAPACK_zgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); + + // Workl = A_BLb + LAPACK(zlacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl); + + // partially redo swaps in A_L + for (i = 0; i < mn1; i++) { + const int ip = ipiv_T[i] - 1; + if (ip != i) { + if (ip < *kl) + BLAS(zswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); + else + BLAS(zswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl); + } + } + + // apply pivots to A_Rl + LAPACK(zlaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE); + + // apply pivots to A_Rr columnwise + for (j = 0; j < n22; j++) { + double *const A_Rrj = A_Rr + 2 * *ldA * j; + for (i = j; i < mn1; i++) { + const int ip = ipiv_T[i] - 1; + if (ip != i) { + const double tmpr = A_Rrj[2 * i]; + const double tmpc = A_Rrj[2 * i + 1]; + A_Rrj[2 * i] = A_Rrj[2 * ip]; + A_Rrj[2 * i + 1] = A_Rrj[2 * ip + 1]; + A_Rrj[2 * ip] = tmpr; + A_Rrj[2 * ip + 1] = tmpc; + } + } + } + + // A_TRl = A_TL \ A_TRl + BLAS(ztrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA); + // Worku = A_TRr + LAPACK(zlacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku); + // Worku = A_TL \ Worku + BLAS(ztrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku); + // A_TRr = Worku + LAPACK(zlacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA); + // A_BRtl = A_BRtl - A_BLt * A_TRl + BLAS(zgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA); + // A_BRbl = A_BRbl - Workl * A_TRl + BLAS(zgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA); + // A_BRtr = A_BRtr - A_BLt * Worku + BLAS(zgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA); + // A_BRbr = A_BRbr - Workl * Worku + BLAS(zgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA); + + // partially undo swaps in A_L + for (i = mn1 - 1; i >= 0; i--) { + const int ip = ipiv_T[i] - 1; + if (ip != i) { + 
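+            // pivot rows still inside the band are swapped within A_L
+            // itself; rows beyond the band width were stashed in Workl
+            // above, so the swap partner is taken from that buffer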
if (ip < *kl) + BLAS(zswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); + else + BLAS(zswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl); + } + } + + // recursion(Ab_BR, ipiv_B) + RELAPACK_zgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + if (*info) + *info += n1; + // shift pivots + for (i = 0; i < mn2; i++) + ipiv_B[i] += n1; +} diff --git a/relapack/src/zgemmt.c b/relapack/src/zgemmt.c new file mode 100644 index 000000000..aa5930238 --- /dev/null +++ b/relapack/src/zgemmt.c @@ -0,0 +1,167 @@ +#include "relapack.h" + +static void RELAPACK_zgemmt_rec(const char *, const char *, const char *, + const int *, const int *, const double *, const double *, const int *, + const double *, const int *, const double *, double *, const int *); + +static void RELAPACK_zgemmt_rec2(const char *, const char *, const char *, + const int *, const int *, const double *, const double *, const int *, + const double *, const int *, const double *, double *, const int *); + + +/** ZGEMMT computes a matrix-matrix product with general matrices but updates + * only the upper or lower triangular part of the result matrix. + * + * This routine performs the same operation as the BLAS routine + * zgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC) + * but only updates the triangular part of C specified by uplo: + * If (*uplo == 'L'), only the lower triangular part of C is updated, + * otherwise the upper triangular part is updated. + * */ +void RELAPACK_zgemmt( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const double *alpha, const double *A, const int *ldA, + const double *B, const int *ldB, + const double *beta, double *C, const int *ldC +) { + +#if HAVE_XGEMMT + BLAS(zgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); + return; +#else + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + const int notransA = LAPACK(lsame)(transA, "N"); + const int tranA = LAPACK(lsame)(transA, "T"); + const int ctransA = LAPACK(lsame)(transA, "C"); + const int notransB = LAPACK(lsame)(transB, "N"); + const int tranB = LAPACK(lsame)(transB, "T"); + const int ctransB = LAPACK(lsame)(transB, "C"); + int info = 0; + if (!lower && !upper) + info = 1; + else if (!tranA && !ctransA && !notransA) + info = 2; + else if (!tranB && !ctransB && !notransB) + info = 3; + else if (*n < 0) + info = 4; + else if (*k < 0) + info = 5; + else if (*ldA < MAX(1, notransA ? *n : *k)) + info = 8; + else if (*ldB < MAX(1, notransB ? *k : *n)) + info = 10; + else if (*ldC < MAX(1, *n)) + info = 13; + if (info) { + LAPACK(xerbla)("ZGEMMT", &info); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + const char cleantransA = notransA ? 'N' : (tranA ? 'T' : 'C'); + const char cleantransB = notransB ? 'N' : (tranB ? 
'T' : 'C'); + + // Recursive kernel + RELAPACK_zgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); +#endif +} + + +/** zgemmt's recursive compute kernel */ +static void RELAPACK_zgemmt_rec( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const double *alpha, const double *A, const int *ldA, + const double *B, const int *ldB, + const double *beta, double *C, const int *ldC +) { + + if (*n <= MAX(CROSSOVER_ZGEMMT, 1)) { + // Unblocked + RELAPACK_zgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); + return; + } + + // Splitting + const int n1 = ZREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_T + // A_B + const double *const A_T = A; + const double *const A_B = A + 2 * ((*transA == 'N') ? n1 : *ldA * n1); + + // B_L B_R + const double *const B_L = B; + const double *const B_R = B + 2 * ((*transB == 'N') ? *ldB * n1 : n1); + + // C_TL C_TR + // C_BL C_BR + double *const C_TL = C; + double *const C_TR = C + 2 * *ldC * n1; + double *const C_BL = C + 2 * n1; + double *const C_BR = C + 2 * *ldC * n1 + 2 * n1; + + // recursion(C_TL) + RELAPACK_zgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC); + + if (*uplo == 'L') + // C_BL = alpha A_B B_L + beta C_BL + BLAS(zgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC); + else + // C_TR = alpha A_T B_R + beta C_TR + BLAS(zgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC); + + // recursion(C_BR) + RELAPACK_zgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC); +} + + +/** zgemmt's unblocked compute kernel */ +static void RELAPACK_zgemmt_rec2( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const double *alpha, const double *A, const int *ldA, + const double *B, const int *ldB, + const double *beta, double *C, const int *ldC +) { + + const int incB = (*transB == 'N') ? 1 : *ldB; + const int incC = 1; + + int i; + for (i = 0; i < *n; i++) { + // A_0 + // A_i + const double *const A_0 = A; + const double *const A_i = A + 2 * ((*transA == 'N') ? i : *ldA * i); + + // * B_i * + const double *const B_i = B + 2 * ((*transB == 'N') ? *ldB * i : i); + + // * C_0i * + // * C_ii * + double *const C_0i = C + 2 * *ldC * i; + double *const C_ii = C + 2 * *ldC * i + 2 * i; + + if (*uplo == 'L') { + const int nmi = *n - i; + if (*transA == 'N') + BLAS(zgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); + else + BLAS(zgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); + } else { + const int ip1 = i + 1; + if (*transA == 'N') + BLAS(zgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); + else + BLAS(zgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); + } + } +} diff --git a/relapack/src/zgetrf.c b/relapack/src/zgetrf.c new file mode 100644 index 000000000..cf8921e1f --- /dev/null +++ b/relapack/src/zgetrf.c @@ -0,0 +1,117 @@ +#include "relapack.h" + +static void RELAPACK_zgetrf_rec(const int *, const int *, double *, + const int *, int *, int *); + + +/** ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. + * + * This routine is functionally equivalent to LAPACK's zgetrf. 
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/dd/dd1/zgetrf_8f.html
+ * */
+void RELAPACK_zgetrf(
+    const int *m, const int *n,
+    double *A, const int *ldA, int *ipiv,
+    int *info
+) {
+
+    // Check arguments
+    *info = 0;
+    if (*m < 0)
+        *info = -1;
+    else if (*n < 0)
+        *info = -2;
+    else if (*ldA < MAX(1, *m))
+        *info = -4;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("ZGETRF", &minfo);
+        return;
+    }
+
+    const int sn = MIN(*m, *n);
+
+    RELAPACK_zgetrf_rec(m, &sn, A, ldA, ipiv, info);
+
+    // Right remainder
+    if (*m < *n) {
+        // Constants
+        const double ONE[] = { 1., 0. };
+        const int iONE[] = { 1 };
+
+        // Splitting
+        const int rn = *n - *m;
+
+        // A_L A_R
+        const double *const A_L = A;
+        double *const A_R = A + 2 * *ldA * *m;
+
+        // A_R = apply(ipiv, A_R)
+        LAPACK(zlaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE);
+        // A_R = A_L \ A_R
+        BLAS(ztrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA);
+    }
+}
+
+
+/** zgetrf's recursive compute kernel */
+static void RELAPACK_zgetrf_rec(
+    const int *m, const int *n,
+    double *A, const int *ldA, int *ipiv,
+    int *info
+) {
+
+    if (*n <= MAX(CROSSOVER_ZGETRF, 1)) {
+        // Unblocked
+        LAPACK(zgetf2)(m, n, A, ldA, ipiv, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[] = { 1., 0. };
+    const double MONE[] = { -1., 0. };
+    const int iONE[] = { 1 };
+
+    // Splitting
+    const int n1 = ZREC_SPLIT(*n);
+    const int n2 = *n - n1;
+    const int m2 = *m - n1;
+
+    // A_L A_R
+    double *const A_L = A;
+    double *const A_R = A + 2 * *ldA * n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    double *const A_TL = A;
+    double *const A_TR = A + 2 * *ldA * n1;
+    double *const A_BL = A + 2 * n1;
+    double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
+
+    // ipiv_T
+    // ipiv_B
+    int *const ipiv_T = ipiv;
+    int *const ipiv_B = ipiv + n1;
+
+    // recursion(A_L, ipiv_T)
+    RELAPACK_zgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info);
+    // apply pivots to A_R
+    LAPACK(zlaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE);
+
+    // A_TR = A_TL \ A_TR
+    BLAS(ztrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA);
+    // A_BR = A_BR - A_BL * A_TR
+    BLAS(zgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA);
+
+    // recursion(A_BR, ipiv_B)
+    RELAPACK_zgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info);
+    if (*info)
+        *info += n1;
+    // apply pivots to A_BL
+    LAPACK(zlaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE);
+    // shift pivots
+    int i;
+    for (i = 0; i < n2; i++)
+        ipiv_B[i] += n1;
+}
diff --git a/relapack/src/zhegst.c b/relapack/src/zhegst.c
new file mode 100644
index 000000000..d0ece2148
--- /dev/null
+++ b/relapack/src/zhegst.c
@@ -0,0 +1,212 @@
+#include "relapack.h"
+#if XSYGST_ALLOW_MALLOC
+#include "stdlib.h"
+#endif
+
+static void RELAPACK_zhegst_rec(const int *, const char *, const int *,
+    double *, const int *, const double *, const int *,
+    double *, const int *, int *);
+
+
+/** ZHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form.
+ *
+ * This routine is functionally equivalent to LAPACK's zhegst.
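+ *
+ * As in LAPACK, B must contain the Cholesky factor of the Hermitian
+ * positive definite matrix computed by zpotrf, and itype selects the
+ * problem: itype = 1 reduces A*x = lambda*B*x, while itype = 2 or 3
+ * reduces A*B*x = lambda*x or B*A*x = lambda*x. A hedged call sketch
+ * (illustrative values only; A and B are interleaved re/im arrays):
+ *
+ *     int itype = 1, n = 100, ld = 100, info;
+ *     LAPACK(zpotrf)("L", &n, B, &ld, &info);      // factor B first
+ *     RELAPACK_zhegst(&itype, "L", &n, A, &ld, B, &ld, &info);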
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/dc/d68/zhegst_8f.html + * */ +void RELAPACK_zhegst( + const int *itype, const char *uplo, const int *n, + double *A, const int *ldA, const double *B, const int *ldB, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (*itype < 1 || *itype > 3) + *info = -1; + else if (!lower && !upper) + *info = -2; + else if (*n < 0) + *info = -3; + else if (*ldA < MAX(1, *n)) + *info = -5; + else if (*ldB < MAX(1, *n)) + *info = -7; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("ZHEGST", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Allocate work space + double *Work = NULL; + int lWork = 0; +#if XSYGST_ALLOW_MALLOC + const int n1 = ZREC_SPLIT(*n); + lWork = n1 * (*n - n1); + Work = malloc(lWork * 2 * sizeof(double)); + if (!Work) + lWork = 0; +#endif + + // recursive kernel + RELAPACK_zhegst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info); + + // Free work space +#if XSYGST_ALLOW_MALLOC + if (Work) + free(Work); +#endif +} + + +/** zhegst's recursive compute kernel */ +static void RELAPACK_zhegst_rec( + const int *itype, const char *uplo, const int *n, + double *A, const int *ldA, const double *B, const int *ldB, + double *Work, const int *lWork, int *info +) { + + if (*n <= MAX(CROSSOVER_ZHEGST, 1)) { + // Unblocked + LAPACK(zhegs2)(itype, uplo, n, A, ldA, B, ldB, info); + return; + } + + // Constants + const double ZERO[] = { 0., 0. }; + const double ONE[] = { 1., 0. }; + const double MONE[] = { -1., 0. }; + const double HALF[] = { .5, 0. }; + const double MHALF[] = { -.5, 0. }; + const int iONE[] = { 1 }; + + // Loop iterator + int i; + + // Splitting + const int n1 = ZREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + double *const A_TL = A; + double *const A_TR = A + 2 * *ldA * n1; + double *const A_BL = A + 2 * n1; + double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + + // B_TL B_TR + // B_BL B_BR + const double *const B_TL = B; + const double *const B_TR = B + 2 * *ldB * n1; + const double *const B_BL = B + 2 * n1; + const double *const B_BR = B + 2 * *ldB * n1 + 2 * n1; + + // recursion(A_TL, B_TL) + RELAPACK_zhegst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info); + + if (*itype == 1) + if (*uplo == 'L') { + // A_BL = A_BL / B_TL' + BLAS(ztrsm)("R", "L", "C", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA); + if (*lWork >= n2 * n1) { + // T = -1/2 * B_BL * A_TL + BLAS(zhemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2); + // A_BL = A_BL + T + for (i = 0; i < n1; i++) + BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE); + } else + // A_BL = A_BL - 1/2 B_BL * A_TL + BLAS(zhemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA); + // A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL' + BLAS(zher2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA); + if (*lWork >= n2 * n1) + // A_BL = A_BL + T + for (i = 0; i < n1; i++) + BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE); + else + // A_BL = A_BL - 1/2 B_BL * A_TL + BLAS(zhemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA); + // A_BL = B_BR \ A_BL + BLAS(ztrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA); + } else { + // A_TR = B_TL' \ A_TR + BLAS(ztrsm)("L", "U", "C", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA); + if (*lWork >= n2 * n1) 
{ + // T = -1/2 * A_TL * B_TR + BLAS(zhemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1); + // A_TR = A_BL + T + for (i = 0; i < n2; i++) + BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE); + } else + // A_TR = A_TR - 1/2 A_TL * B_TR + BLAS(zhemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA); + // A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR + BLAS(zher2k)("U", "C", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA); + if (*lWork >= n2 * n1) + // A_TR = A_BL + T + for (i = 0; i < n2; i++) + BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE); + else + // A_TR = A_TR - 1/2 A_TL * B_TR + BLAS(zhemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA); + // A_TR = A_TR / B_BR + BLAS(ztrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA); + } + else + if (*uplo == 'L') { + // A_BL = A_BL * B_TL + BLAS(ztrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA); + if (*lWork >= n2 * n1) { + // T = 1/2 * A_BR * B_BL + BLAS(zhemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2); + // A_BL = A_BL + T + for (i = 0; i < n1; i++) + BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE); + } else + // A_BL = A_BL + 1/2 A_BR * B_BL + BLAS(zhemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA); + // A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL + BLAS(zher2k)("L", "C", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA); + if (*lWork >= n2 * n1) + // A_BL = A_BL + T + for (i = 0; i < n1; i++) + BLAS(zaxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE); + else + // A_BL = A_BL + 1/2 A_BR * B_BL + BLAS(zhemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA); + // A_BL = B_BR * A_BL + BLAS(ztrmm)("L", "L", "C", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA); + } else { + // A_TR = B_TL * A_TR + BLAS(ztrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA); + if (*lWork >= n2 * n1) { + // T = 1/2 * B_TR * A_BR + BLAS(zhemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1); + // A_TR = A_TR + T + for (i = 0; i < n2; i++) + BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE); + } else + // A_TR = A_TR + 1/2 B_TR A_BR + BLAS(zhemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA); + // A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR' + BLAS(zher2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA); + if (*lWork >= n2 * n1) + // A_TR = A_TR + T + for (i = 0; i < n2; i++) + BLAS(zaxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE); + else + // A_TR = A_TR + 1/2 B_TR * A_BR + BLAS(zhemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA); + // A_TR = A_TR * B_BR + BLAS(ztrmm)("R", "U", "C", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA); + } + + // recursion(A_BR, B_BR) + RELAPACK_zhegst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info); +} diff --git a/relapack/src/zhetrf.c b/relapack/src/zhetrf.c new file mode 100644 index 000000000..ef4e1f5d5 --- /dev/null +++ b/relapack/src/zhetrf.c @@ -0,0 +1,236 @@ +#include "relapack.h" +#if XSYTRF_ALLOW_MALLOC +#include <stdlib.h> +#endif + +static void RELAPACK_zhetrf_rec(const char *, const int *, const int *, int *, + double *, const int *, int *, double *, const int *, int *); + + +/** ZHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method. + * + * This routine is functionally equivalent to LAPACK's zhetrf. 
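+ *
+ * A hedged usage sketch (illustrative values, not from the original
+ * source): as in LAPACK, lWork = -1 performs a workspace query and the
+ * required size, counted in complex elements, is returned in Work[0]:
+ *
+ *     int n = 100, ldA = 100, info;
+ *     int ipiv[100];
+ *     double query[2];                    // room for one complex element
+ *     int lWork = -1;
+ *     RELAPACK_zhetrf("L", &n, A, &ldA, ipiv, query, &lWork, &info);
+ *     lWork = (int) query[0];
+ *     double *Work = malloc(2 * lWork * sizeof(double));
+ *     RELAPACK_zhetrf("L", &n, A, &ldA, ipiv, Work, &lWork, &info);
+ *
+ * In ipiv, a positive entry marks a 1-by-1 pivot block and two equal
+ * negative entries mark a 2-by-2 block, following LAPACK's zhetrf.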
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d6/dd3/zhetrf_8f.html + * */ +void RELAPACK_zhetrf( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + double *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * 2 * sizeof(double)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("ZHETRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy argument + int nout; + + // Recursive kernel + RELAPACK_zhetrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** zhetrf's recursive compute kernel */ +static void RELAPACK_zhetrf_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + double *A, const int *ldA, int *ipiv, + double *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_ZHETRF, 3)) { + // Unblocked + if (top) { + LAPACK(zhetf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_zhetrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const double ONE[] = { 1., 0. }; + const double MONE[] = { -1., 0. }; + const int iONE[] = { 1 }; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = ZREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + double *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_zhetrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + double *const A_BL = A + 2 * n1; + double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + double *const A_BL_B = A + 2 * *n; + double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + double *const Work_BL = Work + 2 * n1; + double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; + const int ldWork_BR = top ? 
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_zhetrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; + + // last row of A_BL + double *const A_BL_b = A_BL + 2 * n2_out; + + // last row of Work_BL + double *const Work_BL_b = Work_BL + 2 * n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + int i; + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = ZREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_zhetrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + double *const A_TL_T = A + 2 * *ldA * n_rest; + double *const A_TR_T = A + 2 * *ldA * (n_rest + n1); + double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; + double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + double *const Work_L = Work; + double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_zhetrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/zhetrf_rec2.c b/relapack/src/zhetrf_rec2.c new file mode 100644 index 000000000..867ea64e1 --- /dev/null +++ b/relapack/src/zhetrf_rec2.c @@ -0,0 +1,524 @@ +/* -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static doublecomplex c_b1 = {1.,0.};
+static int c__1 = 1;
+
+/** ZHETRF_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the Bunch-Kaufman diagonal pivoting method
+ *
+ * This routine is a minor modification of LAPACK's zlahef.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
+ * */
+/* Subroutine */ void RELAPACK_zhetrf_rec2(char *uplo, int *n, int *
+ nb, int *kb, doublecomplex *a, int *lda, int *ipiv,
+ doublecomplex *w, int *ldw, int *info, ftnlen uplo_len)
+{
+ /* System generated locals */
+ int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
+ double d__1, d__2, d__3, d__4;
+ doublecomplex z__1, z__2, z__3, z__4;
+
+ /* Builtin functions */
+ double sqrt(double), d_imag(doublecomplex *);
+ void d_cnjg(doublecomplex *, doublecomplex *), z_div(doublecomplex *,
+ doublecomplex *, doublecomplex *);
+
+ /* Local variables */
+ static int j, k;
+ static double t, r1;
+ static doublecomplex d11, d21, d22;
+ static int jj, kk, jp, kp, kw, kkw, imax, jmax;
+ static double alpha;
+ extern logical lsame_(char *, char *, ftnlen, ftnlen);
+ static int kstep;
+ extern /* Subroutine */ int zgemv_(char *, int *, int *,
+ doublecomplex *, doublecomplex *, int *, doublecomplex *,
+ int *, doublecomplex *, doublecomplex *, int *, ftnlen),
+ zcopy_(int *, doublecomplex *, int *, doublecomplex *,
+ int *), zswap_(int *, doublecomplex *, int *,
+ doublecomplex *, int *);
+ static double absakk;
+ extern /* Subroutine */ int zdscal_(int *, double *,
+ doublecomplex *, int *);
+ static double colmax;
+ extern /* Subroutine */ int zlacgv_(int *, doublecomplex *, int *)
+ ;
+ extern int izamax_(int *, doublecomplex *, int *);
+ static double rowmax;
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+ w_dim1 = *ldw;
+ w_offset = 1 + w_dim1;
+ w -= w_offset;
+
+ /* Function Body */
+ *info = 0;
+ alpha = (sqrt(17.) + 1.) / 8.;
+ if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+ k = *n;
+L10:
+ kw = *nb + k - *n;
+ if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
+ goto L30;
+ }
+ kstep = 1;
+ i__1 = k - 1;
+ zcopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
+ i__1 = k + kw * w_dim1;
+ i__2 = k + k * a_dim1;
+ d__1 = a[i__2].r;
+ w[i__1].r = d__1, w[i__1].i = 0.;
+ if (k < *n) {
+ i__1 = *n - k;
+ z__1.r = -1., z__1.i = -0.;
+ zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1],
+ lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
+ w_dim1 + 1], &c__1, (ftnlen)12);
+ i__1 = k + kw * w_dim1;
+ i__2 = k + kw * w_dim1;
+ d__1 = w[i__2].r;
+ w[i__1].r = d__1, w[i__1].i = 0.;
+ }
+ i__1 = k + kw * w_dim1;
+ absakk = (d__1 = w[i__1].r, abs(d__1));
+ if (k > 1) {
+ i__1 = k - 1;
+ imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+ i__1 = imax + kw * w_dim1;
+ colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
+ kw * w_dim1]), abs(d__2));
+ } else {
+ colmax = 0.;
+ }
+ if (max(absakk,colmax) == 0.)
{ + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = k + k * a_dim1; + i__2 = k + k * a_dim1; + d__1 = a[i__2].r; + a[i__1].r = d__1, a[i__1].i = 0.; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + i__1 = imax - 1; + zcopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + i__1 = imax + (kw - 1) * w_dim1; + i__2 = imax + imax * a_dim1; + d__1 = a[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + i__1 = k - imax; + zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + i__1 = k - imax; + zlacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + i__1 = imax + (kw - 1) * w_dim1; + i__2 = imax + (kw - 1) * w_dim1; + d__1 = w[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + } + i__1 = k - imax; + jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], + &c__1); + i__1 = jmax + (kw - 1) * w_dim1; + rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ + jmax + (kw - 1) * w_dim1]), abs(d__2)); + if (imax > 1) { + i__1 = imax - 1; + jmax = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); +/* Computing MAX */ + i__1 = jmax + (kw - 1) * w_dim1; + d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + ( + d__2 = d_imag(&w[jmax + (kw - 1) * w_dim1]), abs( + d__2)); + rowmax = max(d__3,d__4); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else /* if(complicated condition) */ { + i__1 = imax + (kw - 1) * w_dim1; + if ((d__1 = w[i__1].r, abs(d__1)) >= alpha * rowmax) { + kp = imax; + zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + d__1 = a[i__2].r; + a[i__1].r = d__1, a[i__1].i = 0.; + i__1 = kk - 1 - kp; + zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + i__1 = kk - 1 - kp; + zlacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda); + if (kp > 1) { + i__1 = kp - 1; + zcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + + 1], &c__1); + } + if (k < *n) { + i__1 = *n - k; + zswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k + + 1) * a_dim1], lda); + } + i__1 = *n - kk + 1; + zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + if (k > 1) { + i__1 = k + k * a_dim1; + r1 = 1. / a[i__1].r; + i__1 = k - 1; + zdscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + i__1 = k - 1; + zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); + } + } else { + if (k > 2) { + i__1 = k - 1 + kw * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + d_cnjg(&z__2, &d21); + z_div(&z__1, &w[k + kw * w_dim1], &z__2); + d11.r = z__1.r, d11.i = z__1.i; + z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); + d22.r = z__1.r, d22.i = z__1.i; + z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r * + d22.i + d11.i * d22.r; + t = 1. 
/ (z__1.r - 1.); + z__2.r = t, z__2.i = 0.; + z_div(&z__1, &z__2, &d21); + d21.r = z__1.r, d21.i = z__1.i; + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + i__2 = j + (k - 1) * a_dim1; + i__3 = j + (kw - 1) * w_dim1; + z__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + z__3.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + kw * w_dim1; + z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4] + .i; + z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i = + d21.r * z__2.i + d21.i * z__2.r; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; + i__2 = j + k * a_dim1; + d_cnjg(&z__2, &d21); + i__3 = j + kw * w_dim1; + z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + z__4.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + (kw - 1) * w_dim1; + z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] + .i; + z__1.r = z__2.r * z__3.r - z__2.i * z__3.i, z__1.i = + z__2.r * z__3.i + z__2.i * z__3.r; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; +/* L20: */ + } + } + i__1 = k - 1 + (k - 1) * a_dim1; + i__2 = k - 1 + (kw - 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1 + k * a_dim1; + i__2 = k - 1 + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + k * a_dim1; + i__2 = k + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1; + zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); + i__1 = k - 2; + zlacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -kp; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + jj = j; + jp = ipiv[j]; + if (jp < 0) { + jp = -jp; + ++j; + } + ++j; + if (jp != jj && j <= *n) { + i__1 = *n - j + 1; + zswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda); + } + if (j < *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + kstep = 1; + i__1 = k + k * w_dim1; + i__2 = k + k * a_dim1; + d__1 = a[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + if (k < *n) { + i__1 = *n - k; + zcopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k * + w_dim1], &c__1); + } + i__1 = *n - k + 1; + i__2 = k - 1; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, &w[k + + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12); + i__1 = k + k * w_dim1; + i__2 = k + k * w_dim1; + d__1 = w[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + i__1 = k + k * w_dim1; + absakk = (d__1 = w[i__1].r, abs(d__1)); + if (k < *n) { + i__1 = *n - k; + imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = imax + k * w_dim1; + colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + + k * w_dim1]), abs(d__2)); + } else { + colmax = 0.; + } + if (max(absakk,colmax) == 0.) 
{ + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = k + k * a_dim1; + i__2 = k + k * a_dim1; + d__1 = a[i__2].r; + a[i__1].r = d__1, a[i__1].i = 0.; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + i__1 = imax - k; + zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = imax - k; + zlacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1); + i__1 = imax + (k + 1) * w_dim1; + i__2 = imax + imax * a_dim1; + d__1 = a[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + if (imax < *n) { + i__1 = *n - imax; + zcopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[ + imax + 1 + (k + 1) * w_dim1], &c__1); + } + i__1 = *n - k + 1; + i__2 = k - 1; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], + lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) * + w_dim1], &c__1, (ftnlen)12); + i__1 = imax + (k + 1) * w_dim1; + i__2 = imax + (k + 1) * w_dim1; + d__1 = w[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + i__1 = imax - k; + jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1) + ; + i__1 = jmax + (k + 1) * w_dim1; + rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ + jmax + (k + 1) * w_dim1]), abs(d__2)); + if (imax < *n) { + i__1 = *n - imax; + jmax = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); +/* Computing MAX */ + i__1 = jmax + (k + 1) * w_dim1; + d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + ( + d__2 = d_imag(&w[jmax + (k + 1) * w_dim1]), abs( + d__2)); + rowmax = max(d__3,d__4); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else /* if(complicated condition) */ { + i__1 = imax + (k + 1) * w_dim1; + if ((d__1 = w[i__1].r, abs(d__1)) >= alpha * rowmax) { + kp = imax; + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + + k * w_dim1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + } + kk = k + kstep - 1; + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + d__1 = a[i__2].r; + a[i__1].r = d__1, a[i__1].i = 0.; + i__1 = kp - kk - 1; + zcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + + 1) * a_dim1], lda); + i__1 = kp - kk - 1; + zlacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda); + if (kp < *n) { + i__1 = *n - kp; + zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 + + kp * a_dim1], &c__1); + } + if (k > 1) { + i__1 = k - 1; + zswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); + } + zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); + } + if (kstep == 1) { + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + if (k < *n) { + i__1 = k + k * a_dim1; + r1 = 1. / a[i__1].r; + i__1 = *n - k; + zdscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); + i__1 = *n - k; + zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + } + } else { + if (k < *n - 1) { + i__1 = k + 1 + k * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21); + d11.r = z__1.r, d11.i = z__1.i; + d_cnjg(&z__2, &d21); + z_div(&z__1, &w[k + k * w_dim1], &z__2); + d22.r = z__1.r, d22.i = z__1.i; + z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r * + d22.i + d11.i * d22.r; + t = 1. 
/ (z__1.r - 1.); + z__2.r = t, z__2.i = 0.; + z_div(&z__1, &z__2, &d21); + d21.r = z__1.r, d21.i = z__1.i; + i__1 = *n; + for (j = k + 2; j <= i__1; ++j) { + i__2 = j + k * a_dim1; + d_cnjg(&z__2, &d21); + i__3 = j + k * w_dim1; + z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + z__4.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + (k + 1) * w_dim1; + z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] + .i; + z__1.r = z__2.r * z__3.r - z__2.i * z__3.i, z__1.i = + z__2.r * z__3.i + z__2.i * z__3.r; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; + i__2 = j + (k + 1) * a_dim1; + i__3 = j + (k + 1) * w_dim1; + z__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + z__3.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + k * w_dim1; + z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4] + .i; + z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i = + d21.r * z__2.i + d21.i * z__2.r; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; +/* L80: */ + } + } + i__1 = k + k * a_dim1; + i__2 = k + k * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + 1 + k * a_dim1; + i__2 = k + 1 + k * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + 1 + (k + 1) * a_dim1; + i__2 = k + 1 + (k + 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = *n - k; + zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = *n - k - 1; + zlacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1); + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -kp; + ipiv[k + 1] = -kp; + } + k += kstep; + goto L70; +L90: + j = k - 1; +L120: + jj = j; + jp = ipiv[j]; + if (jp < 0) { + jp = -jp; + --j; + } + --j; + if (jp != jj && j >= 1) { + zswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda); + } + if (j > 1) { + goto L120; + } + *kb = k - 1; + } + return; +} diff --git a/relapack/src/zhetrf_rook.c b/relapack/src/zhetrf_rook.c new file mode 100644 index 000000000..15ceaeae7 --- /dev/null +++ b/relapack/src/zhetrf_rook.c @@ -0,0 +1,236 @@ +#include "relapack.h" +#if XSYTRF_ALLOW_MALLOC +#include <stdlib.h> +#endif + +static void RELAPACK_zhetrf_rook_rec(const char *, const int *, const int *, int *, + double *, const int *, int *, double *, const int *, int *); + + +/** ZHETRF_ROOK computes the factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. + * + * This routine is functionally equivalent to LAPACK's zhetrf_rook. 
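+ *
+ * The interface and workspace conventions are the same as for
+ * RELAPACK_zhetrf above: lWork >= n * (n / 2) complex workspace elements
+ * are required (unless XSYTRF_ALLOW_MALLOC lets the routine allocate its
+ * own buffer), and lWork == -1 performs a workspace-size query. The
+ * difference is the bounded ("rook") pivot search in the unblocked
+ * kernel, which bounds element growth in the factor at the cost of a
+ * potentially longer pivot search.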
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d6/d6f/zhetrf__rook_8f.html + * */ +void RELAPACK_zhetrf_rook( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + double *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * 2 * sizeof(double)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("ZHETRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy argument + int nout; + + // Recursive kernel + RELAPACK_zhetrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** zhetrf_rook's recursive compute kernel */ +static void RELAPACK_zhetrf_rook_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + double *A, const int *ldA, int *ipiv, + double *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_ZHETRF_ROOK, 3)) { + // Unblocked + if (top) { + LAPACK(zhetf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_zhetrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const double ONE[] = { 1., 0. }; + const double MONE[] = { -1., 0. }; + const int iONE[] = { 1 }; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = ZREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + double *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_zhetrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + double *const A_BL = A + 2 * n1; + double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + double *const A_BL_B = A + 2 * *n; + double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + double *const Work_BL = Work + 2 * n1; + double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; + const int ldWork_BR = top ? 
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_zhetrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; + + // last row of A_BL + double *const A_BL_b = A_BL + 2 * n2_out; + + // last row of Work_BL + double *const Work_BL_b = Work_BL + 2 * n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + int i; + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = ZREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_zhetrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + double *const A_TL_T = A + 2 * *ldA * n_rest; + double *const A_TR_T = A + 2 * *ldA * (n_rest + n1); + double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; + double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + double *const Work_L = Work; + double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_zhetrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/zhetrf_rook_rec2.c b/relapack/src/zhetrf_rook_rec2.c new file mode 100644 index 000000000..a56ad710b --- /dev/null +++ b/relapack/src/zhetrf_rook_rec2.c @@ -0,0 +1,662 @@ +/* -- translated by f2c (version 20100827). 
+ You must link the resulting object file with libf2c:
+ on Microsoft Windows system, link with libf2c.lib;
+ on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+ or, if you install libf2c.a in a standard place, with -lf2c -lm
+ -- in that order, at the end of the command line, as in
+ cc *.o -lf2c -lm
+ Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+ http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static doublecomplex c_b1 = {1.,0.};
+static int c__1 = 1;
+
+/** ZHETRF_ROOK_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method
+ *
+ * This routine is a minor modification of LAPACK's zlahef_rook.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
+ * */
+/* Subroutine */ void RELAPACK_zhetrf_rook_rec2(char *uplo, int *n,
+ int *nb, int *kb, doublecomplex *a, int *lda, int *
+ ipiv, doublecomplex *w, int *ldw, int *info, ftnlen uplo_len)
+{
+ /* System generated locals */
+ int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
+ double d__1, d__2;
+ doublecomplex z__1, z__2, z__3, z__4, z__5;
+
+ /* Builtin functions */
+ double sqrt(double), d_imag(doublecomplex *);
+ void d_cnjg(doublecomplex *, doublecomplex *), z_div(doublecomplex *,
+ doublecomplex *, doublecomplex *);
+
+ /* Local variables */
+ static int j, k, p;
+ static double t, r1;
+ static doublecomplex d11, d21, d22;
+ static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
+ static logical done;
+ static int imax, jmax;
+ static double alpha;
+ extern logical lsame_(char *, char *, ftnlen, ftnlen);
+ static double dtemp, sfmin;
+ static int itemp, kstep;
+ extern /* Subroutine */ int zgemv_(char *, int *, int *,
+ doublecomplex *, doublecomplex *, int *, doublecomplex *,
+ int *, doublecomplex *, doublecomplex *, int *, ftnlen),
+ zcopy_(int *, doublecomplex *, int *, doublecomplex *,
+ int *), zswap_(int *, doublecomplex *, int *,
+ doublecomplex *, int *);
+ extern double dlamch_(char *, ftnlen);
+ static double absakk;
+ extern /* Subroutine */ int zdscal_(int *, double *,
+ doublecomplex *, int *);
+ static double colmax;
+ extern /* Subroutine */ int zlacgv_(int *, doublecomplex *, int *)
+ ;
+ extern int izamax_(int *, doublecomplex *, int *);
+ static double rowmax;
+
+ /* Parameter adjustments */
+ a_dim1 = *lda;
+ a_offset = 1 + a_dim1;
+ a -= a_offset;
+ --ipiv;
+ w_dim1 = *ldw;
+ w_offset = 1 + w_dim1;
+ w -= w_offset;
+
+ /* Function Body */
+ *info = 0;
+ alpha = (sqrt(17.) + 1.)
/ 8.; + sfmin = dlamch_("S", (ftnlen)1); + if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { + k = *n; +L10: + kw = *nb + k - *n; + if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { + goto L30; + } + kstep = 1; + p = k; + if (k > 1) { + i__1 = k - 1; + zcopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], & + c__1); + } + i__1 = k + kw * w_dim1; + i__2 = k + k * a_dim1; + d__1 = a[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + if (k < *n) { + i__1 = *n - k; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1], + lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw * + w_dim1 + 1], &c__1, (ftnlen)12); + i__1 = k + kw * w_dim1; + i__2 = k + kw * w_dim1; + d__1 = w[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + } + i__1 = k + kw * w_dim1; + absakk = (d__1 = w[i__1].r, abs(d__1)); + if (k > 1) { + i__1 = k - 1; + imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); + i__1 = imax + kw * w_dim1; + colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + + kw * w_dim1]), abs(d__2)); + } else { + colmax = 0.; + } + if (max(absakk,colmax) == 0.) { + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = k + k * a_dim1; + i__2 = k + kw * w_dim1; + d__1 = w[i__2].r; + a[i__1].r = d__1, a[i__1].i = 0.; + if (k > 1) { + i__1 = k - 1; + zcopy_(&i__1, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], + &c__1); + } + } else { + if (! (absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L12: + if (imax > 1) { + i__1 = imax - 1; + zcopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + } + i__1 = imax + (kw - 1) * w_dim1; + i__2 = imax + imax * a_dim1; + d__1 = a[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + i__1 = k - imax; + zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + i__1 = k - imax; + zlacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + i__1 = imax + (kw - 1) * w_dim1; + i__2 = imax + (kw - 1) * w_dim1; + d__1 = w[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + } + if (imax != k) { + i__1 = k - imax; + jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) * + w_dim1], &c__1); + i__1 = jmax + (kw - 1) * w_dim1; + rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(& + w[jmax + (kw - 1) * w_dim1]), abs(d__2)); + } else { + rowmax = 0.; + } + if (imax > 1) { + i__1 = imax - 1; + itemp = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); + i__1 = itemp + (kw - 1) * w_dim1; + dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ + itemp + (kw - 1) * w_dim1]), abs(d__2)); + if (dtemp > rowmax) { + rowmax = dtemp; + jmax = itemp; + } + } + i__1 = imax + (kw - 1) * w_dim1; + if (! ((d__1 = w[i__1].r, abs(d__1)) < alpha * rowmax)) { + kp = imax; + zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } + if (! 
done) { + goto L12; + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kstep == 2 && p != k) { + i__1 = p + p * a_dim1; + i__2 = k + k * a_dim1; + d__1 = a[i__2].r; + a[i__1].r = d__1, a[i__1].i = 0.; + i__1 = k - 1 - p; + zcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * + a_dim1], lda); + i__1 = k - 1 - p; + zlacgv_(&i__1, &a[p + (p + 1) * a_dim1], lda); + if (p > 1) { + i__1 = p - 1; + zcopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + + 1], &c__1); + } + if (k < *n) { + i__1 = *n - k; + zswap_(&i__1, &a[k + (k + 1) * a_dim1], lda, &a[p + (k + + 1) * a_dim1], lda); + } + i__1 = *n - kk + 1; + zswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], + ldw); + } + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + d__1 = a[i__2].r; + a[i__1].r = d__1, a[i__1].i = 0.; + i__1 = kk - 1 - kp; + zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + i__1 = kk - 1 - kp; + zlacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda); + if (kp > 1) { + i__1 = kp - 1; + zcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + + 1], &c__1); + } + if (k < *n) { + i__1 = *n - k; + zswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k + + 1) * a_dim1], lda); + } + i__1 = *n - kk + 1; + zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + if (k > 1) { + i__1 = k + k * a_dim1; + t = a[i__1].r; + if (abs(t) >= sfmin) { + r1 = 1. / t; + i__1 = k - 1; + zdscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + } else { + i__1 = k - 1; + for (ii = 1; ii <= i__1; ++ii) { + i__2 = ii + k * a_dim1; + i__3 = ii + k * a_dim1; + z__1.r = a[i__3].r / t, z__1.i = a[i__3].i / t; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; +/* L14: */ + } + } + i__1 = k - 1; + zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); + } + } else { + if (k > 2) { + i__1 = k - 1 + kw * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + d_cnjg(&z__2, &d21); + z_div(&z__1, &w[k + kw * w_dim1], &z__2); + d11.r = z__1.r, d11.i = z__1.i; + z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); + d22.r = z__1.r, d22.i = z__1.i; + z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r * + d22.i + d11.i * d22.r; + t = 1. 
/ (z__1.r - 1.); + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + i__2 = j + (k - 1) * a_dim1; + i__3 = j + (kw - 1) * w_dim1; + z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + z__4.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + kw * w_dim1; + z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] + .i; + z_div(&z__2, &z__3, &d21); + z__1.r = t * z__2.r, z__1.i = t * z__2.i; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; + i__2 = j + k * a_dim1; + i__3 = j + kw * w_dim1; + z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + z__4.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + (kw - 1) * w_dim1; + z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] + .i; + d_cnjg(&z__5, &d21); + z_div(&z__2, &z__3, &z__5); + z__1.r = t * z__2.r, z__1.i = t * z__2.i; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; +/* L20: */ + } + } + i__1 = k - 1 + (k - 1) * a_dim1; + i__2 = k - 1 + (kw - 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1 + k * a_dim1; + i__2 = k - 1 + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + k * a_dim1; + i__2 = k + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1; + zlacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); + i__1 = k - 2; + zlacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -p; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + kstep = 1; + jp1 = 1; + jj = j; + jp2 = ipiv[j]; + if (jp2 < 0) { + jp2 = -jp2; + ++j; + jp1 = -ipiv[j]; + kstep = 2; + } + ++j; + if (jp2 != jj && j <= *n) { + i__1 = *n - j + 1; + zswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + ++jj; + if (kstep == 2 && jp1 != jj && j <= *n) { + i__1 = *n - j + 1; + zswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + if (j < *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + kstep = 1; + p = k; + i__1 = k + k * w_dim1; + i__2 = k + k * a_dim1; + d__1 = a[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + if (k < *n) { + i__1 = *n - k; + zcopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k * + w_dim1], &c__1); + } + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, & + w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, ( + ftnlen)12); + i__1 = k + k * w_dim1; + i__2 = k + k * w_dim1; + d__1 = w[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + } + i__1 = k + k * w_dim1; + absakk = (d__1 = w[i__1].r, abs(d__1)); + if (k < *n) { + i__1 = *n - k; + imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = imax + k * w_dim1; + colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + + k * w_dim1]), abs(d__2)); + } else { + colmax = 0.; + } + if (max(absakk,colmax) == 0.) { + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = k + k * a_dim1; + i__2 = k + k * w_dim1; + d__1 = w[i__2].r; + a[i__1].r = d__1, a[i__1].i = 0.; + if (k < *n) { + i__1 = *n - k; + zcopy_(&i__1, &w[k + 1 + k * w_dim1], &c__1, &a[k + 1 + k * + a_dim1], &c__1); + } + } else { + if (! 
(absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L72: + i__1 = imax - k; + zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = imax - k; + zlacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1); + i__1 = imax + (k + 1) * w_dim1; + i__2 = imax + imax * a_dim1; + d__1 = a[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + if (imax < *n) { + i__1 = *n - imax; + zcopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[ + imax + 1 + (k + 1) * w_dim1], &c__1); + } + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1] + , lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + + 1) * w_dim1], &c__1, (ftnlen)12); + i__1 = imax + (k + 1) * w_dim1; + i__2 = imax + (k + 1) * w_dim1; + d__1 = w[i__2].r; + w[i__1].r = d__1, w[i__1].i = 0.; + } + if (imax != k) { + i__1 = imax - k; + jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], & + c__1); + i__1 = jmax + (k + 1) * w_dim1; + rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(& + w[jmax + (k + 1) * w_dim1]), abs(d__2)); + } else { + rowmax = 0.; + } + if (imax < *n) { + i__1 = *n - imax; + itemp = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); + i__1 = itemp + (k + 1) * w_dim1; + dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ + itemp + (k + 1) * w_dim1]), abs(d__2)); + if (dtemp > rowmax) { + rowmax = dtemp; + jmax = itemp; + } + } + i__1 = imax + (k + 1) * w_dim1; + if (! ((d__1 = w[i__1].r, abs(d__1)) < alpha * rowmax)) { + kp = imax; + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + } + if (! done) { + goto L72; + } + } + kk = k + kstep - 1; + if (kstep == 2 && p != k) { + i__1 = p + p * a_dim1; + i__2 = k + k * a_dim1; + d__1 = a[i__2].r; + a[i__1].r = d__1, a[i__1].i = 0.; + i__1 = p - k - 1; + zcopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &a[p + (k + 1) * + a_dim1], lda); + i__1 = p - k - 1; + zlacgv_(&i__1, &a[p + (k + 1) * a_dim1], lda); + if (p < *n) { + i__1 = *n - p; + zcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + 1 + p + * a_dim1], &c__1); + } + if (k > 1) { + i__1 = k - 1; + zswap_(&i__1, &a[k + a_dim1], lda, &a[p + a_dim1], lda); + } + zswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); + } + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + d__1 = a[i__2].r; + a[i__1].r = d__1, a[i__1].i = 0.; + i__1 = kp - kk - 1; + zcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + + 1) * a_dim1], lda); + i__1 = kp - kk - 1; + zlacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda); + if (kp < *n) { + i__1 = *n - kp; + zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 + + kp * a_dim1], &c__1); + } + if (k > 1) { + i__1 = k - 1; + zswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); + } + zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); + } + if (kstep == 1) { + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + if (k < *n) { + i__1 = k + k * a_dim1; + t = a[i__1].r; + if (abs(t) >= sfmin) { + r1 = 1. 
/ t; + i__1 = *n - k; + zdscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); + } else { + i__1 = *n; + for (ii = k + 1; ii <= i__1; ++ii) { + i__2 = ii + k * a_dim1; + i__3 = ii + k * a_dim1; + z__1.r = a[i__3].r / t, z__1.i = a[i__3].i / t; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; +/* L74: */ + } + } + i__1 = *n - k; + zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + } + } else { + if (k < *n - 1) { + i__1 = k + 1 + k * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21); + d11.r = z__1.r, d11.i = z__1.i; + d_cnjg(&z__2, &d21); + z_div(&z__1, &w[k + k * w_dim1], &z__2); + d22.r = z__1.r, d22.i = z__1.i; + z__1.r = d11.r * d22.r - d11.i * d22.i, z__1.i = d11.r * + d22.i + d11.i * d22.r; + t = 1. / (z__1.r - 1.); + i__1 = *n; + for (j = k + 2; j <= i__1; ++j) { + i__2 = j + k * a_dim1; + i__3 = j + k * w_dim1; + z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + z__4.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + (k + 1) * w_dim1; + z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] + .i; + d_cnjg(&z__5, &d21); + z_div(&z__2, &z__3, &z__5); + z__1.r = t * z__2.r, z__1.i = t * z__2.i; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; + i__2 = j + (k + 1) * a_dim1; + i__3 = j + (k + 1) * w_dim1; + z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + z__4.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + k * w_dim1; + z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] + .i; + z_div(&z__2, &z__3, &d21); + z__1.r = t * z__2.r, z__1.i = t * z__2.i; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; +/* L80: */ + } + } + i__1 = k + k * a_dim1; + i__2 = k + k * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + 1 + k * a_dim1; + i__2 = k + 1 + k * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + 1 + (k + 1) * a_dim1; + i__2 = k + 1 + (k + 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = *n - k; + zlacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = *n - k - 1; + zlacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1); + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -p; + ipiv[k + 1] = -kp; + } + k += kstep; + goto L70; +L90: + j = k - 1; +L120: + kstep = 1; + jp1 = 1; + jj = j; + jp2 = ipiv[j]; + if (jp2 < 0) { + jp2 = -jp2; + --j; + jp1 = -ipiv[j]; + kstep = 2; + } + --j; + if (jp2 != jj && j >= 1) { + zswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda); + } + --jj; + if (kstep == 2 && jp1 != jj && j >= 1) { + zswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda); + } + if (j > 1) { + goto L120; + } + *kb = k - 1; + } + return; +} diff --git a/relapack/src/zlauum.c b/relapack/src/zlauum.c new file mode 100644 index 000000000..490dcc82e --- /dev/null +++ b/relapack/src/zlauum.c @@ -0,0 +1,87 @@ +#include "relapack.h" + +static void RELAPACK_zlauum_rec(const char *, const int *, double *, + const int *, int *); + + +/** ZLAUUM computes the product U * U**H or L**H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. + * + * This routine is functionally equivalent to LAPACK's zlauum. 
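+ *
+ * Sketch of the recursion below for uplo = 'L' (block names as in the
+ * kernel): writing the triangular factor as L = [ L_TL 0 ; L_BL L_BR ],
+ * the product L' * L (with ' the conjugate transpose) has the blocks
+ *   result_TL = L_TL' * L_TL + L_BL' * L_BL   (recursion, then zherk),
+ *   result_BL = L_BR' * L_BL                  (ztrmm),
+ *   result_BR = L_BR' * L_BR                  (second recursion);
+ * the uplo = 'U' case computes U * U' as the mirror image.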
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d8/d45/zlauum_8f.html + * */ +void RELAPACK_zlauum( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("ZLAUUM", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Recursive kernel + RELAPACK_zlauum_rec(&cleanuplo, n, A, ldA, info); +} + + +/** zlauum's recursive compute kernel */ +static void RELAPACK_zlauum_rec( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + + if (*n <= MAX(CROSSOVER_ZLAUUM, 1)) { + // Unblocked + LAPACK(zlauu2)(uplo, n, A, ldA, info); + return; + } + + // Constants + const double ONE[] = { 1., 0. }; + + // Splitting + const int n1 = ZREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + double *const A_TL = A; + double *const A_TR = A + 2 * *ldA * n1; + double *const A_BL = A + 2 * n1; + double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + + // recursion(A_TL) + RELAPACK_zlauum_rec(uplo, &n1, A_TL, ldA, info); + + if (*uplo == 'L') { + // A_TL = A_TL + A_BL' * A_BL + BLAS(zherk)("L", "C", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA); + // A_BL = A_BR' * A_BL + BLAS(ztrmm)("L", "L", "C", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); + } else { + // A_TL = A_TL + A_TR * A_TR' + BLAS(zherk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA); + // A_TR = A_TR * A_BR' + BLAS(ztrmm)("R", "U", "C", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); + } + + // recursion(A_BR) + RELAPACK_zlauum_rec(uplo, &n2, A_BR, ldA, info); +} diff --git a/relapack/src/zpbtrf.c b/relapack/src/zpbtrf.c new file mode 100644 index 000000000..37e711c9d --- /dev/null +++ b/relapack/src/zpbtrf.c @@ -0,0 +1,157 @@ +#include "relapack.h" +#include "stdlib.h" + +static void RELAPACK_zpbtrf_rec(const char *, const int *, const int *, + double *, const int *, double *, const int *, int *); + + +/** ZPBTRF computes the Cholesky factorization of a complex Hermitian positive definite band matrix A. + * + * This routine is functionally equivalent to LAPACK's zpbtrf. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/db/da9/zpbtrf_8f.html + * */ +void RELAPACK_zpbtrf( + const char *uplo, const int *n, const int *kd, + double *Ab, const int *ldAb, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*kd < 0) + *info = -3; + else if (*ldAb < *kd + 1) + *info = -5; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("ZPBTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Constant + const double ZERO[] = { 0., 0. }; + + // Allocate work space + const int n1 = ZREC_SPLIT(*n); + const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; + const int nWork = (*kd > n1) ? (lower ? 
n1 : *n - *kd) : *kd;
+ double *Work = malloc(mWork * nWork * 2 * sizeof(double));
+ LAPACK(zlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork);
+
+ // Recursive kernel
+ RELAPACK_zpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info);
+
+ // Free work space
+ free(Work);
+}
+
+
+/** zpbtrf's recursive compute kernel */
+static void RELAPACK_zpbtrf_rec(
+ const char *uplo, const int *n, const int *kd,
+ double *Ab, const int *ldAb,
+ double *Work, const int *ldWork,
+ int *info
+){
+
+ if (*n <= MAX(CROSSOVER_ZPBTRF, 1)) {
+ // Unblocked
+ LAPACK(zpbtf2)(uplo, n, kd, Ab, ldAb, info);
+ return;
+ }
+
+ // Constants
+ const double ONE[] = { 1., 0. };
+ const double MONE[] = { -1., 0. };
+
+ // Unskew A
+ const int ldA[] = { *ldAb - 1 };
+ double *const A = Ab + 2 * ((*uplo == 'L') ? 0 : *kd);
+
+ // Splitting
+ const int n1 = MIN(ZREC_SPLIT(*n), *kd);
+ const int n2 = *n - n1;
+
+ // * *
+ // * Ab_BR
+ double *const Ab_BR = Ab + 2 * *ldAb * n1;
+
+ // A_TL A_TR
+ // A_BL A_BR
+ double *const A_TL = A;
+ double *const A_TR = A + 2 * *ldA * n1;
+ double *const A_BL = A + 2 * n1;
+ double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
+
+ // recursion(A_TL)
+ RELAPACK_zpotrf(uplo, &n1, A_TL, ldA, info);
+ if (*info)
+ return;
+
+ // Banded splitting
+ const int n21 = MIN(n2, *kd - n1);
+ const int n22 = MIN(n2 - n21, *kd);
+
+ // n1 n21 n22
+ // n1 * A_TRl A_TRr
+ // n21 A_BLt A_BRtl A_BRtr
+ // n22 A_BLb A_BRbl A_BRbr
+ double *const A_TRl = A_TR;
+ double *const A_TRr = A_TR + 2 * *ldA * n21;
+ double *const A_BLt = A_BL;
+ double *const A_BLb = A_BL + 2 * n21;
+ double *const A_BRtl = A_BR;
+ double *const A_BRtr = A_BR + 2 * *ldA * n21;
+ double *const A_BRbl = A_BR + 2 * n21;
+ double *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * n21;
+
+ if (*uplo == 'L') {
+ // A_BLt = A_BLt / A_TL'
+ BLAS(ztrsm)("R", "L", "C", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA);
+ // A_BRtl = A_BRtl - A_BLt * A_BLt'
+ BLAS(zherk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA);
+ // Work = A_BLb
+ LAPACK(zlacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork);
+ // Work = Work / A_TL'
+ BLAS(ztrsm)("R", "L", "C", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork);
+ // A_BRbl = A_BRbl - Work * A_BLt'
+ BLAS(zgemm)("N", "C", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA);
+ // A_BRbr = A_BRbr - Work * Work'
+ BLAS(zherk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
+ // A_BLb = Work
+ LAPACK(zlacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA);
+ } else {
+ // A_TRl = A_TL' \ A_TRl
+ BLAS(ztrsm)("L", "U", "C", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
+ // A_BRtl = A_BRtl - A_TRl' * A_TRl
+ BLAS(zherk)("U", "C", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA);
+ // Work = A_TRr
+ LAPACK(zlacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork);
+ // Work = A_TL' \ Work
+ BLAS(ztrsm)("L", "U", "C", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork);
+ // A_BRtr = A_BRtr - A_TRl' * Work
+ BLAS(zgemm)("C", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA);
+ // A_BRbr = A_BRbr - Work' * Work
+ BLAS(zherk)("U", "C", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
+ // A_TRr = Work
+ LAPACK(zlacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA);
+ }
+
+ // recursion(A_BR)
+ if (*kd > n1)
+ RELAPACK_zpotrf(uplo, &n2, A_BR, ldA, info);
+ else
+ RELAPACK_zpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info);
+ if (*info)
+ *info += n1;
+}
diff --git a/relapack/src/zpotrf.c b/relapack/src/zpotrf.c
new file mode 100644
index 000000000..411ac5fc0
--- /dev/null
+++ 
b/relapack/src/zpotrf.c @@ -0,0 +1,92 @@ +#include "relapack.h" + +static void RELAPACK_zpotrf_rec(const char *, const int *, double *, + const int *, int *); + + +/** ZPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix A. + * + * This routine is functionally equivalent to LAPACK's zpotrf. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d1/db9/zpotrf_8f.html + * */ +void RELAPACK_zpotrf( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("ZPOTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Recursive kernel + RELAPACK_zpotrf_rec(&cleanuplo, n, A, ldA, info); +} + + +/** zpotrf's recursive compute kernel */ +static void RELAPACK_zpotrf_rec( + const char *uplo, const int *n, + double *A, const int *ldA, + int *info +) { + + if (*n <= MAX(CROSSOVER_ZPOTRF, 1)) { + // Unblocked + LAPACK(zpotf2)(uplo, n, A, ldA, info); + return; + } + + // Constants + const double ONE[] = { 1., 0. }; + const double MONE[] = { -1., 0. }; + + // Splitting + const int n1 = ZREC_SPLIT(*n); + const int n2 = *n - n1; + + // A_TL A_TR + // A_BL A_BR + double *const A_TL = A; + double *const A_TR = A + 2 * *ldA * n1; + double *const A_BL = A + 2 * n1; + double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + + // recursion(A_TL) + RELAPACK_zpotrf_rec(uplo, &n1, A_TL, ldA, info); + if (*info) + return; + + if (*uplo == 'L') { + // A_BL = A_BL / A_TL' + BLAS(ztrsm)("R", "L", "C", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA); + // A_BR = A_BR - A_BL * A_BL' + BLAS(zherk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA); + } else { + // A_TR = A_TL' \ A_TR + BLAS(ztrsm)("L", "U", "C", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); + // A_BR = A_BR - A_TR' * A_TR + BLAS(zherk)("U", "C", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA); + } + + // recursion(A_BR) + RELAPACK_zpotrf_rec(uplo, &n2, A_BR, ldA, info); + if (*info) + *info += n1; +} diff --git a/relapack/src/zsytrf.c b/relapack/src/zsytrf.c new file mode 100644 index 000000000..3be21563a --- /dev/null +++ b/relapack/src/zsytrf.c @@ -0,0 +1,238 @@ +#include "relapack.h" +#if XSYTRF_ALLOW_MALLOC +#include <stdlib.h> +#endif + +static void RELAPACK_zsytrf_rec(const char *, const int *, const int *, int *, + double *, const int *, int *, double *, const int *, int *); + + +/** ZSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method. + * + * This routine is functionally equivalent to LAPACK's zsytrf. 
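+ *
+ * The calling convention matches RELAPACK_zhetrf: the same Work/lWork
+ * handling applies (lWork >= n * (n / 2), lWork == -1 for a size query,
+ * optional malloc fallback under XSYTRF_ALLOW_MALLOC). The difference is
+ * that A is complex symmetric rather than Hermitian, so the computed
+ * factorization is A = U*D*U**T or A = L*D*L**T, and the unblocked
+ * kernels are zsytf2 and RELAPACK_zsytrf_rec2 (a modification of zlasyf)
+ * instead of their Hermitian counterparts.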
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/da/d94/zsytrf_8f.html + * */ +void RELAPACK_zsytrf( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + double *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * 2 * sizeof(double)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("ZSYTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy arguments + int nout; + + // Recursive kernel + RELAPACK_zsytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** zsytrf's recursive compute kernel */ +static void RELAPACK_zsytrf_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + double *A, const int *ldA, int *ipiv, + double *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_ZSYTRF, 3)) { + // Unblocked + if (top) { + LAPACK(zsytf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_zsytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const double ONE[] = { 1., 0. }; + const double MONE[] = { -1., 0. }; + const int iONE[] = { 1 }; + + // Loop iterator + int i; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = ZREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + double *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_zsytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + double *const A_BL = A + 2 * n1; + double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + double *const A_BL_B = A + 2 * *n; + double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + double *const Work_BL = Work + 2 * n1; + double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; + const int ldWork_BR = top ? 
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_zsytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; + + // last row of A_BL + double *const A_BL_b = A_BL + 2 * n2_out; + + // last row of Work_BL + double *const Work_BL_b = Work_BL + 2 * n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = ZREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_zsytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + double *const A_TL_T = A + 2 * *ldA * n_rest; + double *const A_TR_T = A + 2 * *ldA * (n_rest + n1); + double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; + double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + double *const Work_L = Work; + double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_zsytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/zsytrf_rec2.c b/relapack/src/zsytrf_rec2.c new file mode 100644 index 000000000..33902ee9e --- /dev/null +++ b/relapack/src/zsytrf_rec2.c @@ -0,0 +1,452 @@ +/* -- translated by f2c (version 20100827). 
+   You must link the resulting object file with libf2c:
+   on Microsoft Windows system, link with libf2c.lib;
+   on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+   or, if you install libf2c.a in a standard place, with -lf2c -lm
+   -- in that order, at the end of the command line, as in
+       cc *.o -lf2c -lm
+   Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+       http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static doublecomplex c_b1 = {1.,0.};
+static int c__1 = 1;
+
+/** ZSYTRF_REC2 computes a partial factorization of a complex symmetric matrix using the Bunch-Kaufman diagonal pivoting method.
+ *
+ * This routine is a minor modification of LAPACK's zlasyf.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
+ * */
+/* Subroutine */ void RELAPACK_zsytrf_rec2(char *uplo, int *n, int *
+        nb, int *kb, doublecomplex *a, int *lda, int *ipiv,
+        doublecomplex *w, int *ldw, int *info, ftnlen uplo_len)
+{
+    /* System generated locals */
+    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
+    double d__1, d__2, d__3, d__4;
+    doublecomplex z__1, z__2, z__3;
+
+    /* Builtin functions */
+    double sqrt(double), d_imag(doublecomplex *);
+    void z_div(doublecomplex *, doublecomplex *, doublecomplex *);
+
+    /* Local variables */
+    static int j, k;
+    static doublecomplex t, r1, d11, d21, d22;
+    static int jj, kk, jp, kp, kw, kkw, imax, jmax;
+    static double alpha;
+    extern logical lsame_(char *, char *, ftnlen, ftnlen);
+    extern /* Subroutine */ int zscal_(int *, doublecomplex *,
+            doublecomplex *, int *);
+    static int kstep;
+    extern /* Subroutine */ int zgemv_(char *, int *, int *,
+            doublecomplex *, doublecomplex *, int *, doublecomplex *,
+            int *, doublecomplex *, doublecomplex *, int *, ftnlen),
+            zcopy_(int *, doublecomplex *, int *, doublecomplex *,
+            int *), zswap_(int *, doublecomplex *, int *,
+            doublecomplex *, int *);
+    static double absakk, colmax;
+    extern int izamax_(int *, doublecomplex *, int *);
+    static double rowmax;
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --ipiv;
+    w_dim1 = *ldw;
+    w_offset = 1 + w_dim1;
+    w -= w_offset;
+
+    /* Function Body */
+    *info = 0;
+    alpha = (sqrt(17.) + 1.) / 8.;
+    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+        k = *n;
+L10:
+        kw = *nb + k - *n;
+        if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
+            goto L30;
+        }
+        zcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
+        if (k < *n) {
+            i__1 = *n - k;
+            z__1.r = -1., z__1.i = -0.;
+            zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1],
+                     lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
+                    w_dim1 + 1], &c__1, (ftnlen)12);
+        }
+        kstep = 1;
+        i__1 = k + kw * w_dim1;
+        absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + kw *
+                w_dim1]), abs(d__2));
+        if (k > 1) {
+            i__1 = k - 1;
+            imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+            i__1 = imax + kw * w_dim1;
+            colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
+                    kw * w_dim1]), abs(d__2));
+        } else {
+            colmax = 0.;
+        }
+        if (max(absakk,colmax) == 0.)
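+        /* column k is exactly zero, so D(k,k) is singular: record the first
+           such k in *info and continue the factorization with kp = k */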
{ + if (*info == 0) { + *info = k; + } + kp = k; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + zcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + i__1 = k - imax; + zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + } + i__1 = k - imax; + jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], + &c__1); + i__1 = jmax + (kw - 1) * w_dim1; + rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ + jmax + (kw - 1) * w_dim1]), abs(d__2)); + if (imax > 1) { + i__1 = imax - 1; + jmax = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); +/* Computing MAX */ + i__1 = jmax + (kw - 1) * w_dim1; + d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + ( + d__2 = d_imag(&w[jmax + (kw - 1) * w_dim1]), abs( + d__2)); + rowmax = max(d__3,d__4); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else /* if(complicated condition) */ { + i__1 = imax + (kw - 1) * w_dim1; + if ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ + imax + (kw - 1) * w_dim1]), abs(d__2)) >= alpha * + rowmax) { + kp = imax; + zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; + i__1 = kk - 1 - kp; + zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + if (kp > 1) { + i__1 = kp - 1; + zcopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + + 1], &c__1); + } + if (k < *n) { + i__1 = *n - k; + zswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k + + 1) * a_dim1], lda); + } + i__1 = *n - kk + 1; + zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + z_div(&z__1, &c_b1, &a[k + k * a_dim1]); + r1.r = z__1.r, r1.i = z__1.i; + i__1 = k - 1; + zscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + } else { + if (k > 2) { + i__1 = k - 1 + kw * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + z_div(&z__1, &w[k + kw * w_dim1], &d21); + d11.r = z__1.r, d11.i = z__1.i; + z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); + d22.r = z__1.r, d22.i = z__1.i; + z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r * + d22.i + d11.i * d22.r; + z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.; + z_div(&z__1, &c_b1, &z__2); + t.r = z__1.r, t.i = z__1.i; + z_div(&z__1, &t, &d21); + d21.r = z__1.r, d21.i = z__1.i; + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + i__2 = j + (k - 1) * a_dim1; + i__3 = j + (kw - 1) * w_dim1; + z__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + z__3.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + kw * w_dim1; + z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4] + .i; + z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i = + d21.r * z__2.i + d21.i * z__2.r; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; + i__2 = j + k * a_dim1; + i__3 = j + kw * w_dim1; + z__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + z__3.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + (kw - 1) * w_dim1; + z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4] + .i; + z__1.r = d21.r * z__2.r - d21.i * 
z__2.i, z__1.i = + d21.r * z__2.i + d21.i * z__2.r; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; +/* L20: */ + } + } + i__1 = k - 1 + (k - 1) * a_dim1; + i__2 = k - 1 + (kw - 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1 + k * a_dim1; + i__2 = k - 1 + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + k * a_dim1; + i__2 = k + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -kp; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + jj = j; + jp = ipiv[j]; + if (jp < 0) { + jp = -jp; + ++j; + } + ++j; + if (jp != jj && j <= *n) { + i__1 = *n - j + 1; + zswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda); + } + if (j < *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + i__1 = *n - k + 1; + zcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); + i__1 = *n - k + 1; + i__2 = k - 1; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, &w[k + + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12); + kstep = 1; + i__1 = k + k * w_dim1; + absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + k * + w_dim1]), abs(d__2)); + if (k < *n) { + i__1 = *n - k; + imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = imax + k * w_dim1; + colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + + k * w_dim1]), abs(d__2)); + } else { + colmax = 0.; + } + if (max(absakk,colmax) == 0.) { + if (*info == 0) { + *info = k; + } + kp = k; + } else { + if (absakk >= alpha * colmax) { + kp = k; + } else { + i__1 = imax - k; + zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = *n - imax + 1; + zcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + + 1) * w_dim1], &c__1); + i__1 = *n - k + 1; + i__2 = k - 1; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], + lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) * + w_dim1], &c__1, (ftnlen)12); + i__1 = imax - k; + jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1) + ; + i__1 = jmax + (k + 1) * w_dim1; + rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ + jmax + (k + 1) * w_dim1]), abs(d__2)); + if (imax < *n) { + i__1 = *n - imax; + jmax = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); +/* Computing MAX */ + i__1 = jmax + (k + 1) * w_dim1; + d__3 = rowmax, d__4 = (d__1 = w[i__1].r, abs(d__1)) + ( + d__2 = d_imag(&w[jmax + (k + 1) * w_dim1]), abs( + d__2)); + rowmax = max(d__3,d__4); + } + if (absakk >= alpha * colmax * (colmax / rowmax)) { + kp = k; + } else /* if(complicated condition) */ { + i__1 = imax + (k + 1) * w_dim1; + if ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ + imax + (k + 1) * w_dim1]), abs(d__2)) >= alpha * + rowmax) { + kp = imax; + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + + k * w_dim1], &c__1); + } else { + kp = imax; + kstep = 2; + } + } + } + kk = k + kstep - 1; + if (kp != kk) { + i__1 = kp + kp * a_dim1; + i__2 = kk + kk * a_dim1; + a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; + i__1 = kp - kk - 1; + zcopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + + 1) * a_dim1], lda); + if (kp < *n) { + i__1 = *n - kp; + zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 + + kp * a_dim1], &c__1); + } + if (k > 1) { + i__1 = k - 1; + zswap_(&i__1, &a[kk + 
a_dim1], lda, &a[kp + a_dim1], lda); + } + zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); + } + if (kstep == 1) { + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + if (k < *n) { + z_div(&z__1, &c_b1, &a[k + k * a_dim1]); + r1.r = z__1.r, r1.i = z__1.i; + i__1 = *n - k; + zscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); + } + } else { + if (k < *n - 1) { + i__1 = k + 1 + k * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21); + d11.r = z__1.r, d11.i = z__1.i; + z_div(&z__1, &w[k + k * w_dim1], &d21); + d22.r = z__1.r, d22.i = z__1.i; + z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r * + d22.i + d11.i * d22.r; + z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.; + z_div(&z__1, &c_b1, &z__2); + t.r = z__1.r, t.i = z__1.i; + z_div(&z__1, &t, &d21); + d21.r = z__1.r, d21.i = z__1.i; + i__1 = *n; + for (j = k + 2; j <= i__1; ++j) { + i__2 = j + k * a_dim1; + i__3 = j + k * w_dim1; + z__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + z__3.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + (k + 1) * w_dim1; + z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4] + .i; + z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i = + d21.r * z__2.i + d21.i * z__2.r; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; + i__2 = j + (k + 1) * a_dim1; + i__3 = j + (k + 1) * w_dim1; + z__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + z__3.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + k * w_dim1; + z__2.r = z__3.r - w[i__4].r, z__2.i = z__3.i - w[i__4] + .i; + z__1.r = d21.r * z__2.r - d21.i * z__2.i, z__1.i = + d21.r * z__2.i + d21.i * z__2.r; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; +/* L80: */ + } + } + i__1 = k + k * a_dim1; + i__2 = k + k * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + 1 + k * a_dim1; + i__2 = k + 1 + k * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + 1 + (k + 1) * a_dim1; + i__2 = k + 1 + (k + 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -kp; + ipiv[k + 1] = -kp; + } + k += kstep; + goto L70; +L90: + j = k - 1; +L120: + jj = j; + jp = ipiv[j]; + if (jp < 0) { + jp = -jp; + --j; + } + --j; + if (jp != jj && j >= 1) { + zswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda); + } + if (j > 1) { + goto L120; + } + *kb = k - 1; + } + return; +} diff --git a/relapack/src/zsytrf_rook.c b/relapack/src/zsytrf_rook.c new file mode 100644 index 000000000..c598f7b1e --- /dev/null +++ b/relapack/src/zsytrf_rook.c @@ -0,0 +1,236 @@ +#include "relapack.h" +#if XSYTRF_ALLOW_MALLOC +#include <stdlib.h> +#endif + +static void RELAPACK_zsytrf_rook_rec(const char *, const int *, const int *, int *, + double *, const int *, int *, double *, const int *, int *); + + +/** ZSYTRF_ROOK computes the factorization of a complex symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. + * + * This routine is functionally equivalent to LAPACK's zsytrf_rook. 
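+ * As with LAPACK, the required workspace can be obtained beforehand through a
+ * workspace query: calling the routine with lWork = -1 only stores the needed
+ * size in Work[0]. An illustrative caller-side sketch (surrounding variables
+ * assumed, not part of this file):
+ *
+ *     int lWork = -1, info;
+ *     double query[2];
+ *     RELAPACK_zsytrf_rook(uplo, &n, A, &ldA, ipiv, query, &lWork, &info);
+ *     lWork = (int) query[0];
+ *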
+ * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d6/d6e/zsytrf__rook_8f.html + * */ +void RELAPACK_zsytrf_rook( + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info +) { + + // Required work size + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; +#if XSYTRF_ALLOW_MALLOC + minlWork = 1; +#endif + + // Check arguments + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + *info = 0; + if (!lower && !upper) + *info = -1; + else if (*n < 0) + *info = -2; + else if (*ldA < MAX(1, *n)) + *info = -4; + else if (*lWork < minlWork && *lWork != -1) + *info = -7; + else if (*lWork == -1) { + // Work size query + *Work = cleanlWork; + return; + } + + // Ensure Work size + double *cleanWork = Work; +#if XSYTRF_ALLOW_MALLOC + if (!*info && *lWork < cleanlWork) { + cleanWork = malloc(cleanlWork * 2 * sizeof(double)); + if (!cleanWork) + *info = -7; + } +#endif + + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("ZSYTRF", &minfo); + return; + } + + // Clean char * arguments + const char cleanuplo = lower ? 'L' : 'U'; + + // Dummy argument + int nout; + + // Recursive kernel + RELAPACK_zsytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); + +#if XSYTRF_ALLOW_MALLOC + if (cleanWork != Work) + free(cleanWork); +#endif +} + + +/** zsytrf_rook's recursive compute kernel */ +static void RELAPACK_zsytrf_rook_rec( + const char *uplo, const int *n_full, const int *n, int *n_out, + double *A, const int *ldA, int *ipiv, + double *Work, const int *ldWork, int *info +) { + + // top recursion level? + const int top = *n_full == *n; + + if (*n <= MAX(CROSSOVER_ZSYTRF_ROOK, 3)) { + // Unblocked + if (top) { + LAPACK(zsytf2)(uplo, n, A, ldA, ipiv, info); + *n_out = *n; + } else + RELAPACK_zsytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info); + return; + } + + int info1, info2; + + // Constants + const double ONE[] = { 1., 0. }; + const double MONE[] = { -1., 0. }; + const int iONE[] = { 1 }; + + const int n_rest = *n_full - *n; + + if (*uplo == 'L') { + // Splitting (setup) + int n1 = ZREC_SPLIT(*n); + int n2 = *n - n1; + + // Work_L * + double *const Work_L = Work; + + // recursion(A_L) + int n1_out; + RELAPACK_zsytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); + n1 = n1_out; + + // Splitting (continued) + n2 = *n - n1; + const int n_full2 = *n_full - n1; + + // * * + // A_BL A_BR + // A_BL_B A_BR_B + double *const A_BL = A + 2 * n1; + double *const A_BR = A + 2 * *ldA * n1 + 2 * n1; + double *const A_BL_B = A + 2 * *n; + double *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n; + + // * * + // Work_BL Work_BR + // * * + // (top recursion level: use Work as Work_BR) + double *const Work_BL = Work + 2 * n1; + double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; + const int ldWork_BR = top ? 
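+        /* (as in zsytrf above: the top-level call reuses Work as a packed
+           Work_BR, hence the leading dimension n2) */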
n2 : *ldWork; + + // ipiv_T + // ipiv_B + int *const ipiv_B = ipiv + n1; + + // A_BR = A_BR - A_BL Work_BL' + RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); + BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); + + // recursion(A_BR) + int n2_out; + RELAPACK_zsytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); + + if (n2_out != n2) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // last column of A_BR + double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; + + // last row of A_BL + double *const A_BL_b = A_BL + 2 * n2_out; + + // last row of Work_BL + double *const Work_BL_b = Work_BL + 2 * n2_out; + + // A_BR_r = A_BR_r + A_BL_b Work_BL_b' + BLAS(zgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE); + } + n2 = n2_out; + + // shift pivots + int i; + for (i = 0; i < n2; i++) + if (ipiv_B[i] > 0) + ipiv_B[i] += n1; + else + ipiv_B[i] -= n1; + + *info = info1 || info2; + *n_out = n1 + n2; + } else { + // Splitting (setup) + int n2 = ZREC_SPLIT(*n); + int n1 = *n - n2; + + // * Work_R + // (top recursion level: use Work as Work_R) + double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; + + // recursion(A_R) + int n2_out; + RELAPACK_zsytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); + const int n2_diff = n2 - n2_out; + n2 = n2_out; + + // Splitting (continued) + n1 = *n - n2; + const int n_full1 = *n_full - n2; + + // * A_TL_T A_TR_T + // * A_TL A_TR + // * * * + double *const A_TL_T = A + 2 * *ldA * n_rest; + double *const A_TR_T = A + 2 * *ldA * (n_rest + n1); + double *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest; + double *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest; + + // Work_L * + // * Work_TR + // * * + // (top recursion level: Work_R was Work) + double *const Work_L = Work; + double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; + const int ldWork_L = top ? n1 : *ldWork; + + // A_TL = A_TL - A_TR Work_TR' + RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); + BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); + + // recursion(A_TL) + int n1_out; + RELAPACK_zsytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); + + if (n1_out != n1) { + // undo 1 column of updates + const int n_restp1 = n_rest + 1; + + // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' + BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); + } + n1 = n1_out; + + *info = info2 || info1; + *n_out = n1 + n2; + } +} diff --git a/relapack/src/zsytrf_rook_rec2.c b/relapack/src/zsytrf_rook_rec2.c new file mode 100644 index 000000000..9e111fe0c --- /dev/null +++ b/relapack/src/zsytrf_rook_rec2.c @@ -0,0 +1,561 @@ +/* -- translated by f2c (version 20100827). 
+   You must link the resulting object file with libf2c:
+   on Microsoft Windows system, link with libf2c.lib;
+   on Linux or Unix systems, link with .../path/to/libf2c.a -lm
+   or, if you install libf2c.a in a standard place, with -lf2c -lm
+   -- in that order, at the end of the command line, as in
+       cc *.o -lf2c -lm
+   Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
+
+       http://www.netlib.org/f2c/libf2c.zip
+*/
+
+#include "f2c.h"
+
+/* Table of constant values */
+
+static doublecomplex c_b1 = {1.,0.};
+static int c__1 = 1;
+
+/** ZSYTRF_ROOK_REC2 computes a partial factorization of a complex symmetric matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+ *
+ * This routine is a minor modification of LAPACK's zlasyf_rook.
+ * It serves as an unblocked kernel in the recursive algorithms.
+ * The blocked BLAS Level 3 updates were removed and moved to the
+ * recursive algorithm.
+ * */
+/* Subroutine */ void RELAPACK_zsytrf_rook_rec2(char *uplo, int *n,
+        int *nb, int *kb, doublecomplex *a, int *lda, int *
+        ipiv, doublecomplex *w, int *ldw, int *info, ftnlen uplo_len)
+{
+    /* System generated locals */
+    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
+    double d__1, d__2;
+    doublecomplex z__1, z__2, z__3, z__4;
+
+    /* Builtin functions */
+    double sqrt(double), d_imag(doublecomplex *);
+    void z_div(doublecomplex *, doublecomplex *, doublecomplex *);
+
+    /* Local variables */
+    static int j, k, p;
+    static doublecomplex t, r1, d11, d12, d21, d22;
+    static int ii, jj, kk, kp, kw, jp1, jp2, kkw;
+    static logical done;
+    static int imax, jmax;
+    static double alpha;
+    extern logical lsame_(char *, char *, ftnlen, ftnlen);
+    static double dtemp, sfmin;
+    extern /* Subroutine */ int zscal_(int *, doublecomplex *,
+            doublecomplex *, int *);
+    static int itemp, kstep;
+    extern /* Subroutine */ int zgemv_(char *, int *, int *,
+            doublecomplex *, doublecomplex *, int *, doublecomplex *,
+            int *, doublecomplex *, doublecomplex *, int *, ftnlen),
+            zcopy_(int *, doublecomplex *, int *, doublecomplex *,
+            int *), zswap_(int *, doublecomplex *, int *,
+            doublecomplex *, int *);
+    extern double dlamch_(char *, ftnlen);
+    static double absakk, colmax;
+    extern int izamax_(int *, doublecomplex *, int *);
+    static double rowmax;
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --ipiv;
+    w_dim1 = *ldw;
+    w_offset = 1 + w_dim1;
+    w -= w_offset;
+
+    /* Function Body */
+    *info = 0;
+    alpha = (sqrt(17.) + 1.) / 8.;
+    sfmin = dlamch_("S", (ftnlen)1);
+    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+        k = *n;
+L10:
+        kw = *nb + k - *n;
+        if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
+            goto L30;
+        }
+        kstep = 1;
+        p = k;
+        zcopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
+        if (k < *n) {
+            i__1 = *n - k;
+            z__1.r = -1., z__1.i = -0.;
+            zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * a_dim1 + 1],
+                     lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
+                    w_dim1 + 1], &c__1, (ftnlen)12);
+        }
+        i__1 = k + kw * w_dim1;
+        absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + kw *
+                w_dim1]), abs(d__2));
+        if (k > 1) {
+            i__1 = k - 1;
+            imax = izamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
+            i__1 = imax + kw * w_dim1;
+            colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax +
+                    kw * w_dim1]), abs(d__2));
+        } else {
+            colmax = 0.;
+        }
+        if (max(absakk,colmax) == 0.) {
+            if (*info == 0) {
+                *info = k;
+            }
+            kp = k;
+            zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
+        } else {
+            if (!
(absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L12: + zcopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * + w_dim1 + 1], &c__1); + i__1 = k - imax; + zcopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + + 1 + (kw - 1) * w_dim1], &c__1); + if (k < *n) { + i__1 = *n - k; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &k, &i__1, &z__1, &a[(k + 1) * + a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], + ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( + ftnlen)12); + } + if (imax != k) { + i__1 = k - imax; + jmax = imax + izamax_(&i__1, &w[imax + 1 + (kw - 1) * + w_dim1], &c__1); + i__1 = jmax + (kw - 1) * w_dim1; + rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(& + w[jmax + (kw - 1) * w_dim1]), abs(d__2)); + } else { + rowmax = 0.; + } + if (imax > 1) { + i__1 = imax - 1; + itemp = izamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); + i__1 = itemp + (kw - 1) * w_dim1; + dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ + itemp + (kw - 1) * w_dim1]), abs(d__2)); + if (dtemp > rowmax) { + rowmax = dtemp; + jmax = itemp; + } + } + i__1 = imax + (kw - 1) * w_dim1; + if (! ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + + (kw - 1) * w_dim1]), abs(d__2)) < alpha * rowmax)) { + kp = imax; + zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + zcopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * + w_dim1 + 1], &c__1); + } + if (! done) { + goto L12; + } + } + kk = k - kstep + 1; + kkw = *nb + kk - *n; + if (kstep == 2 && p != k) { + i__1 = k - p; + zcopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * + a_dim1], lda); + zcopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], & + c__1); + i__1 = *n - k + 1; + zswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1], + lda); + i__1 = *n - kk + 1; + zswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], + ldw); + } + if (kp != kk) { + i__1 = kp + k * a_dim1; + i__2 = kk + k * a_dim1; + a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; + i__1 = k - 1 - kp; + zcopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + + 1) * a_dim1], lda); + zcopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], & + c__1); + i__1 = *n - kk + 1; + zswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1], + lda); + i__1 = *n - kk + 1; + zswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * + w_dim1], ldw); + } + if (kstep == 1) { + zcopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & + c__1); + if (k > 1) { + i__1 = k + k * a_dim1; + if ((d__1 = a[i__1].r, abs(d__1)) + (d__2 = d_imag(&a[k + + k * a_dim1]), abs(d__2)) >= sfmin) { + z_div(&z__1, &c_b1, &a[k + k * a_dim1]); + r1.r = z__1.r, r1.i = z__1.i; + i__1 = k - 1; + zscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); + } else /* if(complicated condition) */ { + i__1 = k + k * a_dim1; + if (a[i__1].r != 0. || a[i__1].i != 0.) 
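+/* A(k,k) is nonzero but smaller than sfmin: divide the column entries
+   one by one, since multiplying by the reciprocal 1/A(k,k) could overflow */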
{ + i__1 = k - 1; + for (ii = 1; ii <= i__1; ++ii) { + i__2 = ii + k * a_dim1; + z_div(&z__1, &a[ii + k * a_dim1], &a[k + k * + a_dim1]); + a[i__2].r = z__1.r, a[i__2].i = z__1.i; +/* L14: */ + } + } + } + } + } else { + if (k > 2) { + i__1 = k - 1 + kw * w_dim1; + d12.r = w[i__1].r, d12.i = w[i__1].i; + z_div(&z__1, &w[k + kw * w_dim1], &d12); + d11.r = z__1.r, d11.i = z__1.i; + z_div(&z__1, &w[k - 1 + (kw - 1) * w_dim1], &d12); + d22.r = z__1.r, d22.i = z__1.i; + z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r * + d22.i + d11.i * d22.r; + z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.; + z_div(&z__1, &c_b1, &z__2); + t.r = z__1.r, t.i = z__1.i; + i__1 = k - 2; + for (j = 1; j <= i__1; ++j) { + i__2 = j + (k - 1) * a_dim1; + i__3 = j + (kw - 1) * w_dim1; + z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + z__4.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + kw * w_dim1; + z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] + .i; + z_div(&z__2, &z__3, &d12); + z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r * + z__2.i + t.i * z__2.r; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; + i__2 = j + k * a_dim1; + i__3 = j + kw * w_dim1; + z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + z__4.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + (kw - 1) * w_dim1; + z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] + .i; + z_div(&z__2, &z__3, &d12); + z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r * + z__2.i + t.i * z__2.r; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; +/* L20: */ + } + } + i__1 = k - 1 + (k - 1) * a_dim1; + i__2 = k - 1 + (kw - 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k - 1 + k * a_dim1; + i__2 = k - 1 + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + k * a_dim1; + i__2 = k + kw * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -p; + ipiv[k - 1] = -kp; + } + k -= kstep; + goto L10; +L30: + j = k + 1; +L60: + kstep = 1; + jp1 = 1; + jj = j; + jp2 = ipiv[j]; + if (jp2 < 0) { + jp2 = -jp2; + ++j; + jp1 = -ipiv[j]; + kstep = 2; + } + ++j; + if (jp2 != jj && j <= *n) { + i__1 = *n - j + 1; + zswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + jj = j - 1; + if (jp1 != jj && kstep == 2) { + i__1 = *n - j + 1; + zswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) + ; + } + if (j <= *n) { + goto L60; + } + *kb = *n - k; + } else { + k = 1; +L70: + if ((k >= *nb && *nb < *n) || k > *n) { + goto L90; + } + kstep = 1; + p = k; + i__1 = *n - k + 1; + zcopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1], lda, & + w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, ( + ftnlen)12); + } + i__1 = k + k * w_dim1; + absakk = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[k + k * + w_dim1]), abs(d__2)); + if (k < *n) { + i__1 = *n - k; + imax = k + izamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); + i__1 = imax + k * w_dim1; + colmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + + k * w_dim1]), abs(d__2)); + } else { + colmax = 0.; + } + if (max(absakk,colmax) == 0.) { + if (*info == 0) { + *info = k; + } + kp = k; + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + } else { + if (! 
(absakk < alpha * colmax)) { + kp = k; + } else { + done = FALSE_; +L72: + i__1 = imax - k; + zcopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * + w_dim1], &c__1); + i__1 = *n - imax + 1; + zcopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + + 1) * w_dim1], &c__1); + if (k > 1) { + i__1 = *n - k + 1; + i__2 = k - 1; + z__1.r = -1., z__1.i = -0.; + zgemv_("No transpose", &i__1, &i__2, &z__1, &a[k + a_dim1] + , lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + + 1) * w_dim1], &c__1, (ftnlen)12); + } + if (imax != k) { + i__1 = imax - k; + jmax = k - 1 + izamax_(&i__1, &w[k + (k + 1) * w_dim1], & + c__1); + i__1 = jmax + (k + 1) * w_dim1; + rowmax = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(& + w[jmax + (k + 1) * w_dim1]), abs(d__2)); + } else { + rowmax = 0.; + } + if (imax < *n) { + i__1 = *n - imax; + itemp = imax + izamax_(&i__1, &w[imax + 1 + (k + 1) * + w_dim1], &c__1); + i__1 = itemp + (k + 1) * w_dim1; + dtemp = (d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[ + itemp + (k + 1) * w_dim1]), abs(d__2)); + if (dtemp > rowmax) { + rowmax = dtemp; + jmax = itemp; + } + } + i__1 = imax + (k + 1) * w_dim1; + if (! ((d__1 = w[i__1].r, abs(d__1)) + (d__2 = d_imag(&w[imax + + (k + 1) * w_dim1]), abs(d__2)) < alpha * rowmax)) { + kp = imax; + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + done = TRUE_; + } else if (p == jmax || rowmax <= colmax) { + kp = imax; + kstep = 2; + done = TRUE_; + } else { + p = imax; + colmax = rowmax; + imax = jmax; + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * + w_dim1], &c__1); + } + if (! done) { + goto L72; + } + } + kk = k + kstep - 1; + if (kstep == 2 && p != k) { + i__1 = p - k; + zcopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1], + lda); + i__1 = *n - p + 1; + zcopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], & + c__1); + zswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda); + zswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); + } + if (kp != kk) { + i__1 = kp + k * a_dim1; + i__2 = kk + k * a_dim1; + a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; + i__1 = kp - k - 1; + zcopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1) + * a_dim1], lda); + i__1 = *n - kp + 1; + zcopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp * + a_dim1], &c__1); + zswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); + zswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); + } + if (kstep == 1) { + i__1 = *n - k + 1; + zcopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & + c__1); + if (k < *n) { + i__1 = k + k * a_dim1; + if ((d__1 = a[i__1].r, abs(d__1)) + (d__2 = d_imag(&a[k + + k * a_dim1]), abs(d__2)) >= sfmin) { + z_div(&z__1, &c_b1, &a[k + k * a_dim1]); + r1.r = z__1.r, r1.i = z__1.i; + i__1 = *n - k; + zscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); + } else /* if(complicated condition) */ { + i__1 = k + k * a_dim1; + if (a[i__1].r != 0. || a[i__1].i != 0.) 
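+/* same safeguard as in the upper case: for a tiny nonzero pivot the
+   quotients are formed directly instead of scaling by 1/A(k,k) */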
{ + i__1 = *n; + for (ii = k + 1; ii <= i__1; ++ii) { + i__2 = ii + k * a_dim1; + z_div(&z__1, &a[ii + k * a_dim1], &a[k + k * + a_dim1]); + a[i__2].r = z__1.r, a[i__2].i = z__1.i; +/* L74: */ + } + } + } + } + } else { + if (k < *n - 1) { + i__1 = k + 1 + k * w_dim1; + d21.r = w[i__1].r, d21.i = w[i__1].i; + z_div(&z__1, &w[k + 1 + (k + 1) * w_dim1], &d21); + d11.r = z__1.r, d11.i = z__1.i; + z_div(&z__1, &w[k + k * w_dim1], &d21); + d22.r = z__1.r, d22.i = z__1.i; + z__3.r = d11.r * d22.r - d11.i * d22.i, z__3.i = d11.r * + d22.i + d11.i * d22.r; + z__2.r = z__3.r - 1., z__2.i = z__3.i - 0.; + z_div(&z__1, &c_b1, &z__2); + t.r = z__1.r, t.i = z__1.i; + i__1 = *n; + for (j = k + 2; j <= i__1; ++j) { + i__2 = j + k * a_dim1; + i__3 = j + k * w_dim1; + z__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, + z__4.i = d11.r * w[i__3].i + d11.i * w[i__3] + .r; + i__4 = j + (k + 1) * w_dim1; + z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] + .i; + z_div(&z__2, &z__3, &d21); + z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r * + z__2.i + t.i * z__2.r; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; + i__2 = j + (k + 1) * a_dim1; + i__3 = j + (k + 1) * w_dim1; + z__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, + z__4.i = d22.r * w[i__3].i + d22.i * w[i__3] + .r; + i__4 = j + k * w_dim1; + z__3.r = z__4.r - w[i__4].r, z__3.i = z__4.i - w[i__4] + .i; + z_div(&z__2, &z__3, &d21); + z__1.r = t.r * z__2.r - t.i * z__2.i, z__1.i = t.r * + z__2.i + t.i * z__2.r; + a[i__2].r = z__1.r, a[i__2].i = z__1.i; +/* L80: */ + } + } + i__1 = k + k * a_dim1; + i__2 = k + k * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + 1 + k * a_dim1; + i__2 = k + 1 + k * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + i__1 = k + 1 + (k + 1) * a_dim1; + i__2 = k + 1 + (k + 1) * w_dim1; + a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; + } + } + if (kstep == 1) { + ipiv[k] = kp; + } else { + ipiv[k] = -p; + ipiv[k + 1] = -kp; + } + k += kstep; + goto L70; +L90: + j = k - 1; +L120: + kstep = 1; + jp1 = 1; + jj = j; + jp2 = ipiv[j]; + if (jp2 < 0) { + jp2 = -jp2; + --j; + jp1 = -ipiv[j]; + kstep = 2; + } + --j; + if (jp2 != jj && j >= 1) { + zswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda); + } + jj = j + 1; + if (jp1 != jj && kstep == 2) { + zswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda); + } + if (j >= 1) { + goto L120; + } + *kb = k - 1; + } + return; +} diff --git a/relapack/src/ztgsyl.c b/relapack/src/ztgsyl.c new file mode 100644 index 000000000..2c8a35256 --- /dev/null +++ b/relapack/src/ztgsyl.c @@ -0,0 +1,268 @@ +#include "relapack.h" +#include <math.h> + +static void RELAPACK_ztgsyl_rec(const char *, const int *, const int *, + const int *, const double *, const int *, const double *, const int *, + double *, const int *, const double *, const int *, const double *, + const int *, double *, const int *, double *, double *, double *, int *); + + +/** ZTGSYL solves the generalized Sylvester equation. + * + * This routine is functionally equivalent to LAPACK's ztgsyl. 
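+ * For trans = 'N', it solves the coupled pair of equations
+ *
+ *     A * R - L * B = scale * C
+ *     D * R - L * E = scale * F
+ *
+ * for the unknown m-by-n matrices R and L, which are returned in C and F.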
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/db/d68/ztgsyl_8f.html
+ * */
+void RELAPACK_ztgsyl(
+    const char *trans, const int *ijob, const int *m, const int *n,
+    const double *A, const int *ldA, const double *B, const int *ldB,
+    double *C, const int *ldC,
+    const double *D, const int *ldD, const double *E, const int *ldE,
+    double *F, const int *ldF,
+    double *scale, double *dif,
+    double *Work, const int *lWork, int *iWork, int *info
+) {
+
+    // Parse arguments
+    const int notran = LAPACK(lsame)(trans, "N");
+    const int tran = LAPACK(lsame)(trans, "C");
+
+    // Compute work buffer size
+    int lwmin = 1;
+    if (notran && (*ijob == 1 || *ijob == 2))
+        lwmin = MAX(1, 2 * *m * *n);
+    *info = 0;
+
+    // Check arguments
+    if (!tran && !notran)
+        *info = -1;
+    else if (notran && (*ijob < 0 || *ijob > 4))
+        *info = -2;
+    else if (*m <= 0)
+        *info = -3;
+    else if (*n <= 0)
+        *info = -4;
+    else if (*ldA < MAX(1, *m))
+        *info = -6;
+    else if (*ldB < MAX(1, *n))
+        *info = -8;
+    else if (*ldC < MAX(1, *m))
+        *info = -10;
+    else if (*ldD < MAX(1, *m))
+        *info = -12;
+    else if (*ldE < MAX(1, *n))
+        *info = -14;
+    else if (*ldF < MAX(1, *m))
+        *info = -16;
+    else if (*lWork < lwmin && *lWork != -1)
+        *info = -20;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("ZTGSYL", &minfo);
+        return;
+    }
+
+    if (*lWork == -1) {
+        // Work size query
+        *Work = lwmin;
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleantrans = notran ? 'N' : 'C';
+
+    // Constant
+    const double ZERO[] = { 0., 0. };
+
+    int isolve = 1;
+    int ifunc = 0;
+    if (notran) {
+        if (*ijob >= 3) {
+            ifunc = *ijob - 2;
+            LAPACK(zlaset)("F", m, n, ZERO, ZERO, C, ldC);
+            LAPACK(zlaset)("F", m, n, ZERO, ZERO, F, ldF);
+        } else if (*ijob >= 1)
+            isolve = 2;
+    }
+
+    double scale2;
+    int iround;
+    for (iround = 1; iround <= isolve; iround++) {
+        *scale = 1;
+        double dscale = 0;
+        double dsum = 1;
+        RELAPACK_ztgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, info);
+        if (dscale != 0) {
+            if (*ijob == 1 || *ijob == 3)
+                *dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum));
+            else
+                *dif = sqrt(*m * *n) / (dscale * sqrt(dsum));
+        }
+        if (isolve == 2) {
+            if (iround == 1) {
+                if (notran)
+                    ifunc = *ijob;
+                scale2 = *scale;
+                LAPACK(zlacpy)("F", m, n, C, ldC, Work, m);
+                LAPACK(zlacpy)("F", m, n, F, ldF, Work + 2 * *m * *n, m);
+                LAPACK(zlaset)("F", m, n, ZERO, ZERO, C, ldC);
+                LAPACK(zlaset)("F", m, n, ZERO, ZERO, F, ldF);
+            } else {
+                LAPACK(zlacpy)("F", m, n, Work, m, C, ldC);
+                LAPACK(zlacpy)("F", m, n, Work + 2 * *m * *n, m, F, ldF);
+                *scale = scale2;
+            }
+        }
+    }
+}
+
+
+/** ztgsyl's recursive compute kernel */
+static void RELAPACK_ztgsyl_rec(
+    const char *trans, const int *ifunc, const int *m, const int *n,
+    const double *A, const int *ldA, const double *B, const int *ldB,
+    double *C, const int *ldC,
+    const double *D, const int *ldD, const double *E, const int *ldE,
+    double *F, const int *ldF,
+    double *scale, double *dsum, double *dscale,
+    int *info
+) {
+
+    if (*m <= MAX(CROSSOVER_ZTGSYL, 1) && *n <= MAX(CROSSOVER_ZTGSYL, 1)) {
+        // Unblocked
+        LAPACK(ztgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[] = { 1., 0. };
+    const double MONE[] = { -1., 0. };
+    const int iONE[] = { 1 };
+
+    // Outputs
+    double scale1[] = { 1., 0. };
+    double scale2[] = { 1., 0.
}; + int info1[] = { 0 }; + int info2[] = { 0 }; + + if (*m > *n) { + // Splitting + const int m1 = ZREC_SPLIT(*m); + const int m2 = *m - m1; + + // A_TL A_TR + // 0 A_BR + const double *const A_TL = A; + const double *const A_TR = A + 2 * *ldA * m1; + const double *const A_BR = A + 2 * *ldA * m1 + 2 * m1; + + // C_T + // C_B + double *const C_T = C; + double *const C_B = C + 2 * m1; + + // D_TL D_TR + // 0 D_BR + const double *const D_TL = D; + const double *const D_TR = D + 2 * *ldD * m1; + const double *const D_BR = D + 2 * *ldD * m1 + 2 * m1; + + // F_T + // F_B + double *const F_T = F; + double *const F_B = F + 2 * m1; + + if (*trans == 'N') { + // recursion(A_BR, B, C_B, D_BR, E, F_B) + RELAPACK_ztgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, info1); + // C_T = C_T - A_TR * C_B + BLAS(zgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC); + // F_T = F_T - D_TR * C_B + BLAS(zgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF); + // recursion(A_TL, B, C_T, D_TL, E, F_T) + RELAPACK_ztgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, info2); + // apply scale + if (scale2[0] != 1) { + LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info); + LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info); + } + } else { + // recursion(A_TL, B, C_T, D_TL, E, F_T) + RELAPACK_ztgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, info1); + // apply scale + if (scale1[0] != 1) + LAPACK(zlascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info); + // C_B = C_B - A_TR^H * C_T + BLAS(zgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC); + // C_B = C_B - D_TR^H * F_T + BLAS(zgemm)("C", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC); + // recursion(A_BR, B, C_B, D_BR, E, F_B) + RELAPACK_ztgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, info2); + // apply scale + if (scale2[0] != 1) { + LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info); + LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info); + } + } + } else { + // Splitting + const int n1 = ZREC_SPLIT(*n); + const int n2 = *n - n1; + + // B_TL B_TR + // 0 B_BR + const double *const B_TL = B; + const double *const B_TR = B + 2 * *ldB * n1; + const double *const B_BR = B + 2 * *ldB * n1 + 2 * n1; + + // C_L C_R + double *const C_L = C; + double *const C_R = C + 2 * *ldC * n1; + + // E_TL E_TR + // 0 E_BR + const double *const E_TL = E; + const double *const E_TR = E + 2 * *ldE * n1; + const double *const E_BR = E + 2 * *ldE * n1 + 2 * n1; + + // F_L F_R + double *const F_L = F; + double *const F_R = F + 2 * *ldF * n1; + + if (*trans == 'N') { + // recursion(A, B_TL, C_L, D, E_TL, F_L) + RELAPACK_ztgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, info1); + // C_R = C_R + F_L * B_TR + BLAS(zgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC); + // F_R = F_R + F_L * E_TR + BLAS(zgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF); + // recursion(A, B_BR, C_R, D, E_BR, F_R) + RELAPACK_ztgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, info2); + // apply scale + if (scale2[0] != 1) { + LAPACK(zlascl)("G", iONE, iONE, 
ONE, scale2, m, &n1, C_L, ldC, info); + LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info); + } + } else { + // recursion(A, B_BR, C_R, D, E_BR, F_R) + RELAPACK_ztgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, info1); + // apply scale + if (scale1[0] != 1) + LAPACK(zlascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info); + // F_L = F_L + C_R * B_TR + BLAS(zgemm)("N", "C", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF); + // F_L = F_L + F_R * E_TR + BLAS(zgemm)("N", "C", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF); + // recursion(A, B_TL, C_L, D, E_TL, F_L) + RELAPACK_ztgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, info2); + // apply scale + if (scale2[0] != 1) { + LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info); + LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info); + } + } + } + + *scale = scale1[0] * scale2[0]; + *info = info1[0] || info2[0]; +} diff --git a/relapack/src/ztrsyl.c b/relapack/src/ztrsyl.c new file mode 100644 index 000000000..82b2c8803 --- /dev/null +++ b/relapack/src/ztrsyl.c @@ -0,0 +1,163 @@ +#include "relapack.h" + +static void RELAPACK_ztrsyl_rec(const char *, const char *, const int *, + const int *, const int *, const double *, const int *, const double *, + const int *, double *, const int *, double *, int *); + + +/** ZTRSYL solves the complex Sylvester matrix equation. + * + * This routine is functionally equivalent to LAPACK's ztrsyl. + * For details on its interface, see + * http://www.netlib.org/lapack/explore-html/d1/d36/ztrsyl_8f.html + * */ +void RELAPACK_ztrsyl( + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, double *scale, + int *info +) { + + // Check arguments + const int notransA = LAPACK(lsame)(tranA, "N"); + const int ctransA = LAPACK(lsame)(tranA, "C"); + const int notransB = LAPACK(lsame)(tranB, "N"); + const int ctransB = LAPACK(lsame)(tranB, "C"); + *info = 0; + if (!ctransA && !notransA) + *info = -1; + else if (!ctransB && !notransB) + *info = -2; + else if (*isgn != 1 && *isgn != -1) + *info = -3; + else if (*m < 0) + *info = -4; + else if (*n < 0) + *info = -5; + else if (*ldA < MAX(1, *m)) + *info = -7; + else if (*ldB < MAX(1, *n)) + *info = -9; + else if (*ldC < MAX(1, *m)) + *info = -11; + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("ZTRSYL", &minfo); + return; + } + + // Clean char * arguments + const char cleantranA = notransA ? 'N' : 'C'; + const char cleantranB = notransB ? 'N' : 'C'; + + // Recursive kernel + RELAPACK_ztrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); +} + + +/** ztrsyl's recursive compute kernel */ +static void RELAPACK_ztrsyl_rec( + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, double *scale, + int *info +) { + + if (*m <= MAX(CROSSOVER_ZTRSYL, 1) && *n <= MAX(CROSSOVER_ZTRSYL, 1)) { + // Unblocked + RELAPACK_ztrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); + return; + } + + // Constants + const double ONE[] = { 1., 0. }; + const double MONE[] = { -1., 0. }; + const double MSGN[] = { -*isgn, 0. 
};
+    const int iONE[] = { 1 };
+
+    // Outputs
+    double scale1[] = { 1., 0. };
+    double scale2[] = { 1., 0. };
+    int info1[] = { 0 };
+    int info2[] = { 0 };
+
+    if (*m > *n) {
+        // Splitting
+        const int m1 = ZREC_SPLIT(*m);
+        const int m2 = *m - m1;
+
+        // A_TL A_TR
+        // 0    A_BR
+        const double *const A_TL = A;
+        const double *const A_TR = A + 2 * *ldA * m1;
+        const double *const A_BR = A + 2 * *ldA * m1 + 2 * m1;
+
+        // C_T
+        // C_B
+        double *const C_T = C;
+        double *const C_B = C + 2 * m1;
+
+        if (*tranA == 'N') {
+            // recursion(A_BR, B, C_B)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1);
+            // C_T = C_T - A_TR * C_B
+            BLAS(zgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC);
+            // recursion(A_TL, B, C_T)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info);
+        } else {
+            // recursion(A_TL, B, C_T)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1);
+            // C_B = C_B - A_TR' * C_T
+            BLAS(zgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC);
+            // recursion(A_BR, B, C_B)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_B, ldC, info);
+        }
+    } else {
+        // Splitting
+        const int n1 = ZREC_SPLIT(*n);
+        const int n2 = *n - n1;
+
+        // B_TL B_TR
+        // 0    B_BR
+        const double *const B_TL = B;
+        const double *const B_TR = B + 2 * *ldB * n1;
+        const double *const B_BR = B + 2 * *ldB * n1 + 2 * n1;
+
+        // C_L C_R
+        double *const C_L = C;
+        double *const C_R = C + 2 * *ldC * n1;
+
+        if (*tranB == 'N') {
+            // recursion(A, B_TL, C_L)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1);
+            // C_R = C_R -/+ C_L * B_TR
+            BLAS(zgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC);
+            // recursion(A, B_BR, C_R)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info);
+        } else {
+            // recursion(A, B_BR, C_R)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1);
+            // C_L = C_L -/+ C_R * B_TR'
+            BLAS(zgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC);
+            // recursion(A, B_TL, C_L)
+            RELAPACK_ztrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2);
+            // apply scale
+            if (scale2[0] != 1)
+                LAPACK(zlascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info);
+        }
+    }
+
+    *scale = scale1[0] * scale2[0];
+    *info = info1[0] || info2[0];
+}
diff --git a/relapack/src/ztrsyl_rec2.c b/relapack/src/ztrsyl_rec2.c
new file mode 100644
index 000000000..526ab097c
--- /dev/null
+++ b/relapack/src/ztrsyl_rec2.c
@@ -0,0 +1,394 @@
+/* -- translated by f2c (version 20100827).
+ You must link the resulting object file with libf2c: + on Microsoft Windows system, link with libf2c.lib; + on Linux or Unix systems, link with .../path/to/libf2c.a -lm + or, if you install libf2c.a in a standard place, with -lf2c -lm + -- in that order, at the end of the command line, as in + cc *.o -lf2c -lm + Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., + + http://www.netlib.org/f2c/libf2c.zip +*/ + +#include "../config.h" +#include "f2c.h" + +#if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES +doublecomplex zdotu_fun(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy) { + extern void zdotu_(doublecomplex *, int *, doublecomplex *, int *, doublecomplex *, int *); + doublecomplex result; + zdotu_(&result, n, x, incx, y, incy); + return result; +} +#define zdotu_ zdotu_fun + +doublecomplex zdotc_fun(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy) { + extern void zdotc_(doublecomplex *, int *, doublecomplex *, int *, doublecomplex *, int *); + doublecomplex result; + zdotc_(&result, n, x, incx, y, incy); + return result; +} +#define zdotc_ zdotc_fun +#endif + +#if LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES +doublecomplex zladiv_fun(doublecomplex *a, doublecomplex *b) { + extern void zladiv_(doublecomplex *, doublecomplex *, doublecomplex *); + doublecomplex result; + zladiv_(&result, a, b); + return result; +} +#define zladiv_ zladiv_fun +#endif + +/* Table of constant values */ + +static int c__1 = 1; + +/** RELAPACK_ZTRSYL_REC2 solves the complex Sylvester matrix equation (unblocked algorithm) + * + * This routine is an exact copy of LAPACK's ztrsyl. + * It serves as an unblocked kernel in the recursive algorithms. + * */ +/* Subroutine */ void RELAPACK_ztrsyl_rec2(char *trana, char *tranb, int + *isgn, int *m, int *n, doublecomplex *a, int *lda, + doublecomplex *b, int *ldb, doublecomplex *c__, int *ldc, + double *scale, int *info, ftnlen trana_len, ftnlen tranb_len) +{ + /* System generated locals */ + int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + i__3, i__4; + double d__1, d__2; + doublecomplex z__1, z__2, z__3, z__4; + + /* Builtin functions */ + double d_imag(doublecomplex *); + void d_cnjg(doublecomplex *, doublecomplex *); + + /* Local variables */ + static int j, k, l; + static doublecomplex a11; + static double db; + static doublecomplex x11; + static double da11; + static doublecomplex vec; + static double dum[1], eps, sgn, smin; + static doublecomplex suml, sumr; + extern int lsame_(char *, char *, ftnlen, ftnlen); + /* Double Complex */ doublecomplex zdotc_(int *, + doublecomplex *, int *, doublecomplex *, int *), zdotu_( + int *, doublecomplex *, int *, + doublecomplex *, int *); + extern /* Subroutine */ int dlabad_(double *, double *); + extern double dlamch_(char *, ftnlen); + static double scaloc; + extern /* Subroutine */ int xerbla_(char *, int *, ftnlen); + extern double zlange_(char *, int *, int *, doublecomplex *, + int *, double *, ftnlen); + static double bignum; + extern /* Subroutine */ int zdscal_(int *, double *, + doublecomplex *, int *); + /* Double Complex */ doublecomplex zladiv_(doublecomplex *, + doublecomplex *); + static int notrna, notrnb; + static double smlnum; + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1; + c__ -= c_offset; + + /* Function Body */ + notrna = lsame_(trana, "N", (ftnlen)1, (ftnlen)1); + notrnb = lsame_(tranb, "N", (ftnlen)1, 
(ftnlen)1); + *info = 0; + if (! notrna && ! lsame_(trana, "C", (ftnlen)1, (ftnlen)1)) { + *info = -1; + } else if (! notrnb && ! lsame_(tranb, "C", (ftnlen)1, (ftnlen)1)) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < max(1,*m)) { + *info = -7; + } else if (*ldb < max(1,*n)) { + *info = -9; + } else if (*ldc < max(1,*m)) { + *info = -11; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("ZTRSY2", &i__1, (ftnlen)6); + return; + } + *scale = 1.; + if (*m == 0 || *n == 0) { + return; + } + eps = dlamch_("P", (ftnlen)1); + smlnum = dlamch_("S", (ftnlen)1); + bignum = 1. / smlnum; + dlabad_(&smlnum, &bignum); + smlnum = smlnum * (double) (*m * *n) / eps; + bignum = 1. / smlnum; +/* Computing MAX */ + d__1 = smlnum, d__2 = eps * zlange_("M", m, m, &a[a_offset], lda, dum, ( + ftnlen)1), d__1 = max(d__1,d__2), d__2 = eps * zlange_("M", n, n, + &b[b_offset], ldb, dum, (ftnlen)1); + smin = max(d__1,d__2); + sgn = (double) (*isgn); + if (notrna && notrnb) { + i__1 = *n; + for (l = 1; l <= i__1; ++l) { + for (k = *m; k >= 1; --k) { + i__2 = *m - k; +/* Computing MIN */ + i__3 = k + 1; +/* Computing MIN */ + i__4 = k + 1; + z__1 = zdotu_(&i__2, &a[k + min(i__3,*m) * a_dim1], lda, &c__[ + min(i__4,*m) + l * c_dim1], &c__1); + suml.r = z__1.r, suml.i = z__1.i; + i__2 = l - 1; + z__1 = zdotu_(&i__2, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1] + , &c__1); + sumr.r = z__1.r, sumr.i = z__1.i; + i__2 = k + l * c_dim1; + z__3.r = sgn * sumr.r, z__3.i = sgn * sumr.i; + z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i; + z__1.r = c__[i__2].r - z__2.r, z__1.i = c__[i__2].i - z__2.i; + vec.r = z__1.r, vec.i = z__1.i; + scaloc = 1.; + i__2 = k + k * a_dim1; + i__3 = l + l * b_dim1; + z__2.r = sgn * b[i__3].r, z__2.i = sgn * b[i__3].i; + z__1.r = a[i__2].r + z__2.r, z__1.i = a[i__2].i + z__2.i; + a11.r = z__1.r, a11.i = z__1.i; + da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs( + d__2)); + if (da11 <= smin) { + a11.r = smin, a11.i = 0.; + da11 = smin; + *info = 1; + } + db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs( + d__2)); + if (da11 < 1. && db > 1.) { + if (db > bignum * da11) { + scaloc = 1. / db; + } + } + z__3.r = scaloc, z__3.i = 0.; + z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r * + z__3.i + vec.i * z__3.r; + z__1 = zladiv_(&z__2, &a11); + x11.r = z__1.r, x11.i = z__1.i; + if (scaloc != 1.) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L10: */ + } + *scale *= scaloc; + } + i__2 = k + l * c_dim1; + c__[i__2].r = x11.r, c__[i__2].i = x11.i; +/* L20: */ + } +/* L30: */ + } + } else if (! 
notrna && notrnb) { + i__1 = *n; + for (l = 1; l <= i__1; ++l) { + i__2 = *m; + for (k = 1; k <= i__2; ++k) { + i__3 = k - 1; + z__1 = zdotc_(&i__3, &a[k * a_dim1 + 1], &c__1, &c__[l * + c_dim1 + 1], &c__1); + suml.r = z__1.r, suml.i = z__1.i; + i__3 = l - 1; + z__1 = zdotu_(&i__3, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1] + , &c__1); + sumr.r = z__1.r, sumr.i = z__1.i; + i__3 = k + l * c_dim1; + z__3.r = sgn * sumr.r, z__3.i = sgn * sumr.i; + z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i; + z__1.r = c__[i__3].r - z__2.r, z__1.i = c__[i__3].i - z__2.i; + vec.r = z__1.r, vec.i = z__1.i; + scaloc = 1.; + d_cnjg(&z__2, &a[k + k * a_dim1]); + i__3 = l + l * b_dim1; + z__3.r = sgn * b[i__3].r, z__3.i = sgn * b[i__3].i; + z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i; + a11.r = z__1.r, a11.i = z__1.i; + da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs( + d__2)); + if (da11 <= smin) { + a11.r = smin, a11.i = 0.; + da11 = smin; + *info = 1; + } + db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs( + d__2)); + if (da11 < 1. && db > 1.) { + if (db > bignum * da11) { + scaloc = 1. / db; + } + } + z__3.r = scaloc, z__3.i = 0.; + z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r * + z__3.i + vec.i * z__3.r; + z__1 = zladiv_(&z__2, &a11); + x11.r = z__1.r, x11.i = z__1.i; + if (scaloc != 1.) { + i__3 = *n; + for (j = 1; j <= i__3; ++j) { + zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L40: */ + } + *scale *= scaloc; + } + i__3 = k + l * c_dim1; + c__[i__3].r = x11.r, c__[i__3].i = x11.i; +/* L50: */ + } +/* L60: */ + } + } else if (! notrna && ! notrnb) { + for (l = *n; l >= 1; --l) { + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + i__2 = k - 1; + z__1 = zdotc_(&i__2, &a[k * a_dim1 + 1], &c__1, &c__[l * + c_dim1 + 1], &c__1); + suml.r = z__1.r, suml.i = z__1.i; + i__2 = *n - l; +/* Computing MIN */ + i__3 = l + 1; +/* Computing MIN */ + i__4 = l + 1; + z__1 = zdotc_(&i__2, &c__[k + min(i__3,*n) * c_dim1], ldc, &b[ + l + min(i__4,*n) * b_dim1], ldb); + sumr.r = z__1.r, sumr.i = z__1.i; + i__2 = k + l * c_dim1; + d_cnjg(&z__4, &sumr); + z__3.r = sgn * z__4.r, z__3.i = sgn * z__4.i; + z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i; + z__1.r = c__[i__2].r - z__2.r, z__1.i = c__[i__2].i - z__2.i; + vec.r = z__1.r, vec.i = z__1.i; + scaloc = 1.; + i__2 = k + k * a_dim1; + i__3 = l + l * b_dim1; + z__3.r = sgn * b[i__3].r, z__3.i = sgn * b[i__3].i; + z__2.r = a[i__2].r + z__3.r, z__2.i = a[i__2].i + z__3.i; + d_cnjg(&z__1, &z__2); + a11.r = z__1.r, a11.i = z__1.i; + da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs( + d__2)); + if (da11 <= smin) { + a11.r = smin, a11.i = 0.; + da11 = smin; + *info = 1; + } + db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs( + d__2)); + if (da11 < 1. && db > 1.) { + if (db > bignum * da11) { + scaloc = 1. / db; + } + } + z__3.r = scaloc, z__3.i = 0.; + z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r * + z__3.i + vec.i * z__3.r; + z__1 = zladiv_(&z__2, &a11); + x11.r = z__1.r, x11.i = z__1.i; + if (scaloc != 1.) { + i__2 = *n; + for (j = 1; j <= i__2; ++j) { + zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1); +/* L70: */ + } + *scale *= scaloc; + } + i__2 = k + l * c_dim1; + c__[i__2].r = x11.r, c__[i__2].i = x11.i; +/* L80: */ + } +/* L90: */ + } + } else if (notrna && ! 
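+                /* remaining case: A not transposed, B conjugate transposed;
+                   the unknowns are computed bottom-up and right-to-left */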
notrnb) {
+    for (l = *n; l >= 1; --l) {
+        for (k = *m; k >= 1; --k) {
+            i__1 = *m - k;
+/* Computing MIN */
+            i__2 = k + 1;
+/* Computing MIN */
+            i__3 = k + 1;
+            z__1 = zdotu_(&i__1, &a[k + min(i__2,*m) * a_dim1], lda, &c__[
+                    min(i__3,*m) + l * c_dim1], &c__1);
+            suml.r = z__1.r, suml.i = z__1.i;
+            i__1 = *n - l;
+/* Computing MIN */
+            i__2 = l + 1;
+/* Computing MIN */
+            i__3 = l + 1;
+            z__1 = zdotc_(&i__1, &c__[k + min(i__2,*n) * c_dim1], ldc, &b[
+                    l + min(i__3,*n) * b_dim1], ldb);
+            sumr.r = z__1.r, sumr.i = z__1.i;
+            i__1 = k + l * c_dim1;
+            d_cnjg(&z__4, &sumr);
+            z__3.r = sgn * z__4.r, z__3.i = sgn * z__4.i;
+            z__2.r = suml.r + z__3.r, z__2.i = suml.i + z__3.i;
+            z__1.r = c__[i__1].r - z__2.r, z__1.i = c__[i__1].i - z__2.i;
+            vec.r = z__1.r, vec.i = z__1.i;
+            scaloc = 1.;
+            i__1 = k + k * a_dim1;
+            d_cnjg(&z__3, &b[l + l * b_dim1]);
+            z__2.r = sgn * z__3.r, z__2.i = sgn * z__3.i;
+            z__1.r = a[i__1].r + z__2.r, z__1.i = a[i__1].i + z__2.i;
+            a11.r = z__1.r, a11.i = z__1.i;
+            da11 = (d__1 = a11.r, abs(d__1)) + (d__2 = d_imag(&a11), abs(
+                    d__2));
+            if (da11 <= smin) {
+                a11.r = smin, a11.i = 0.;
+                da11 = smin;
+                *info = 1;
+            }
+            db = (d__1 = vec.r, abs(d__1)) + (d__2 = d_imag(&vec), abs(
+                    d__2));
+            if (da11 < 1. && db > 1.) {
+                if (db > bignum * da11) {
+                    scaloc = 1. / db;
+                }
+            }
+            z__3.r = scaloc, z__3.i = 0.;
+            z__2.r = vec.r * z__3.r - vec.i * z__3.i, z__2.i = vec.r *
+                    z__3.i + vec.i * z__3.r;
+            z__1 = zladiv_(&z__2, &a11);
+            x11.r = z__1.r, x11.i = z__1.i;
+            if (scaloc != 1.) {
+                i__1 = *n;
+                for (j = 1; j <= i__1; ++j) {
+                    zdscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
+/* L100: */
+                }
+                *scale *= scaloc;
+            }
+            i__1 = k + l * c_dim1;
+            c__[i__1].r = x11.r, c__[i__1].i = x11.i;
+/* L110: */
+        }
+/* L120: */
+    }
+    }
+    return;
+}
diff --git a/relapack/src/ztrtri.c b/relapack/src/ztrtri.c
new file mode 100644
index 000000000..ac9fe7bd4
--- /dev/null
+++ b/relapack/src/ztrtri.c
@@ -0,0 +1,107 @@
+#include "relapack.h"
+
+static void RELAPACK_ztrtri_rec(const char *, const char *, const int *,
+    double *, const int *, int *);
+
+
+/** ZTRTRI computes the inverse of a complex upper or lower triangular matrix A.
+ *
+ * This routine is functionally equivalent to LAPACK's ztrtri.
+ * For details on its interface, see
+ * http://www.netlib.org/lapack/explore-html/d1/d0e/ztrtri_8f.html
+ * */
+void RELAPACK_ztrtri(
+    const char *uplo, const char *diag, const int *n,
+    double *A, const int *ldA,
+    int *info
+) {
+
+    // Check arguments
+    const int lower = LAPACK(lsame)(uplo, "L");
+    const int upper = LAPACK(lsame)(uplo, "U");
+    const int nounit = LAPACK(lsame)(diag, "N");
+    const int unit = LAPACK(lsame)(diag, "U");
+    *info = 0;
+    if (!lower && !upper)
+        *info = -1;
+    else if (!nounit && !unit)
+        *info = -2;
+    else if (*n < 0)
+        *info = -3;
+    else if (*ldA < MAX(1, *n))
+        *info = -5;
+    if (*info) {
+        const int minfo = -*info;
+        LAPACK(xerbla)("ZTRTRI", &minfo);
+        return;
+    }
+
+    // Clean char * arguments
+    const char cleanuplo = lower ? 'L' : 'U';
+    const char cleandiag = nounit ? 'N' : 'U';
+
+    // check for singularity
+    if (nounit) {
+        int i;
+        for (i = 0; i < *n; i++)
+            if (A[2 * (i + *ldA * i)] == 0 && A[2 * (i + *ldA * i) + 1] == 0) {
+                *info = i + 1;  // 1-based index of the zero diagonal element, as in LAPACK
+                return;
+            }
+    }
+
+    // Recursive kernel
+    RELAPACK_ztrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info);
+}
+
+
+/** ztrtri's recursive compute kernel */
+static void RELAPACK_ztrtri_rec(
+    const char *uplo, const char *diag, const int *n,
+    double *A, const int *ldA,
+    int *info
+){
+
+    if (*n <= MAX(CROSSOVER_ZTRTRI, 1)) {
+        // Unblocked
+        LAPACK(ztrti2)(uplo, diag, n, A, ldA, info);
+        return;
+    }
+
+    // Constants
+    const double ONE[] = { 1. };
+    const double MONE[] = { -1. };
+
+    // Splitting
+    const int n1 = ZREC_SPLIT(*n);
+    const int n2 = *n - n1;
+
+    // A_TL A_TR
+    // A_BL A_BR
+    double *const A_TL = A;
+    double *const A_TR = A + 2 * *ldA * n1;
+    double *const A_BL = A + 2 * n1;
+    double *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
+
+    // recursion(A_TL)
+    RELAPACK_ztrtri_rec(uplo, diag, &n1, A_TL, ldA, info);
+    if (*info)
+        return;
+
+    if (*uplo == 'L') {
+        // A_BL = - A_BL * A_TL
+        BLAS(ztrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA);
+        // A_BL = A_BR \ A_BL
+        BLAS(ztrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA);
+    } else {
+        // A_TR = - A_TL * A_TR
+        BLAS(ztrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA);
+        // A_TR = A_TR / A_BR
+        BLAS(ztrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA);
+    }
+
+    // recursion(A_BR)
+    RELAPACK_ztrtri_rec(uplo, diag, &n2, A_BR, ldA, info);
+    if (*info)
+        *info += n1;
+}
diff --git a/relapack/test/README.md b/relapack/test/README.md
new file mode 100644
index 000000000..48434b3cf
--- /dev/null
+++ b/relapack/test/README.md
@@ -0,0 +1,48 @@
+ReLAPACK Test Suite
+===================
+This test suite compares ReLAPACK's recursive routines with LAPACK's compute
+routines in terms of accuracy: for each test-case, we execute both ReLAPACK's
+and LAPACK's routine on the same data and consider the numerical difference
+between the two solutions.
+
+This difference is computed as the maximum error across all elements of the
+routine's outputs, where the error for each element is the minimum of the
+absolute error and the relative error (with LAPACK as the reference). If the
+error is below the error bound configured in `config.h` (default: 1e-4 for
+single precision and 1e-13 for double precision), the test-case is considered
+passed.
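[Editor's note: to make the error metric concrete, here is a small sketch of the per-element error as implemented by `d2vecerr` in `test/util.c` later in this patch; the helper name `elem_err` is hypothetical, and the overall `error` is the maximum of this quantity over all elements. For complex data, the absolute values become the moduli of the element-wise differences.]

    #include <math.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    /* hypothetical helper: per-element error, the smaller of the absolute
       difference and the difference relative to the LAPACK reference y */
    static double elem_err(double x, double y) {
        double nom = fabs(x - y);   /* absolute difference        */
        double den = fabs(y);       /* magnitude of the reference */
        return (den > 0) ? MIN(nom, nom / den) : nom;
    }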
+
+For each routine the test-cases cover a variety of input argument combinations
+to ensure that ReLAPACK's routines match the functionality of LAPACK for all
+use cases.
+
+The matrix size for all experiments (default: 100) can also be specified in
+`config.h`.
+
+
+Implementation
+--------------
+`test.h` provides the framework for our tests: its macros generalize the tests
+for each operation into a single file covering all data-types.
+Such a file is structured as follows:
+
+ * All matrices required by the test-cases are declared globally. For each
+   matrix, an array of two pointers is declared; one for the matrix copy passed
+   to ReLAPACK and one for the copy passed to LAPACK.
+
+ * `tests()` contains the main control flow: it allocates (and later frees) the
+   copies of the globally declared matrices. It then defines the macro
+   `ROUTINE` to contain the name of the currently tested routine and uses the
+   macro `TEST` to perform the test-cases. `TEST` receives the arguments of the
+   routine, where matrices of which ReLAPACK and LAPACK each receive a copy are
+   indexed with `i`. (Example: `TEST("L", &n, A[i], &n, &info);`)
+
+ * The macro `TEST` first calls `pre()`, which initializes all relevant
+   matrices, then executes the ReLAPACK algorithm on the matrices with `i` =
+   `0` and then the LAPACK counterpart with `i` = `1`. It then calls `post()`,
+   which computes the difference between the results, storing it in `error`.
+   Finally, the error is printed and compared to the error bound.
+
+If all test-cases stay within the error bound, the program returns `0`;
+otherwise it returns `1`, indicating an error.
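[Editor's note: for concreteness, this is `test/xpotrf.c` from this patch, lightly annotated — a complete test file for the recursive Cholesky factorization. `datatype`, `xmalloc`, `x2matgen`, `x2vecerr`, `XPREF`, and `TEST` all come from `test.h`; depending on `DT_PREFIX`, `XPREF(potrf)` expands to `spotrf`, `dpotrf`, `cpotrf`, or `zpotrf`, so one file covers all four data-types.]

    #include "test.h"

    datatype *A[2];   // A[0]: copy passed to ReLAPACK, A[1]: copy passed to LAPACK
    int info;

    // setup: generate two identical random matrices with a dominated diagonal
    void pre() {
        x2matgen(n, n, A[0], A[1]);
    }

    // error: maximum element-wise difference between the two results
    void post() {
        error = x2vecerr(n * n, A[0], A[1]);
    }

    void tests() {
        A[0] = xmalloc(n * n);
        A[1] = xmalloc(n * n);

        #define ROUTINE XPREF(potrf)

        TEST("L", &n, A[i], &n, &info);   // lower triangular case
        TEST("U", &n, A[i], &n, &info);   // upper triangular case

        free(A[0]);
        free(A[1]);
    }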
diff --git a/relapack/test/config.h b/relapack/test/config.h
new file mode 100644
index 000000000..ab06a2fff
--- /dev/null
+++ b/relapack/test/config.h
@@ -0,0 +1,13 @@
+#ifndef TEST_CONFIG_H
+#define TEST_CONFIG_H
+
+// error bound for single and single complex routines
+#define SINGLE_ERR_BOUND 1e-4
+
+// error bound for double and double complex routines
+#define DOUBLE_ERR_BOUND 1e-13
+
+// size of test matrices
+#define TEST_SIZE 100
+
+#endif /* TEST_CONFIG_H */
diff --git a/relapack/test/lapack.h b/relapack/test/lapack.h
new file mode 100644
index 000000000..80f5c419e
--- /dev/null
+++ b/relapack/test/lapack.h
@@ -0,0 +1,64 @@
+#ifndef LAPACK_H2
+#define LAPACK_H2
+
+#include "../config.h"
+
+void LAPACK(slauum)(const char *, const int *, float *, const int *, int *);
+void LAPACK(dlauum)(const char *, const int *, double *, const int *, int *);
+void LAPACK(clauum)(const char *, const int *, float *, const int *, int *);
+void LAPACK(zlauum)(const char *, const int *, double *, const int *, int *);
+
+void LAPACK(strtri)(const char *, const char *, const int *, float *, const int *, int *);
+void LAPACK(dtrtri)(const char *, const char *, const int *, double *, const int *, int *);
+void LAPACK(ctrtri)(const char *, const char *, const int *, float *, const int *, int *);
+void LAPACK(ztrtri)(const char *, const char *, const int *, double *, const int *, int *);
+
+void LAPACK(spotrf)(const char *, const int *, float *, const int *, int *);
+void LAPACK(dpotrf)(const char *, const int *, double *, const int *, int *);
+void LAPACK(cpotrf)(const char *, const int *, float *, const int *, int *);
+void LAPACK(zpotrf)(const char *, const int *, double *, const int *, int *);
+
+void LAPACK(spbtrf)(const char *, const int *, const int *, float *, const int *, int *);
+void LAPACK(dpbtrf)(const char *, const int *, const int *, double *, const int *, int *);
+void LAPACK(cpbtrf)(const char *, const int *, const int *, float *, const int *, int *);
+void LAPACK(zpbtrf)(const char *, const int *, const int *, double *, const int *, int *);
+
+void LAPACK(ssytrf)(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
+void LAPACK(dsytrf)(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
+void LAPACK(csytrf)(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
+void LAPACK(chetrf)(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
+void LAPACK(zsytrf)(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
+void LAPACK(zhetrf)(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
+void LAPACK(ssytrf_rook)(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
+void LAPACK(dsytrf_rook)(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
+void LAPACK(csytrf_rook)(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
+void LAPACK(chetrf_rook)(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
+void LAPACK(zsytrf_rook)(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
+void LAPACK(zhetrf_rook)(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
+
+void LAPACK(sgetrf)(const int *, const int *, float *, const int *, int *, int *);
+void LAPACK(dgetrf)(const int *, const int *, double *, const int *, int *, int *);
+void LAPACK(cgetrf)(const int *, const int *, float *, const int *, int *, int *);
+void LAPACK(zgetrf)(const int *, const int *, double *, const int *, int *, int *);
+
+void LAPACK(sgbtrf)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *);
+void LAPACK(dgbtrf)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *);
+void LAPACK(cgbtrf)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *);
+void LAPACK(zgbtrf)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *);
+
+void LAPACK(ssygst)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *);
+void LAPACK(dsygst)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *);
+void LAPACK(chegst)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *);
+void LAPACK(zhegst)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *);
+
+void LAPACK(strsyl)(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *);
+void LAPACK(dtrsyl)(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *);
+void LAPACK(ctrsyl)(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *);
+void LAPACK(ztrsyl)(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *);
+
+void LAPACK(stgsyl)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *);
+void LAPACK(dtgsyl)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *);
+void LAPACK(ctgsyl)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *);
+void LAPACK(ztgsyl)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const
int *, int *, int *); + +#endif /* LAPACK_H2 */ diff --git a/relapack/test/test.h b/relapack/test/test.h new file mode 100644 index 000000000..24089f3a8 --- /dev/null +++ b/relapack/test/test.h @@ -0,0 +1,136 @@ +#ifndef TEST_H +#define TEST_H + +#include "../config.h" +#include "config.h" + +#if BLAS_UNDERSCORE +#define BLAS(routine) routine ## _ +#else +#define BLAS(routine) routine +#endif + +#if LAPACK_UNDERSCORE +#define LAPACK(routine) routine ## _ +#else +#define LAPACK(routine) routine +#endif + +#include "../inc/relapack.h" +#include "lapack.h" +#include "util.h" +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +// some name mangling macros +#define CAT(A, B) A ## B +#define XCAT(A, B) CAT(A, B) +#define XLAPACK(X) LAPACK(X) +#define XRELAPACK(X) XCAT(RELAPACK_, X) +#define STR(X) #X +#define XSTR(X) STR(X) + +// default setup and error computation names: pre() and post() +#define PRE pre +#define POST post + +// TEST macro: +// run setup (pre()), ReLAPACK routine (i = 0), LAPACK routine (i = 1), compute +// error (post()), check error bound, and print setup and error +#define TEST(...) \ + PRE(); \ + i = 0; \ + XRELAPACK(ROUTINE)(__VA_ARGS__); \ + i = 1; \ + XLAPACK(ROUTINE)(__VA_ARGS__); \ + POST(); \ + fail |= error > ERR_BOUND; \ + printf("%s(%s)\t%g\n", XSTR(ROUTINE), #__VA_ARGS__, error); + +// generalized datatype treatment: DT_PREFIX determines the type s, d, c, or z +#define XPREF(A) XCAT(DT_PREFIX, A) + +// matrix generation and error computation routines +#define x2matgen XPREF(2matgen) +#define x2vecerr XPREF(2vecerr) + +// error bounds +#define ERR_BOUND XPREF(ERR_BOUND_) +#define sERR_BOUND_ SINGLE_ERR_BOUND +#define dERR_BOUND_ DOUBLE_ERR_BOUND +#define cERR_BOUND_ SINGLE_ERR_BOUND +#define zERR_BOUND_ DOUBLE_ERR_BOUND + +// C datatypes +#define datatype XPREF(datatype_) +#define sdatatype_ float +#define ddatatype_ double +#define cdatatype_ float +#define zdatatype_ double + +// number of C datatype elements per element +#define x1 XPREF(DT_MULT) +#define sDT_MULT 1 +#define dDT_MULT 1 +#define cDT_MULT 2 +#define zDT_MULT 2 + +// typed allocations +#define xmalloc XPREF(malloc) +#define imalloc(S) malloc((S) * sizeof(int)) +#define smalloc(S) malloc((S) * sizeof(float)) +#define dmalloc(S) malloc((S) * sizeof(double)) +#define cmalloc(S) malloc((S) * 2 * sizeof(float)) +#define zmalloc(S) malloc((S) * 2 * sizeof(double)) + +// transpositions +#define xCTRANS XPREF(CTRANS) +#define sCTRANS "T" +#define dCTRANS "T" +#define cCTRANS "C" +#define zCTRANS "C" + +// some constants +#define MONE XPREF(MONE) +const float sMONE[] = { -1. }; +const double dMONE[] = { -1. }; +const float cMONE[] = { -1., 0. }; +const double zMONE[] = { -1., 0. }; + +#define ZERO XPREF(ZERO) +const float sZERO[] = { 0. }; +const double dZERO[] = { 0. }; +const float cZERO[] = { 0., 0. }; +const double zZERO[] = { 0., 0. }; + +#define ONE XPREF(ONE) +const float sONE[] = { 1. }; +const double dONE[] = { 1. }; +const float cONE[] = { 1., 0. }; +const double zONE[] = { 1., 0. 
};
+
+const int iMONE[] = { -1 };
+const int iZERO[] = { 0 };
+const int iONE[] = { 1 };
+const int iTWO[] = { 2 };
+const int iTHREE[] = { 3 };
+const int iFOUR[] = { 4 };
+
+void tests();
+
+// global variables (used in tests(), pre(), and post())
+int i, n, n2, fail;
+double error;
+
+int main(int argc, char* argv[]) {
+    n = TEST_SIZE;
+    n2 = (3 * n) / 4;
+    fail = 0;
+
+    tests();
+
+    return fail;
+}
+
+#endif /* TEST_H */
diff --git a/relapack/test/util.c b/relapack/test/util.c
new file mode 100644
index 000000000..e0fca3eec
--- /dev/null
+++ b/relapack/test/util.c
@@ -0,0 +1,116 @@
+#include "util.h"
+#include <stdlib.h>
+#include <time.h>
+#include <math.h>
+
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+///////////////////////
+// matrix generation //
+///////////////////////
+// Each routine x2matgen is passed the size (m, n) of the desired matrix and
+// generates two copies of such a matrix in its output arguments A and B.
+// The generated matrices are filled with random entries in [0, 1) (+i*[0, 1)
+// in the complex case). Then m is added to the diagonal; this is numerically
+// favorable for routines working with triangular and symmetric matrices. For
+// the same reason the imaginary part of the diagonal is set to 0.
+
+void s2matgen(const int m, const int n, float *A, float *B) {
+    srand(time(NULL) + (size_t) A);
+    int i, j;
+    for (i = 0; i < m; i++)
+        for (j = 0; j < n; j++)
+            A[i + m * j] = B[i + m * j] = (float) rand() / RAND_MAX + m * (i == j);
+}
+
+void d2matgen(const int m, const int n, double *A, double *B) {
+    srand(time(NULL) + (size_t) A);
+    int i, j;
+    for (i = 0; i < m; i++)
+        for (j = 0; j < n; j++)
+            A[i + m * j] = B[i + m * j] = (double) rand() / RAND_MAX + m * (i == j);
+}
+
+void c2matgen(const int m, const int n, float *A, float *B) {
+    srand(time(NULL) + (size_t) A);
+    int i, j;
+    for (i = 0; i < m; i++)
+        for (j = 0; j < n; j++) {
+            A[2 * (i + m * j)] = B[2 * (i + m * j)] = (float) rand() / RAND_MAX + m * (i == j);
+            A[2 * (i + m * j) + 1] = B[2 * (i + m * j) + 1] = ((float) rand() / RAND_MAX) * (i != j);
+        }
+}
+
+void z2matgen(const int m, const int n, double *A, double *B) {
+    srand(time(NULL) + (size_t) A);
+    int i, j;
+    for (i = 0; i < m; i++)
+        for (j = 0; j < n; j++) {
+            A[2 * (i + m * j)] = B[2 * (i + m * j)] = (double) rand() / RAND_MAX + m * (i == j);
+            A[2 * (i + m * j) + 1] = B[2 * (i + m * j) + 1] = ((double) rand() / RAND_MAX) * (i != j);
+        }
+}
+
+////////////////////////
+// error computations //
+////////////////////////
+// Each routine x2vecerr is passed a vector length n and two vectors x and y.
+// It returns the maximum element-wise error between these two vectors.
+// This error is the minimum of the absolute difference and the relative
+// difference with respect to y.
+
+double i2vecerr(const int n, const int *x, const int *y) {
+    double error = 0;
+    int i;
+    for (i = 0; i < n; i++) {
+        double nom = abs(x[i] - y[i]);
+        double den = abs(y[i]);
+        error = MAX(error, (den > 0) ? MIN(nom, nom / den) : nom);
+    }
+    return error;
+}
+
+double s2vecerr(const int n, const float *x, const float *y) {
+    float error = 0;
+    int i;
+    for (i = 0; i < n; i++) {
+        double nom = fabs((double) x[i] - y[i]);
+        double den = fabs(y[i]);
+        error = MAX(error, (den > 0) ?
MIN(nom, nom / den) : nom); + } + return error; +} + +double d2vecerr(const int n, const double *x, const double *y) { + double error = 0; + int i; + for (i = 0; i < n; i++) { + double nom = fabs(x[i] - y[i]); + double den = fabs(y[i]); + error = MAX(error, (den > 0) ? MIN(nom, nom / den) : nom); + } + return error; +} + +double c2vecerr(const int n, const float *x, const float *y) { + double error = 0; + int i; + for (i = 0; i < n; i++) { + double nom = sqrt(((double) x[2 * i] - y[2 * i]) * ((double) x[2 * i] - y[2 * i]) + ((double) x[2 * i + 1] - y[2 * i + 1]) * ((double) x[2 * i + 1] - y[2 * i + 1])); + double den = sqrt((double) y[2 * i] * y[2 * i] + (double) y[2 * i + 1] * y[2 * i + 1]); + error = MAX(error, (den > 0) ? MIN(nom, nom / den) : nom); + } + return error; +} + +double z2vecerr(const int n, const double *x, const double *y) { + double error = 0; + int i; + for (i = 0; i < n; i++) { + double nom = sqrt((x[2 * i] - y[2 * i]) * (x[2 * i] - y[2 * i]) + (x[2 * i + 1] - y[2 * i + 1]) * (x[2 * i + 1] - y[2 * i + 1])); + double den = sqrt(y[2 * i] * y[2 * i] + y[2 * i + 1] * y[2 * i + 1]); + error = MAX(error, (den > 0) ? MIN(nom, nom / den) : nom); + } + return error; +} diff --git a/relapack/test/util.h b/relapack/test/util.h new file mode 100644 index 000000000..11d2999e0 --- /dev/null +++ b/relapack/test/util.h @@ -0,0 +1,15 @@ +#ifndef TEST_UTIL_H +#define TEST_UTIL_H + +void s2matgen(int, int, float *, float *); +void d2matgen(int, int, double *, double *); +void c2matgen(int, int, float *, float *); +void z2matgen(int, int, double *, double *); + +double i2vecerr(int, const int *, const int *); +double s2vecerr(int, const float *, const float *); +double d2vecerr(int, const double *, const double *); +double c2vecerr(int, const float *, const float *); +double z2vecerr(int, const double *, const double *); + +#endif /* TEST_UTIL_H */ diff --git a/relapack/test/xgbtrf.c b/relapack/test/xgbtrf.c new file mode 100644 index 000000000..f255006a5 --- /dev/null +++ b/relapack/test/xgbtrf.c @@ -0,0 +1,43 @@ +#include "test.h" + +datatype *A[2]; +int *ipiv[2], info; +int kl, ku, ld; + +void pre() { + int i; + x2matgen(ld, n, A[0], A[1]); + for (i = 0; i < n; i++) { + // set diagonal + A[0][x1 * (i + ld * i)] = + A[1][x1 * (i + ld * i)] = (datatype) rand() / RAND_MAX; + } + memset(ipiv[0], 0, n * sizeof(int)); + memset(ipiv[1], 0, n * sizeof(int)); +} + +void post() { + error = x2vecerr(ld * n, A[0], A[1]) + i2vecerr(n, ipiv[0], ipiv[1]); +} + +void tests() { + kl = n - 10; + ku = n; + ld = 2 * kl + ku + 1; + + A[0] = xmalloc(ld * n); + A[1] = xmalloc(ld * n); + ipiv[0] = imalloc(n); + ipiv[1] = imalloc(n); + + #define ROUTINE XPREF(gbtrf) + + TEST(&n, &n, &kl, &ku, A[i], &ld, ipiv[i], &info); + TEST(&n, &n2, &kl, &ku, A[i], &ld, ipiv[i], &info); + TEST(&n2, &n, &kl, &ku, A[i], &ld, ipiv[i], &info); + + free(A[0]); + free(A[1]); + free(ipiv[0]); + free(ipiv[1]); +} diff --git a/relapack/test/xgemmt.c b/relapack/test/xgemmt.c new file mode 100644 index 000000000..ffc37049d --- /dev/null +++ b/relapack/test/xgemmt.c @@ -0,0 +1,65 @@ +#include "test.h" + +datatype *A[2], *B[2], *C[2], *Ctmp; +int info; + +void pre() { + x2matgen(n, n, A[0], A[1]); + x2matgen(n, n, B[0], B[1]); + x2matgen(n, n, C[0], C[1]); +} + +void post() { + error = x2vecerr(n * n, C[0], C[1]); +} + +#define ROUTINE XPREF(gemmt) + +#define xlacpy XPREF(LAPACK(lacpy)) +#define xgemm XPREF(BLAS(gemm)) + +extern void xlacpy(const char *, const int *, const int *, const datatype *, const int *, datatype *, const int *); 
+extern void xgemm(const char *, const char *, const int *, const int *, const int *, const datatype *, const datatype *, const int *, const datatype *, const int *, const datatype *, const datatype *, const int*); + +void XLAPACK(ROUTINE)( + const char *uplo, const char *transA, const char *transB, + const int *n, const int *k, + const datatype *alpha, const datatype *A, const int *ldA, + const datatype *B, const int *ldB, + const datatype *beta, datatype *C, const int *ldC +) { + xlacpy(uplo, n, n, C, ldC, Ctmp, n); + xgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, Ctmp, n); + xlacpy(uplo, n, n, Ctmp, ldC, C, n); +} + +void tests() { + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + B[0] = xmalloc(n * n); + B[1] = xmalloc(n * n); + C[0] = xmalloc(n * n); + C[1] = xmalloc(n * n); + Ctmp = xmalloc(n * n); + + TEST("L", "N", "N", &n, &n, ONE, A[i], &n, B[i], &n, ONE, C[i], &n); + TEST("L", "N", "N", &n, &n, ONE, A[i], &n, B[i], &n, MONE, C[i], &n); + TEST("L", "N", "N", &n, &n, MONE, A[i], &n, B[i], &n, ONE, C[i], &n); + TEST("L", "N", "T", &n, &n, ONE, A[i], &n, B[i], &n, ONE, C[i], &n); + TEST("L", "T", "N", &n, &n, ONE, A[i], &n, B[i], &n, ONE, C[i], &n); + TEST("L", "N", "N", &n, &n2, ONE, A[i], &n, B[i], &n, ONE, C[i], &n); + TEST("U", "N", "N", &n, &n, ONE, A[i], &n, B[i], &n, ONE, C[i], &n); + TEST("U", "N", "N", &n, &n, ONE, A[i], &n, B[i], &n, MONE, C[i], &n); + TEST("U", "N", "N", &n, &n, MONE, A[i], &n, B[i], &n, ONE, C[i], &n); + TEST("U", "N", "T", &n, &n, ONE, A[i], &n, B[i], &n, ONE, C[i], &n); + TEST("U", "T", "N", &n, &n, ONE, A[i], &n, B[i], &n, ONE, C[i], &n); + TEST("U", "N", "N", &n, &n2, ONE, A[i], &n, B[i], &n, ONE, C[i], &n); + + free(A[0]); + free(A[1]); + free(B[0]); + free(B[1]); + free(C[0]); + free(C[1]); + free(Ctmp); +} diff --git a/relapack/test/xgetrf.c b/relapack/test/xgetrf.c new file mode 100644 index 000000000..4484a24af --- /dev/null +++ b/relapack/test/xgetrf.c @@ -0,0 +1,32 @@ +#include "test.h" + +datatype *A[2]; +int *ipiv[2], info; + +void pre() { + x2matgen(n, n, A[0], A[1]); + memset(ipiv[0], 0, n * sizeof(int)); + memset(ipiv[1], 0, n * sizeof(int)); +} + +void post() { + error = x2vecerr(n * n, A[0], A[1]) + i2vecerr(n, ipiv[0], ipiv[1]); +} + +void tests() { + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + ipiv[0] = imalloc(n); + ipiv[1] = imalloc(n); + + #define ROUTINE XPREF(getrf) + + TEST(&n, &n, A[i], &n, ipiv[i], &info); + TEST(&n, &n2, A[i], &n, ipiv[i], &info); + TEST(&n2, &n, A[i], &n, ipiv[i], &info); + + free(A[0]); + free(A[1]); + free(ipiv[0]); + free(ipiv[1]); +} diff --git a/relapack/test/xhegst.c b/relapack/test/xhegst.c new file mode 100644 index 000000000..c318ef546 --- /dev/null +++ b/relapack/test/xhegst.c @@ -0,0 +1,32 @@ +#include "test.h" + +datatype *A[2], *B[2]; +int info; + +void pre() { + x2matgen(n, n, A[0], A[1]); + x2matgen(n, n, B[0], B[1]); +} + +void post() { + error = x2vecerr(n * n, A[0], A[1]); +} + +void tests() { + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + B[0] = xmalloc(n * n); + B[1] = xmalloc(n * n); + + #define ROUTINE XPREF(hegst) + + TEST(iONE, "L", &n, A[i], &n, B[i], &n, &info); + TEST(iONE, "U", &n, A[i], &n, B[i], &n, &info); + TEST(iTWO, "L", &n, A[i], &n, B[i], &n, &info); + TEST(iTWO, "U", &n, A[i], &n, B[i], &n, &info); + + free(A[0]); + free(A[1]); + free(B[0]); + free(B[1]); +} diff --git a/relapack/test/xhetrf.c b/relapack/test/xhetrf.c new file mode 100644 index 000000000..b5d54bdff --- /dev/null +++ b/relapack/test/xhetrf.c @@ -0,0 +1,40 @@ +#include "test.h" + 
+datatype *A[2], *Work; +int *ipiv[2], info; + +void pre() { + x2matgen(n, n, A[0], A[1]); + memset(ipiv[0], 0, n * sizeof(int)); + memset(ipiv[1], 0, n * sizeof(int)); +} + +void post() { + error = x2vecerr(n * n, A[0], A[1]) + i2vecerr(n, ipiv[0], ipiv[1]); +} + +void tests() { + const int lWork = n * n; + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + ipiv[0] = imalloc(n); + ipiv[1] = imalloc(n); + Work = xmalloc(lWork); + + #define ROUTINE XPREF(hetrf) + + TEST("L", &n, A[i], &n, ipiv[i], Work, &lWork, &info); + TEST("U", &n, A[i], &n, ipiv[i], Work, &lWork, &info); + + #undef ROUTINE + #define ROUTINE XPREF(hetrf_rook) + + TEST("L", &n, A[i], &n, ipiv[i], Work, &lWork, &info); + TEST("U", &n, A[i], &n, ipiv[i], Work, &lWork, &info); + + free(A[0]); + free(A[1]); + free(ipiv[0]); + free(ipiv[1]); + free(Work); +} diff --git a/relapack/test/xlauum.c b/relapack/test/xlauum.c new file mode 100644 index 000000000..d2c42fa01 --- /dev/null +++ b/relapack/test/xlauum.c @@ -0,0 +1,25 @@ +#include "test.h" + +datatype *A[2]; +int info; + +void pre() { + x2matgen(n, n, A[0], A[1]); +} + +void post() { + error = x2vecerr(n * n, A[0], A[1]); +} + +void tests() { + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + + #define ROUTINE XPREF(lauum) + + TEST("L", &n, A[i], &n, &info); + TEST("U", &n, A[i], &n, &info); + + free(A[0]); + free(A[1]); +} diff --git a/relapack/test/xpbtrf.c b/relapack/test/xpbtrf.c new file mode 100644 index 000000000..9a9babb6b --- /dev/null +++ b/relapack/test/xpbtrf.c @@ -0,0 +1,40 @@ +#include "test.h" + +datatype *A[2]; +int info[2]; +int n; + +void pre() { + int i; + x2matgen(n, n, A[0], A[1]); + for (i = 0; i < n; i++) { + // set diagonal + A[0][x1 * (i + n * i)] = + A[1][x1 * (i + n * i)] = (datatype) rand() / RAND_MAX; + // set first row + A[0][x1 * (n * i)] = + A[1][x1 * (n * i)] = (datatype) rand() / RAND_MAX + n; + } +} + +void post() { + error = x2vecerr(n * n, A[0], A[1]); +} + +void tests() { + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + + #define ROUTINE XPREF(pbtrf) + + const int + kd1 = n / 4, + kd2 = n * 3 / 4; + TEST("L", &n, &kd1, A[i], &n, &info[i]); + TEST("L", &n, &kd2, A[i], &n, &info[i]); + TEST("U", &n, &kd1, A[i] - x1 * kd1, &n, &info[i]); + TEST("U", &n, &kd2, A[i] - x1 * kd2, &n, &info[i]); + + free(A[0]); + free(A[1]); +} diff --git a/relapack/test/xpotrf.c b/relapack/test/xpotrf.c new file mode 100644 index 000000000..5e04d426f --- /dev/null +++ b/relapack/test/xpotrf.c @@ -0,0 +1,25 @@ +#include "test.h" + +datatype *A[2]; +int info; + +void pre() { + x2matgen(n, n, A[0], A[1]); +} + +void post() { + error = x2vecerr(n * n, A[0], A[1]); +} + +void tests() { + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + + #define ROUTINE XPREF(potrf) + + TEST("L", &n, A[i], &n, &info); + TEST("U", &n, A[i], &n, &info); + + free(A[0]); + free(A[1]); +} diff --git a/relapack/test/xsygst.c b/relapack/test/xsygst.c new file mode 100644 index 000000000..b473a5919 --- /dev/null +++ b/relapack/test/xsygst.c @@ -0,0 +1,32 @@ +#include "test.h" + +datatype *A[2], *B[2]; +int info; + +void pre() { + x2matgen(n, n, A[0], A[1]); + x2matgen(n, n, B[0], B[1]); +} + +void post() { + error = x2vecerr(n * n, A[0], A[1]); +} + +void tests() { + A[0] = xmalloc(n * n); + A[1] = xmalloc(n * n); + B[0] = xmalloc(n * n); + B[1] = xmalloc(n * n); + + #define ROUTINE XPREF(sygst) + + TEST(iONE, "L", &n, A[i], &n, B[i], &n, &info); + TEST(iONE, "U", &n, A[i], &n, B[i], &n, &info); + TEST(iTWO, "L", &n, A[i], &n, B[i], &n, &info); + TEST(iTWO, "U", &n, A[i], &n, B[i], &n, 
&info);
+
+    free(A[0]);
+    free(A[1]);
+    free(B[0]);
+    free(B[1]);
+}
diff --git a/relapack/test/xsytrf.c b/relapack/test/xsytrf.c
new file mode 100644
index 000000000..82d626f6f
--- /dev/null
+++ b/relapack/test/xsytrf.c
@@ -0,0 +1,40 @@
+#include "test.h"
+
+datatype *A[2], *Work;
+int *ipiv[2], info;
+
+void pre() {
+    x2matgen(n, n, A[0], A[1]);
+    memset(ipiv[0], 0, n * sizeof(int));
+    memset(ipiv[1], 0, n * sizeof(int));
+}
+
+void post() {
+    error = x2vecerr(n * n, A[0], A[1]) + i2vecerr(n, ipiv[0], ipiv[1]);
+}
+
+void tests() {
+    const int lWork = n * n;
+    A[0] = xmalloc(n * n);
+    A[1] = xmalloc(n * n);
+    ipiv[0] = imalloc(n);
+    ipiv[1] = imalloc(n);
+    Work = xmalloc(lWork);
+
+    #define ROUTINE XPREF(sytrf)
+
+    TEST("L", &n, A[i], &n, ipiv[i], Work, &lWork, &info);
+    TEST("U", &n, A[i], &n, ipiv[i], Work, &lWork, &info);
+
+    #undef ROUTINE
+    #define ROUTINE XPREF(sytrf_rook)
+
+    TEST("L", &n, A[i], &n, ipiv[i], Work, &lWork, &info);
+    TEST("U", &n, A[i], &n, ipiv[i], Work, &lWork, &info);
+
+    free(A[0]);
+    free(A[1]);
+    free(ipiv[0]);
+    free(ipiv[1]);
+    free(Work);
+}
diff --git a/relapack/test/xtgsyl.c b/relapack/test/xtgsyl.c
new file mode 100644
index 000000000..74db5005e
--- /dev/null
+++ b/relapack/test/xtgsyl.c
@@ -0,0 +1,94 @@
+#include "test.h"
+
+datatype *A[2], *B[2], *C[2], *D[2], *E[2], *F[2], *Work, scale[2], dif[2];
+int *iWork, lWork, info;
+
+#define xlascl XPREF(LAPACK(lascl))
+void xlascl(const char *, const int *, const int *, const datatype *, const
+    datatype *, const int *, const int *, datatype *, const int *, int *);
+
+#define xscal XPREF(LAPACK(scal))
+void xscal(const int *, const datatype *, datatype *, const int *);
+
+void pre() {
+    int i;
+
+    x2matgen(n, n, A[0], A[1]);
+    x2matgen(n, n, B[0], B[1]);
+    x2matgen(n, n, C[0], C[1]);
+    x2matgen(n, n, D[0], D[1]);
+    x2matgen(n, n, E[0], E[1]);
+    x2matgen(n, n, F[0], F[1]);
+
+    for (i = 0; i < n; i++) {
+        // set diagonal
+        A[0][x1 * (i + n * i)] =
+        A[1][x1 * (i + n * i)] = (datatype) rand() / RAND_MAX;
+        E[0][x1 * (i + n * i)] =
+        E[1][x1 * (i + n * i)] = (datatype) rand() / RAND_MAX;
+        // clear first subdiagonal
+        A[0][x1 * (i + 1 + n * i)] =
+        A[1][x1 * (i + 1 + n * i)] =
+        B[0][x1 * (i + 1 + n * i)] =
+        B[1][x1 * (i + 1 + n * i)] =
+        A[0][x1 * (i + 1 + n * i) + x1 - 1] =
+        A[1][x1 * (i + 1 + n * i) + x1 - 1] =
+        B[0][x1 * (i + 1 + n * i) + x1 - 1] =
+        B[1][x1 * (i + 1 + n * i) + x1 - 1] = 0;
+    }
+}
+
+
+void post() {
+    // report when ReLAPACK and LAPACK returned different scale factors
+    if (scale[0] != 1 || scale[1] != 1)
+        printf("scale[RELAPACK] = %12g\tscale[LAPACK] = %12g\n", scale[0], scale[1]);
+    if (scale[0]) {
+        xlascl("G", iZERO, iZERO, &scale[0], &scale[1], &n, &n, C[0], &n, &info);
+        xlascl("G", iZERO, iZERO, &scale[0], &scale[1], &n, &n, F[0], &n, &info);
+    }
+    error = x2vecerr(n * n, C[0], C[1]) + x2vecerr(n * n, F[0], F[1]);
+}
+
+void tests() {
+    lWork = 2 * n * n;
+    A[0] = xmalloc(n * n);
+    A[1] = xmalloc(n * n);
+    B[0] = xmalloc(n * n);
+    B[1] = xmalloc(n * n);
+    C[0] = xmalloc(n * n);
+    C[1] = xmalloc(n * n);
+    D[0] = xmalloc(n * n);
+    D[1] = xmalloc(n * n);
+    E[0] = xmalloc(n * n);
+    E[1] = xmalloc(n * n);
+    F[0] = xmalloc(n * n);
+    F[1] = xmalloc(n * n);
+    Work = xmalloc(lWork);
+    iWork = imalloc(n + n + 2);
+
+    #define ROUTINE XPREF(tgsyl)
+
+    TEST("N", iZERO, &n, &n, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info);
+    TEST("N", iZERO, &n2, &n, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info);
+    TEST("N", iZERO, &n, &n2, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info);
+    TEST("N", iONE, &n, &n, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info);
+    TEST("N", iTWO, &n, &n, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info);
+    TEST("N", iTHREE, &n, &n, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info);
+    TEST("N", iFOUR, &n, &n, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info);
+    TEST(xCTRANS, iZERO, &n, &n, A[i], &n, B[i], &n, C[i], &n, D[i], &n, E[i], &n, F[i], &n, &scale[i], &dif[i], Work, &lWork, iWork, &info);
+
+    free(A[0]);
+    free(A[1]);
+    free(B[0]);
+    free(B[1]);
+    free(C[0]);
+    free(C[1]);
+    free(D[0]);
+    free(D[1]);
+    free(E[0]);
+    free(E[1]);
+    free(F[0]);
+    free(F[1]);
+    free(Work);
+    free(iWork);
+}
diff --git a/relapack/test/xtrsyl.c b/relapack/test/xtrsyl.c
new file mode 100644
index 000000000..358a89242
--- /dev/null
+++ b/relapack/test/xtrsyl.c
@@ -0,0 +1,65 @@
+#include "test.h"
+
+datatype *A[2], *B[2], *C[2], *Work, scale[2];
+int info;
+
+#define xlascl XPREF(LAPACK(lascl))
+void xlascl(const char *, const int *, const int *, const datatype *, const
+    datatype *, const int *, const int *, datatype *, const int *, int *);
+
+void pre() {
+    int i;
+
+    x2matgen(n, n, A[0], A[1]);
+    x2matgen(n, n, B[0], B[1]);
+    x2matgen(n, n, C[0], C[1]);
+
+    for (i = 0; i < n; i++) {
+        // set diagonal
+        A[0][x1 * (i + n * i)] =
+        A[1][x1 * (i + n * i)] = (datatype) rand() / RAND_MAX;
+        // clear first subdiagonal
+        A[0][x1 * (i + 1 + n * i)] =
+        A[1][x1 * (i + 1 + n * i)] =
+        B[0][x1 * (i + 1 + n * i)] =
+        B[1][x1 * (i + 1 + n * i)] =
+        A[0][x1 * (i + 1 + n * i) + x1 - 1] =
+        A[1][x1 * (i + 1 + n * i) + x1 - 1] =
+        B[0][x1 * (i + 1 + n * i) + x1 - 1] =
+        B[1][x1 * (i + 1 + n * i) + x1 - 1] = 0;
+    }
+}
+
+void post() {
+    // report when ReLAPACK and LAPACK returned different scale factors
+    if (scale[0] != 1 || scale[1] != 1)
+        printf("scale[RELAPACK] = %12g\tscale[LAPACK] = %12g\n", scale[0], scale[1]);
+    if (scale[0])
+        xlascl("G", iZERO, iZERO, &scale[0], &scale[1], &n, &n, C[0], &n, &info);
+    error = x2vecerr(n * n, C[0], C[1]);
+}
+
+void tests() {
+    A[0] = xmalloc(n * n);
+    A[1] = xmalloc(n * n);
+    B[0] = xmalloc(n * n);
+    B[1] = xmalloc(n * n);
+    C[0] = xmalloc(n * n);
+    C[1] = xmalloc(n * n);
+
+    #define ROUTINE XPREF(trsyl)
+
+    TEST("N", "N", iONE, &n, &n, A[i], &n, B[i], &n, C[i], &n, &scale[i], &info);
+    TEST("N", "N", iONE, &n2, &n, A[i], &n, B[i], &n, C[i], &n, &scale[i], &info);
+    TEST("N", "N", iONE, &n, &n2, A[i], &n, B[i], &n, C[i], &n, &scale[i], &info);
+    TEST("C", "N", iONE, &n, &n, A[i], &n, B[i], &n, C[i], &n, &scale[i], &info);
+    TEST("N", "C", iONE, &n, &n, A[i], &n, B[i], &n, C[i], &n, &scale[i], &info);
+    TEST("C", "C", iONE, &n, &n, A[i], &n, B[i], &n, C[i], &n, &scale[i], &info);
+    TEST("N", "N", iMONE, &n, &n, A[i], &n, B[i], &n, C[i], &n, &scale[i], &info);
+
+    free(A[0]);
+    free(A[1]);
+    free(B[0]);
+    free(B[1]);
+    free(C[0]);
+    free(C[1]);
+}
diff --git a/relapack/test/xtrtri.c b/relapack/test/xtrtri.c
new file mode 100644
index 000000000..106391bc8
--- /dev/null
+++ b/relapack/test/xtrtri.c
@@ -0,0 +1,25 @@
+#include "test.h"
+
+datatype *A[2];
+int info;
+
+void pre() {
+    x2matgen(n, n, A[0], A[1]);
+}
+
+void post() {
+    error = x2vecerr(n * n, A[0], A[1]);
+}
+
+void tests() {
+    A[0] = xmalloc(n * n);
+    A[1] = xmalloc(n * n);
+
+    #define ROUTINE XPREF(trtri)
+
+    TEST("L",
"N", &n, A[i], &n, &info); + TEST("U", "N", &n, A[i], &n, &info); + + free(A[0]); + free(A[1]); +} diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 9cf518e05..bd31ed9c6 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -21,6 +21,10 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") target_link_libraries(${OpenBLAS_utest_bin} m) endif() +if (${CMAKE_SYSTEM_NAME} STREQUAL "WindowsStore") +set_target_properties( ${OpenBLAS_utest_bin} PROPERTIES COMPILE_DEFINITIONS "_CRT_SECURE_NO_WARNINGS") +endif() + #Set output for utest set_target_properties( ${OpenBLAS_utest_bin} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) |