diff options
author | wernsaar <wernsaar@googlemail.com> | 2013-12-01 13:16:41 +0100 |
---|---|---|
committer | wernsaar <wernsaar@googlemail.com> | 2013-12-01 13:16:41 +0100 |
commit | 4be4db590c14963410508e96833b9b47c6c5d586 (patch) | |
tree | e8fb012d1f5696401ca5de35643eedc5e3b6edfe | |
parent | 9d3fae15a8dba680c6f3369cd580c59b651201dd (diff) | |
parent | 5048a80032a9d020b585ffc59323274f1d14e6b7 (diff) | |
download | openblas-4be4db590c14963410508e96833b9b47c6c5d586.tar.gz openblas-4be4db590c14963410508e96833b9b47c6c5d586.tar.bz2 openblas-4be4db590c14963410508e96833b9b47c6c5d586.zip |
Merge remote branch 'origin/develop' into armv7
-rw-r--r-- | Makefile | 8 | ||||
-rw-r--r-- | Makefile.install | 4 | ||||
-rw-r--r-- | Makefile.system | 14 | ||||
-rw-r--r-- | Makefile.tail | 3 | ||||
-rw-r--r-- | driver/others/blas_server_win32.c | 5 | ||||
-rw-r--r-- | driver/others/memory.c | 8 | ||||
-rw-r--r-- | exports/Makefile | 11 | ||||
-rw-r--r-- | interface/trtri.c | 13 | ||||
-rw-r--r-- | kernel/x86_64/dtrsm_kernel_LT_8x2_bulldozer.S | 16 | ||||
-rw-r--r-- | kernel/x86_64/dtrsm_kernel_RN_8x2_bulldozer.S | 16 | ||||
-rw-r--r-- | lapack-netlib/lapacke/include/lapacke_config.h | 4 | ||||
-rw-r--r-- | lapack/getrf/getrf_parallel.c | 14 | ||||
-rw-r--r-- | lapack/getrf/getrf_parallel_omp.c | 2 | ||||
-rw-r--r-- | lapack/getrf/getrf_single.c | 2 | ||||
-rw-r--r-- | lapack/potrf/potrf_parallel.c | 2 | ||||
-rw-r--r-- | lapack/trtri/Makefile | 4 | ||||
-rw-r--r-- | lapack/trtri/dtrtri_lapack.f | 242 | ||||
-rw-r--r-- | lapack/trtri/trtri_U_single.c | 19 | ||||
-rw-r--r-- | openblas_config_template.h | 16 |
19 files changed, 87 insertions, 316 deletions
@@ -219,10 +219,10 @@ prof_lapack : lapack_prebuild lapack_prebuild : ifndef NOFORTRAN -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc - -@echo "OPTS = $(FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "POPTS = $(FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "NOOPT = $(FFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "PNOOPT = $(FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "NOOPT = $(LAPACK_FFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc diff --git a/Makefile.install b/Makefile.install index cbe98bc5b..8319b46db 100644 --- a/Makefile.install +++ b/Makefile.install @@ -23,8 +23,8 @@ install : lib.grd #for inc @echo \#ifndef OPENBLAS_CONFIG_H > $(OPENBLAS_INCLUDE_DIR)/openblas_config.h @echo \#define OPENBLAS_CONFIG_H >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h - @cat config_last.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h - @echo \#define VERSION \" OpenBLAS $(VERSION) \" >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h + @awk '{print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h + @echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h @cat openblas_config_template.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h @echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h diff --git a/Makefile.system b/Makefile.system index 5545de1b1..d6c172f3d 100644 --- a/Makefile.system +++ b/Makefile.system @@ -222,6 +222,11 @@ endif endif endif +# ifeq logical or +ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT Interix)) +OS_WINDOWS=1 +endif + ifdef QUAD_PRECISION CCOMMON_OPT += -DQUAD_PRECISION NO_EXPRECISION = 1 @@ -477,10 +482,8 @@ CCOMMON_OPT += -DF_INTERFACE_GFORT FCOMMON_OPT += -Wall #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc ifneq ($(NO_LAPACK), 1) -ifneq ($(C_COMPILER), LSB) EXTRALIB += -lgfortran endif -endif ifdef NO_BINARY_MODE ifeq ($(ARCH), mips64) ifdef BINARY64 @@ -861,11 +864,18 @@ override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) override FPFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF) #MAKEOVERRIDES = +#For LAPACK Fortran codes. +LAPACK_FFLAGS := $(filter-out -fopenmp -mp -openmp -xopenmp=parallel,$(FFLAGS)) +LAPACK_FPFLAGS := $(filter-out -fopenmp -mp -openmp -xopenmp=parallel,$(FPFLAGS)) + LAPACK_CFLAGS = $(CFLAGS) LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H ifdef INTERFACE64 LAPACK_CFLAGS += -DLAPACK_ILP64 endif +ifdef OS_WINDOWS +LAPACK_CFLAGS += -DOPENBLAS_OS_WINDOWS +endif ifeq ($(C_COMPILER), LSB) LAPACK_CFLAGS += -DLAPACK_COMPLEX_STRUCTURE endif diff --git a/Makefile.tail b/Makefile.tail index 53dd0caad..56f8d820c 100644 --- a/Makefile.tail +++ b/Makefile.tail @@ -606,7 +606,8 @@ clean :: @if test -d $(ARCH); then \ (cd $(ARCH) && $(MAKE) clean) \ fi - @rm -rf *.a *.s *.o *.po *.obj *.i *.so core core.* gmon.out *.cso \ + @find . -name '*.o' | xargs rm -rf + @rm -rf *.a *.s *.po *.obj *.i *.so core core.* gmon.out *.cso \ *.csx *.is *~ *.exe *.flame *.pdb *.dwf \ gen_insn_flash.c gen_insn_flash *.stackdump *.dll *.exp *.lib \ *.pc *.pcl *.def *.i *.prof linktest.c \ diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index bd1069c5e..8723a6fa7 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -441,9 +441,10 @@ int BLASFUNC(blas_thread_shutdown)(void){ if (blas_server_avail){ SetEvent(pool.killed); - + printf("blas_num_threads=%d\n", blas_num_threads); for(i = 0; i < blas_num_threads - 1; i++){ - WaitForSingleObject(blas_threads[i], INFINITE); + WaitForSingleObject(blas_threads[i], 5); //INFINITE); + TerminateThread(blas_threads[i],0); } blas_server_avail = 0; diff --git a/driver/others/memory.c b/driver/others/memory.c index 4f35691ff..35758d13c 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -363,7 +363,7 @@ static void *alloc_mmap(void *address){ #define BENCH_ITERATION 4 #define SCALING 2 -static inline BLASULONG run_bench(BLASULONG address, long size) { +static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) { BLASULONG original, *p; BLASULONG start, stop, min; @@ -450,12 +450,12 @@ static void *alloc_mmap(void *address){ current = (SCALING - 1) * BUFFER_SIZE; while(current > 0) { - *(long *)start = (long)start + PAGESIZE; + *(BLASLONG *)start = (BLASLONG)start + PAGESIZE; start += PAGESIZE; current -= PAGESIZE; } - *(long *)(start - PAGESIZE) = (BLASULONG)map_address; + *(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address; start = (BLASULONG)map_address; @@ -1170,7 +1170,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, #if !defined(ARCH_POWER) && !defined(ARCH_SPARC) - long size; + size_t size; BLASULONG buffer; size = BUFFER_SIZE - PAGESIZE; diff --git a/exports/Makefile b/exports/Makefile index 0bc9ec6e0..8e50a9809 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -111,7 +111,7 @@ libgoto_hpl.def : gensymbol perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) > $(@F) $(LIBDYNNAME) : ../$(LIBNAME) osx.def - $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) symbol.$(SUFFIX) : symbol.S $(CC) $(CFLAGS) -c -o $(@F) $^ @@ -124,14 +124,17 @@ ifeq ($(OSNAME), Linux) so : ../$(LIBSONAME) ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c +ifneq ($(C_COMPILER), LSB) $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ -Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB) -ifneq ($(C_COMPILER), LSB) $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. else -#Use FC on LSB - $(FC) $(FFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. +#for LSB + env LSBCC_SHAREDLIBS=gfortran $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ + -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ + -Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB) + $(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. endif rm -f linktest diff --git a/interface/trtri.c b/interface/trtri.c index 007dbd7fa..5aa5e9b9b 100644 --- a/interface/trtri.c +++ b/interface/trtri.c @@ -60,7 +60,6 @@ static blasint (*trtri_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT * }; #endif -extern void BLASFUNC(dtrtrilapack)(char *UPLO, char *DIAG, int *N, double *a, int *ldA, int *Info); int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ @@ -133,18 +132,6 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In if (args.nthreads == 1) { #endif -#if DOUBLE - // double trtri_U single thread error - // call dtrtri from lapack for a walk around. - if(uplo==0){ - BLASFUNC(dtrtrilapack)(UPLO, DIAG, N, a, ldA, Info); -#ifndef PPC440 - blas_memory_free(buffer); -#endif - return 0; - } -#endif - *Info = (trtri_single[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0); #ifdef SMP diff --git a/kernel/x86_64/dtrsm_kernel_LT_8x2_bulldozer.S b/kernel/x86_64/dtrsm_kernel_LT_8x2_bulldozer.S index 374f45096..9e15fa240 100644 --- a/kernel/x86_64/dtrsm_kernel_LT_8x2_bulldozer.S +++ b/kernel/x86_64/dtrsm_kernel_LT_8x2_bulldozer.S @@ -103,7 +103,7 @@ vmovups -10*SIZE(AO,%rax,8), %xmm6 vfmaddpd %xmm14, %xmm6 , %xmm1 , %xmm14 vfmaddpd %xmm15, %xmm6 , %xmm2 , %xmm15 - addq $SIZE, %rax + addq $ SIZE, %rax .endm .macro SOLVE_8x2 @@ -265,7 +265,7 @@ vmovups -14*SIZE(AO,%rax,4), %xmm0 vfmaddpd %xmm10, %xmm0 , %xmm1 , %xmm10 vfmaddpd %xmm11, %xmm0 , %xmm2 , %xmm11 - addq $SIZE, %rax + addq $ SIZE, %rax .endm @@ -338,7 +338,7 @@ vmovups -16*SIZE(AO,%rax,2), %xmm0 vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 vfmaddpd %xmm9 , %xmm0 , %xmm2 , %xmm9 - addq $SIZE, %rax + addq $ SIZE, %rax .endm @@ -378,7 +378,7 @@ vmovups -16*SIZE(BO,%rax,2), %xmm1 vmovddup -16*SIZE(AO,%rax,1), %xmm0 vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 - addq $SIZE, %rax + addq $ SIZE, %rax .endm .macro SOLVE_1x2 @@ -411,7 +411,7 @@ vfmaddpd %xmm10, %xmm0 , %xmm1 , %xmm10 vmovups -10*SIZE(AO,%rax,8), %xmm0 vfmaddpd %xmm11, %xmm0 , %xmm1 , %xmm11 - addq $SIZE, %rax + addq $ SIZE, %rax .endm .macro SOLVE_8x1 @@ -510,7 +510,7 @@ vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 vmovups -14*SIZE(AO,%rax,4), %xmm0 vfmaddpd %xmm9 , %xmm0 , %xmm1 , %xmm9 - addq $SIZE, %rax + addq $ SIZE, %rax .endm @@ -560,7 +560,7 @@ vmovddup -16*SIZE(BO,%rax,1), %xmm1 vmovups -16*SIZE(AO,%rax,2), %xmm0 vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 - addq $SIZE, %rax + addq $ SIZE, %rax .endm @@ -592,7 +592,7 @@ vmovsd -16*SIZE(BO,%rax,1), %xmm1 vmovsd -16*SIZE(AO,%rax,1), %xmm0 vfmaddsd %xmm8 , %xmm0 , %xmm1 , %xmm8 - addq $SIZE, %rax + addq $ SIZE, %rax .endm .macro SOLVE_1x1 diff --git a/kernel/x86_64/dtrsm_kernel_RN_8x2_bulldozer.S b/kernel/x86_64/dtrsm_kernel_RN_8x2_bulldozer.S index 8fa53efa7..8d3964aee 100644 --- a/kernel/x86_64/dtrsm_kernel_RN_8x2_bulldozer.S +++ b/kernel/x86_64/dtrsm_kernel_RN_8x2_bulldozer.S @@ -103,7 +103,7 @@ vmovups -10*SIZE(AO,%rax,8), %xmm6 vfmaddpd %xmm14, %xmm6 , %xmm1 , %xmm14 vfmaddpd %xmm15, %xmm6 , %xmm2 , %xmm15 - addq $SIZE, %rax + addq $ SIZE, %rax .endm .macro SOLVE_8x2 @@ -177,7 +177,7 @@ vmovups -14*SIZE(AO,%rax,4), %xmm0 vfmaddpd %xmm10, %xmm0 , %xmm1 , %xmm10 vfmaddpd %xmm11, %xmm0 , %xmm2 , %xmm11 - addq $SIZE, %rax + addq $ SIZE, %rax .endm @@ -226,7 +226,7 @@ vmovups -16*SIZE(AO,%rax,2), %xmm0 vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 vfmaddpd %xmm9 , %xmm0 , %xmm2 , %xmm9 - addq $SIZE, %rax + addq $ SIZE, %rax .endm @@ -262,7 +262,7 @@ vmovups -16*SIZE(BO,%rax,2), %xmm1 vmovddup -16*SIZE(AO,%rax,1), %xmm0 vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 - addq $SIZE, %rax + addq $ SIZE, %rax .endm .macro SOLVE_1x2 @@ -306,7 +306,7 @@ vfmaddpd %xmm10, %xmm0 , %xmm1 , %xmm10 vmovups -10*SIZE(AO,%rax,8), %xmm0 vfmaddpd %xmm11, %xmm0 , %xmm1 , %xmm11 - addq $SIZE, %rax + addq $ SIZE, %rax .endm .macro SOLVE_8x1 @@ -347,7 +347,7 @@ vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 vmovups -14*SIZE(AO,%rax,4), %xmm0 vfmaddpd %xmm9 , %xmm0 , %xmm1 , %xmm9 - addq $SIZE, %rax + addq $ SIZE, %rax .endm @@ -377,7 +377,7 @@ vmovddup -16*SIZE(BO,%rax,1), %xmm1 vmovups -16*SIZE(AO,%rax,2), %xmm0 vfmaddpd %xmm8 , %xmm0 , %xmm1 , %xmm8 - addq $SIZE, %rax + addq $ SIZE, %rax .endm @@ -402,7 +402,7 @@ vmovsd -16*SIZE(BO,%rax,1), %xmm1 vmovsd -16*SIZE(AO,%rax,1), %xmm0 vfmaddsd %xmm8 , %xmm0 , %xmm1 , %xmm8 - addq $SIZE, %rax + addq $ SIZE, %rax .endm .macro SOLVE_1x1 diff --git a/lapack-netlib/lapacke/include/lapacke_config.h b/lapack-netlib/lapacke/include/lapacke_config.h index 1e2509bf0..561b2736b 100644 --- a/lapack-netlib/lapacke/include/lapacke_config.h +++ b/lapack-netlib/lapacke/include/lapacke_config.h @@ -45,7 +45,11 @@ extern "C" { #ifndef lapack_int #if defined(LAPACK_ILP64) +#if defined(OPENBLAS_OS_WINDOWS) +#define lapack_int long long +#else #define lapack_int long +#endif #else #define lapack_int int #endif diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index 21ea9d5f5..3dbc70e9d 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -67,14 +67,14 @@ double sqrt(double); #undef GETRF_FACTOR #define GETRF_FACTOR 1.00 -static inline long FORMULA1(long M, long N, long IS, long BK, long T) { +static inline BLASLONG FORMULA1(BLASLONG M, BLASLONG N, BLASLONG IS, BLASLONG BK, BLASLONG T) { double m = (double)(M - IS - BK); double n = (double)(N - IS - BK); double b = (double)BK; double a = (double)T; - return (long)((n + GETRF_FACTOR * m * b * (1. - a) / (b + m)) / a); + return (BLASLONG)((n + GETRF_FACTOR * m * b * (1. - a) / (b + m)) / a); } @@ -111,7 +111,7 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra if (args -> a == NULL) { TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb); - sbb = (FLOAT *)((((long)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); + sbb = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); } else { sb = (FLOAT *)args -> a; } @@ -221,7 +221,7 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * if (args -> a == NULL) { TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb); - sbb = (FLOAT *)((((long)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); + sbb = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); } else { sb = (FLOAT *)args -> a; } @@ -448,7 +448,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, TRSM_ILTCOPY(bk, bk, a, lda, 0, sb); - sbb = (FLOAT *)((((long)(sb + bk * bk * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); + sbb = (FLOAT *)((((BLASULONG)(sb + bk * bk * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); is = 0; num_cpu = 0; @@ -685,7 +685,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, if (width > n - init_bk) width = n - init_bk; if (width < init_bk) { - long temp; + BLASLONG temp; temp = FORMULA2(m, n, 0, init_bk, args -> nthreads); temp = (temp + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); @@ -708,7 +708,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, is = 0; num_cpu = 0; - sbb = (FLOAT *)((((long)(sb + GEMM_PQ * GEMM_PQ * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); + sbb = (FLOAT *)((((BLASULONG)(sb + GEMM_PQ * GEMM_PQ * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); while (is < mn) { diff --git a/lapack/getrf/getrf_parallel_omp.c b/lapack/getrf/getrf_parallel_omp.c index 4922b9b52..6eda30a52 100644 --- a/lapack/getrf/getrf_parallel_omp.c +++ b/lapack/getrf/getrf_parallel_omp.c @@ -178,7 +178,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, return info; } - sbb = (FLOAT *)((((long)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); + sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); info = 0; diff --git a/lapack/getrf/getrf_single.c b/lapack/getrf/getrf_single.c index fcea0ae89..f1818ea97 100644 --- a/lapack/getrf/getrf_single.c +++ b/lapack/getrf/getrf_single.c @@ -82,7 +82,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, return info; } - sbb = (FLOAT *)((((long)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); + sbb = (FLOAT *)((((BLASULONG)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); info = 0; diff --git a/lapack/potrf/potrf_parallel.c b/lapack/potrf/potrf_parallel.c index eec9b6e05..11f7f533c 100644 --- a/lapack/potrf/potrf_parallel.c +++ b/lapack/potrf/potrf_parallel.c @@ -185,7 +185,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); - buffer[0] = (FLOAT *)((((long)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); + buffer[0] = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); for (i = 1; i < DIVIDE_RATE; i++) { buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE; } diff --git a/lapack/trtri/Makefile b/lapack/trtri/Makefile index 10d3cb7fd..626c47bbf 100644 --- a/lapack/trtri/Makefile +++ b/lapack/trtri/Makefile @@ -13,7 +13,6 @@ ZBLASOBJS = ztrtri_UU_single.$(SUFFIX) ztrtri_UN_single.$(SUFFIX) ztrtri_LU_sing XBLASOBJS = xtrtri_UU_single.$(SUFFIX) xtrtri_UN_single.$(SUFFIX) xtrtri_LU_single.$(SUFFIX) xtrtri_LN_single.$(SUFFIX) -DBLASOBJS += dtrtri_lapack.$(SUFFIX) ifdef SMP SBLASOBJS += strtri_UU_parallel.$(SUFFIX) strtri_UN_parallel.$(SUFFIX) strtri_LU_parallel.$(SUFFIX) strtri_LN_parallel.$(SUFFIX) @@ -54,9 +53,6 @@ dtrtri_UU_single.$(SUFFIX) : trtri_U_single.c dtrtri_UN_single.$(SUFFIX) : trtri_U_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) -dtrtri_lapack.$(SUFFIX) : dtrtri_lapack.f - $(FC) -c $(FFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) - dtrtri_LU_single.$(SUFFIX) : trtri_L_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) diff --git a/lapack/trtri/dtrtri_lapack.f b/lapack/trtri/dtrtri_lapack.f deleted file mode 100644 index 8e9a08170..000000000 --- a/lapack/trtri/dtrtri_lapack.f +++ /dev/null @@ -1,242 +0,0 @@ -*> \brief \b DTRTRI -* -* =========== DOCUMENTATION =========== -* -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ -* -*> \htmlonly -*> Download DTRTRI + dependencies -*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dtrtri.f"> -*> [TGZ]</a> -*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dtrtri.f"> -*> [ZIP]</a> -*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dtrtri.f"> -*> [TXT]</a> -*> \endhtmlonly -* -* Definition: -* =========== -* -* SUBROUTINE DTRTRI( UPLO, DIAG, N, A, LDA, INFO ) -* -* .. Scalar Arguments .. -* CHARACTER DIAG, UPLO -* INTEGER INFO, LDA, N -* .. -* .. Array Arguments .. -* DOUBLE PRECISION A( LDA, * ) -* .. -* -* -*> \par Purpose: -* ============= -*> -*> \verbatim -*> -*> DTRTRI computes the inverse of a real upper or lower triangular -*> matrix A. -*> -*> This is the Level 3 BLAS version of the algorithm. -*> \endverbatim -* -* Arguments: -* ========== -* -*> \param[in] UPLO -*> \verbatim -*> UPLO is CHARACTER*1 -*> = 'U': A is upper triangular; -*> = 'L': A is lower triangular. -*> \endverbatim -*> -*> \param[in] DIAG -*> \verbatim -*> DIAG is CHARACTER*1 -*> = 'N': A is non-unit triangular; -*> = 'U': A is unit triangular. -*> \endverbatim -*> -*> \param[in] N -*> \verbatim -*> N is INTEGER -*> The order of the matrix A. N >= 0. -*> \endverbatim -*> -*> \param[in,out] A -*> \verbatim -*> A is DOUBLE PRECISION array, dimension (LDA,N) -*> On entry, the triangular matrix A. If UPLO = 'U', the -*> leading N-by-N upper triangular part of the array A contains -*> the upper triangular matrix, and the strictly lower -*> triangular part of A is not referenced. If UPLO = 'L', the -*> leading N-by-N lower triangular part of the array A contains -*> the lower triangular matrix, and the strictly upper -*> triangular part of A is not referenced. If DIAG = 'U', the -*> diagonal elements of A are also not referenced and are -*> assumed to be 1. -*> On exit, the (triangular) inverse of the original matrix, in -*> the same storage format. -*> \endverbatim -*> -*> \param[in] LDA -*> \verbatim -*> LDA is INTEGER -*> The leading dimension of the array A. LDA >= max(1,N). -*> \endverbatim -*> -*> \param[out] INFO -*> \verbatim -*> INFO is INTEGER -*> = 0: successful exit -*> < 0: if INFO = -i, the i-th argument had an illegal value -*> > 0: if INFO = i, A(i,i) is exactly zero. The triangular -*> matrix is singular and its inverse can not be computed. -*> \endverbatim -* -* Authors: -* ======== -* -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. -* -*> \date November 2011 -* -*> \ingroup doubleOTHERcomputational -* -* ===================================================================== - SUBROUTINE DTRTRILAPACK( UPLO, DIAG, N, A, LDA, INFO ) -* -* -- LAPACK computational routine (version 3.4.0) -- -* -- LAPACK is a software package provided by Univ. of Tennessee, -- -* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2011 -* -* .. Scalar Arguments .. - CHARACTER DIAG, UPLO - INTEGER INFO, LDA, N -* .. -* .. Array Arguments .. - DOUBLE PRECISION A( LDA, * ) -* .. -* -* ===================================================================== -* -* .. Parameters .. - DOUBLE PRECISION ONE, ZERO - PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) -* .. -* .. Local Scalars .. - LOGICAL NOUNIT, UPPER - INTEGER J, JB, NB, NN -* .. -* .. External Functions .. - LOGICAL LSAME - INTEGER ILAENV - EXTERNAL LSAME, ILAENV -* .. -* .. External Subroutines .. - EXTERNAL DTRMM, DTRSM, DTRTI2, XERBLA -* .. -* .. Intrinsic Functions .. - INTRINSIC MAX, MIN -* .. -* .. Executable Statements .. -* -* Test the input parameters. -* - INFO = 0 - UPPER = LSAME( UPLO, 'U' ) - NOUNIT = LSAME( DIAG, 'N' ) - IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN - INFO = -1 - ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN - INFO = -2 - ELSE IF( N.LT.0 ) THEN - INFO = -3 - ELSE IF( LDA.LT.MAX( 1, N ) ) THEN - INFO = -5 - END IF - IF( INFO.NE.0 ) THEN - CALL XERBLA( 'DTRTRI', -INFO ) - RETURN - END IF -* -* Quick return if possible -* - IF( N.EQ.0 ) - $ RETURN -* -* Check for singularity if non-unit. -* - IF( NOUNIT ) THEN - DO 10 INFO = 1, N - IF( A( INFO, INFO ).EQ.ZERO ) - $ RETURN - 10 CONTINUE - INFO = 0 - END IF -* -* Determine the block size for this environment. -* - NB = ILAENV( 1, 'DTRTRI', UPLO // DIAG, N, -1, -1, -1 ) - IF( NB.LE.1 .OR. NB.GE.N ) THEN -* -* Use unblocked code -* - CALL DTRTI2( UPLO, DIAG, N, A, LDA, INFO ) - ELSE -* -* Use blocked code -* - IF( UPPER ) THEN -* -* Compute inverse of upper triangular matrix -* - DO 20 J = 1, N, NB - JB = MIN( NB, N-J+1 ) -* -* Compute rows 1:j-1 of current block column -* - CALL DTRMM( 'Left', 'Upper', 'No transpose', DIAG, J-1, - $ JB, ONE, A, LDA, A( 1, J ), LDA ) - CALL DTRSM( 'Right', 'Upper', 'No transpose', DIAG, J-1, - $ JB, -ONE, A( J, J ), LDA, A( 1, J ), LDA ) -* -* Compute inverse of current diagonal block -* - CALL DTRTI2( 'Upper', DIAG, JB, A( J, J ), LDA, INFO ) - 20 CONTINUE - ELSE -* -* Compute inverse of lower triangular matrix -* - NN = ( ( N-1 ) / NB )*NB + 1 - DO 30 J = NN, 1, -NB - JB = MIN( NB, N-J+1 ) - IF( J+JB.LE.N ) THEN -* -* Compute rows j+jb:n of current block column -* - CALL DTRMM( 'Left', 'Lower', 'No transpose', DIAG, - $ N-J-JB+1, JB, ONE, A( J+JB, J+JB ), LDA, - $ A( J+JB, J ), LDA ) - CALL DTRSM( 'Right', 'Lower', 'No transpose', DIAG, - $ N-J-JB+1, JB, -ONE, A( J, J ), LDA, - $ A( J+JB, J ), LDA ) - END IF -* -* Compute inverse of current diagonal block -* - CALL DTRTI2( 'Lower', DIAG, JB, A( J, J ), LDA, INFO ) - 30 CONTINUE - END IF - END IF -* - RETURN -* -* End of DTRTRI -* - END diff --git a/lapack/trtri/trtri_U_single.c b/lapack/trtri/trtri_U_single.c index 72133d896..c79281cfb 100644 --- a/lapack/trtri/trtri_U_single.c +++ b/lapack/trtri/trtri_U_single.c @@ -127,8 +127,14 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, if (min_i > GEMM_P) min_i = GEMM_P; if (ls == i + bk) { - NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); - + //NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); + + GEMM_BETA(min_i, bk, 0, dm1, +#ifdef COMPLEX + ZERO, +#endif + NULL, 0, NULL, 0, a + (is + i * lda) * COMPSIZE, lda); + TRSM_KERNEL_RN(min_i, bk, bk, dm1, #ifdef COMPLEX ZERO, @@ -171,8 +177,13 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, min_i = i - is; if (min_i > GEMM_P) min_i = GEMM_P; - NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); - + //NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); + GEMM_BETA(min_i, bk, 0, dm1, +#ifdef COMPLEX + ZERO, +#endif + NULL, 0, NULL, 0, a + (is + i * lda) * COMPSIZE, lda); + TRSM_KERNEL_RN(min_i, bk, bk, dm1, #ifdef COMPLEX ZERO, diff --git a/openblas_config_template.h b/openblas_config_template.h index 1017caff9..3b3435b0e 100644 --- a/openblas_config_template.h +++ b/openblas_config_template.h @@ -1,8 +1,8 @@ /*This is only for "make install" target.*/ -#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) -#define WINDOWS_ABI -#define OS_WINDOWS +#if defined(OPENBLAS_OS_WINNT) || defined(OPENBLAS_OS_CYGWIN_NT) || defined(OPENBLAS_OS_INTERIX) +#define OPENBLAS_WINDOWS_ABI +#define OPENBLAS_OS_WINDOWS #ifdef DOUBLE #define DOUBLE_DEFINED DOUBLE @@ -10,23 +10,23 @@ #endif #endif -#ifdef NEEDBUNDERSCORE +#ifdef OPENBLAS_NEEDBUNDERSCORE #define BLASFUNC(FUNC) FUNC##_ #else #define BLASFUNC(FUNC) FUNC #endif -#ifdef QUAD_PRECISION +#ifdef OPENBLAS_QUAD_PRECISION typedef struct { unsigned long x[2]; } xdouble; -#elif defined EXPRECISION +#elif defined OPENBLAS_EXPRECISION #define xdouble long double #else #define xdouble double #endif -#if defined(OS_WINDOWS) && defined(__64BIT__) +#if defined(OPENBLAS_OS_WINDOWS) && defined(OPENBLAS___64BIT__) typedef long long BLASLONG; typedef unsigned long long BLASULONG; #else @@ -34,7 +34,7 @@ typedef long BLASLONG; typedef unsigned long BLASULONG; #endif -#ifdef USE64BITINT +#ifdef OPENBLAS_USE64BITINT typedef BLASLONG blasint; #else typedef int blasint; |