diff options
author | Martin Kroeker <martin@ruby.chemie.uni-freiburg.de> | 2018-12-03 12:50:14 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-12-03 12:50:14 +0100 |
commit | c5f8aeff2d2ab7c199d0172e6e743eb61d748d7a (patch) | |
tree | e26e1247b0b0b780886fea123dbec027bb558d33 | |
parent | 44c81fd1355cef9b07189ebaad061709be0cd7c6 (diff) | |
parent | 8278cbe7f816f6b1bfb76ffe48b42de352282cfa (diff) | |
download | openblas-c5f8aeff2d2ab7c199d0172e6e743eb61d748d7a.tar.gz openblas-c5f8aeff2d2ab7c199d0172e6e743eb61d748d7a.tar.bz2 openblas-c5f8aeff2d2ab7c199d0172e6e743eb61d748d7a.zip |
Merge branch 'develop' into fbsd12
-rw-r--r-- | CMakeLists.txt | 2 | ||||
-rw-r--r-- | Changelog.txt | 73 | ||||
-rw-r--r-- | Makefile.rule | 2 | ||||
-rw-r--r-- | Makefile.system | 4 | ||||
-rw-r--r-- | cmake/system.cmake | 2 | ||||
-rw-r--r-- | cmake/system_check.cmake | 10 | ||||
-rw-r--r-- | cpuid_power.c | 4 | ||||
-rw-r--r-- | driver/others/memory.c | 8 | ||||
-rw-r--r-- | driver/others/openblas_get_config.c | 5 | ||||
-rw-r--r-- | kernel/mips64/KERNEL | 11 | ||||
-rw-r--r-- | kernel/mips64/KERNEL.LOONGSON3A | 1 | ||||
-rw-r--r-- | kernel/mips64/sgemm_kernel_8x4_ps.S | 36 | ||||
-rw-r--r-- | kernel/x86_64/sgemm_beta_skylakex.c | 2 |
13 files changed, 126 insertions, 34 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index ca951d401..24c169afe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 4.dev) +set(OpenBLAS_PATCH_VERSION 5.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions diff --git a/Changelog.txt b/Changelog.txt index faecd82e3..0dd17a558 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,5 +1,78 @@ OpenBLAS ChangeLog ==================================================================== +Version 0.3.4 +02-Dec-2018 + +common: + * the new, experimental thread-local memory allocation had + inadvertently been left enabled for gmake builds in 0.3.3 + despite the announcement. It is now disabled by default, and + single-threaded builds will keep using the old allocator even + if the USE_TLS option is turned on. + * OpenBLAS will now provide enough buffer space for at least 50 + threads by default. + * The output of openblas_get_config() now contains the version + number. + * A serious thread safety bug in GEMV operation with small M and + large N size has been fixed. + * The code will now automatically call blas_thread_init after a + fork if needed before handling a call to openblas_set_num_threads + * Accesses to parallelized level3 functions from multiple callers + are now serialized to avoid thread races (unless using OpenMP). + This should provide better performance than the known-threadsafe + (but non-default) USE_SIMPLE_THREADED_LEVEL3 option. + * When building LAPACK with gfortran, -frecursive is now (again) + enabled by default to ensure correct behaviour. + * The OpenBLAS version cblas.h now supports both CBLAS_ORDER and + CBLAS_LAYOUT as the name of the matrix row/column order option. + * Externally set LDFLAGS are now passed through to the final compile/link + steps to facilitate setting platform-specific linker flags. + * A potential race condition during the build of LAPACK (that would + usually manifest itself as a failure to build TESTING/MATGEN) has been + fixed. + * xHEMV has been changed to stay single-threaded for small input sizes + where the overhead of multithreading exceeds any possible gains + * CSWAP and ZSWAP have been limited to a single thread except on ARMV8 or + ThunderX hardware with sizable input. + * Linker flags for the PGI compiler have been updated + * Behaviour of AXPY with zero increments is now handled in the C interface, + correcting the result on at least Intel Atom. + * The result matrix from calling SGELSS with an all-zero input matrix is + now zeroed completely. + +x86_64: + * Autodetection of AMD Ryzen2 has been fixed (again). + * CMAKE builds now support labeling of an INTERFACE64=1 build of + the library with the _64 suffix. + * AVX512 version of DGEMM has been added and the AVX512 SGEMM kernel + has been sped up by rewriting with C intrinsics + * Fixed compilation on RHEL5/CENTOS5 (issue with typename __WAIT_STATUS) + +POWER: + * added support for building on AIX (with gcc and GNU tools from AIX Toolbox). + * CPU type detection has been implemented for AIX. + * CPU type detection has been fixed for NETBSD. + +MIPS64: + * AXPY on LOONGSON3A has been corrected to pass "zero increment" utest. + * DSDOT on LOONGSON3A has been fixed. + * the SGEMM microkernel has been hardened against potential data loss. + +ARMV8: + * DYNAMic_ARCH support is now available for 64bit ARM + * cross-compiling for ARMV8 under iOS now works. + * cpu-specific code has been rearranged to make better use of both + hardware commonalities and model-specific compiler optimizations. + * XGENE1 has been removed as a TARGET, superseded by the improved generic + ARMV8 support. + +ARMV7: + * Older assembly mnemonics have been converted to UAL form to allow + building with clang 7.0 + * Cross compiling LAPACKE for Android has been fixed again (broken by + update to LAPACK 3.7.0 some while ago). + +==================================================================== Version 0.3.3 31-Aug-2018 diff --git a/Makefile.rule b/Makefile.rule index d97607f2e..0d5b83b39 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.4.dev +VERSION = 0.3.5.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library diff --git a/Makefile.system b/Makefile.system index 25ac38dc0..3987460ec 100644 --- a/Makefile.system +++ b/Makefile.system @@ -18,7 +18,7 @@ else ifeq ($(ARCH), i386) override ARCH=x86 else ifeq ($(ARCH), aarch64) override ARCH=arm64 -endif +endif NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib @@ -1042,6 +1042,8 @@ ifdef USE_TLS CCOMMON_OPT += -DUSE_TLS endif +CCOMMON_OPT += -DVERSION=\"$(VERSION)\" + ifndef SYMBOLPREFIX SYMBOLPREFIX = endif diff --git a/cmake/system.cmake b/cmake/system.cmake index 61f96edb0..d803bb9eb 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -310,6 +310,8 @@ if (MIXED_MEMORY_ALLOCATION) set(CCOMMON_OPT "${CCOMMON_OPT} -DMIXED_MEMORY_ALLOCATION") endif () +set(CCOMMON_OPT "${CCOMMON_OPT} -DVERSION=\"\\\"${OpenBLAS_VERSION}\\\"\"") + set(REVISION "-r${OpenBLAS_VERSION}") set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION}) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index fe30c7600..6b602c1b0 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -10,6 +10,16 @@ if (${HOST_OS} STREQUAL "WINDOWS") set(HOST_OS WINNT) endif () +if (${HOST_OS} STREQUAL "LINUX") +# check if we're building natively on Android (TERMUX) + EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM) + if(${OPERATING_SYSTEM} MATCHES "Android") + set(HOST_OS ANDROID) + endif(${OPERATING_SYSTEM} MATCHES "Android") +endif() + + + if(CMAKE_COMPILER_IS_GNUCC AND WIN32) execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine OUTPUT_VARIABLE OPENBLAS_GCC_TARGET_MACHINE diff --git a/cpuid_power.c b/cpuid_power.c index fc36f8e2c..23e98ebb0 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -175,9 +175,9 @@ int detect(void){ return CPUTYPE_PPC970; #endif -#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) +#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) int id; -id = __asm __volatile("mfpvr %0" : "=r"(id)); +__asm __volatile("mfpvr %0" : "=r"(id)); switch ( id >> 16 ) { case 0x4e: // POWER9 return CPUTYPE_POWER8; diff --git a/driver/others/memory.c b/driver/others/memory.c index 25f198623..36815a39c 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2586,7 +2586,7 @@ void *blas_memory_alloc(int procpos){ printf("Alloc Start ...\n"); #endif -#if defined(WHEREAMI) && !defined(USE_OPENMP) +/* #if defined(WHEREAMI) && !defined(USE_OPENMP) mypos = WhereAmI(); @@ -2596,12 +2596,12 @@ void *blas_memory_alloc(int procpos){ do { if (!memory[position].used && (memory[position].pos == mypos)) { LOCK_COMMAND(&alloc_lock); -/* blas_lock(&memory[position].lock);*/ +// blas_lock(&memory[position].lock); if (!memory[position].used) goto allocation; UNLOCK_COMMAND(&alloc_lock); -/* blas_unlock(&memory[position].lock);*/ +// blas_unlock(&memory[position].lock); } position ++; @@ -2609,7 +2609,7 @@ void *blas_memory_alloc(int procpos){ } while (position < NUM_BUFFERS); -#endif +#endif */ position = 0; diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c index 3e87f2cc2..eca494dca 100644 --- a/driver/others/openblas_get_config.c +++ b/driver/others/openblas_get_config.c @@ -42,8 +42,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif static char* openblas_config_str="" +"OpenBLAS " + VERSION +" " #ifdef USE64BITINT - "USE64BITINT " + " USE64BITINT " #endif #ifdef NO_CBLAS "NO_CBLAS " diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL index e257dcfc9..61da7445f 100644 --- a/kernel/mips64/KERNEL +++ b/kernel/mips64/KERNEL @@ -1,12 +1,13 @@ CAXPYKERNEL = ../mips/zaxpy.c ZAXPYKERNEL = ../mips/zaxpy.c -SROTKERNEL = ../mips/rot.c -DROTKERNEL = ../mips/rot.c -CROTKERNEL = ../mips/zrot.c -ZROTKERNEL = ../mips/zrot.c +SROTKERNEL = ../mips/rot.c +DROTKERNEL = ../mips/rot.c +CROTKERNEL = ../mips/zrot.c +ZROTKERNEL = ../mips/zrot.c CSWAPKERNEL = ../mips/zswap.c ZSWAPKERNEL = ../mips/zswap.c - + + ifndef SNRM2KERNEL SNRM2KERNEL = snrm2.S endif diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3A index 2d03ad7fa..0298faaad 100644 --- a/kernel/mips64/KERNEL.LOONGSON3A +++ b/kernel/mips64/KERNEL.LOONGSON3A @@ -63,6 +63,7 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DSDOTKERNEL = ../mips/dot.c diff --git a/kernel/mips64/sgemm_kernel_8x4_ps.S b/kernel/mips64/sgemm_kernel_8x4_ps.S index 37b20a880..82703ff5d 100644 --- a/kernel/mips64/sgemm_kernel_8x4_ps.S +++ b/kernel/mips64/sgemm_kernel_8x4_ps.S @@ -146,11 +146,11 @@ sd $21, 40($sp) sd $22, 48($sp) - ST $f24, 56($sp) - ST $f25, 64($sp) - ST $f26, 72($sp) - ST $f27, 80($sp) - ST $f28, 88($sp) + sdc1 $f24, 56($sp) + sdc1 $f25, 64($sp) + sdc1 $f26, 72($sp) + sdc1 $f27, 80($sp) + sdc1 $f28, 88($sp) #if defined(TRMMKERNEL) sd $23, 96($sp) @@ -161,10 +161,10 @@ #endif #ifndef __64BIT__ - ST $f20,120($sp) - ST $f21,128($sp) - ST $f22,136($sp) - ST $f23,144($sp) + sdc1 $f20,120($sp) + sdc1 $f21,128($sp) + sdc1 $f22,136($sp) + sdc1 $f23,144($sp) #endif .align 4 @@ -7766,11 +7766,11 @@ ld $21, 40($sp) ld $22, 48($sp) - LD $f24, 56($sp) - LD $f25, 64($sp) - LD $f26, 72($sp) - LD $f27, 80($sp) - LD $f28, 88($sp) + ldc1 $f24, 56($sp) + ldc1 $f25, 64($sp) + ldc1 $f26, 72($sp) + ldc1 $f27, 80($sp) + ldc1 $f28, 88($sp) #if defined(TRMMKERNEL) ld $23, 96($sp) @@ -7779,10 +7779,10 @@ #endif #ifndef __64BIT__ - LD $f20,120($sp) - LD $f21,128($sp) - LD $f22,136($sp) - LD $f23,144($sp) + ldc1 $f20,120($sp) + ldc1 $f21,128($sp) + ldc1 $f22,136($sp) + ldc1 $f23,144($sp) #endif daddiu $sp,$sp,STACKSIZE diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c index 4e40acadf..498c46f0d 100644 --- a/kernel/x86_64/sgemm_beta_skylakex.c +++ b/kernel/x86_64/sgemm_beta_skylakex.c @@ -56,7 +56,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, } if (n == 0 || m == 0) - return; + return 0; c_offset = c; |