summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMartin Kroeker <martin@ruby.chemie.uni-freiburg.de>2018-12-03 12:50:14 +0100
committerGitHub <noreply@github.com>2018-12-03 12:50:14 +0100
commitc5f8aeff2d2ab7c199d0172e6e743eb61d748d7a (patch)
treee26e1247b0b0b780886fea123dbec027bb558d33
parent44c81fd1355cef9b07189ebaad061709be0cd7c6 (diff)
parent8278cbe7f816f6b1bfb76ffe48b42de352282cfa (diff)
downloadopenblas-c5f8aeff2d2ab7c199d0172e6e743eb61d748d7a.tar.gz
openblas-c5f8aeff2d2ab7c199d0172e6e743eb61d748d7a.tar.bz2
openblas-c5f8aeff2d2ab7c199d0172e6e743eb61d748d7a.zip
Merge branch 'develop' into fbsd12
-rw-r--r--CMakeLists.txt2
-rw-r--r--Changelog.txt73
-rw-r--r--Makefile.rule2
-rw-r--r--Makefile.system4
-rw-r--r--cmake/system.cmake2
-rw-r--r--cmake/system_check.cmake10
-rw-r--r--cpuid_power.c4
-rw-r--r--driver/others/memory.c8
-rw-r--r--driver/others/openblas_get_config.c5
-rw-r--r--kernel/mips64/KERNEL11
-rw-r--r--kernel/mips64/KERNEL.LOONGSON3A1
-rw-r--r--kernel/mips64/sgemm_kernel_8x4_ps.S36
-rw-r--r--kernel/x86_64/sgemm_beta_skylakex.c2
13 files changed, 126 insertions, 34 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ca951d401..24c169afe 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 4.dev)
+set(OpenBLAS_PATCH_VERSION 5.dev)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
# Adhere to GNU filesystem layout conventions
diff --git a/Changelog.txt b/Changelog.txt
index faecd82e3..0dd17a558 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,5 +1,78 @@
OpenBLAS ChangeLog
====================================================================
+Version 0.3.4
+02-Dec-2018
+
+common:
+ * the new, experimental thread-local memory allocation had
+ inadvertently been left enabled for gmake builds in 0.3.3
+ despite the announcement. It is now disabled by default, and
+ single-threaded builds will keep using the old allocator even
+ if the USE_TLS option is turned on.
+ * OpenBLAS will now provide enough buffer space for at least 50
+ threads by default.
+ * The output of openblas_get_config() now contains the version
+ number.
+ * A serious thread safety bug in GEMV operation with small M and
+ large N size has been fixed.
+ * The code will now automatically call blas_thread_init after a
+ fork if needed before handling a call to openblas_set_num_threads
+ * Accesses to parallelized level3 functions from multiple callers
+ are now serialized to avoid thread races (unless using OpenMP).
+ This should provide better performance than the known-threadsafe
+ (but non-default) USE_SIMPLE_THREADED_LEVEL3 option.
+ * When building LAPACK with gfortran, -frecursive is now (again)
+ enabled by default to ensure correct behaviour.
+ * The OpenBLAS version of cblas.h now supports both CBLAS_ORDER and
+ CBLAS_LAYOUT as the name of the matrix row/column order option.
+ * Externally set LDFLAGS are now passed through to the final compile/link
+ steps to facilitate setting platform-specific linker flags.
+ * A potential race condition during the build of LAPACK (that would
+ usually manifest itself as a failure to build TESTING/MATGEN) has been
+ fixed.
+ * xHEMV has been changed to stay single-threaded for small input sizes
+ where the overhead of multithreading exceeds any possible gains
+ * CSWAP and ZSWAP have been limited to a single thread except on ARMV8 or
+ ThunderX hardware with sizable input.
+ * Linker flags for the PGI compiler have been updated
+ * Behaviour of AXPY with zero increments is now handled in the C interface,
+ correcting the result on at least Intel Atom.
+ * The result matrix from calling SGELSS with an all-zero input matrix is
+ now zeroed completely.
+
+x86_64:
+ * Autodetection of AMD Ryzen2 has been fixed (again).
+ * CMAKE builds now support labeling of an INTERFACE64=1 build of
+ the library with the _64 suffix.
+ * AVX512 version of DGEMM has been added and the AVX512 SGEMM kernel
+ has been sped up by rewriting with C intrinsics
+ * Fixed compilation on RHEL5/CENTOS5 (issue with typename __WAIT_STATUS)
+
+POWER:
+ * added support for building on AIX (with gcc and GNU tools from AIX Toolbox).
+ * CPU type detection has been implemented for AIX.
+ * CPU type detection has been fixed for NETBSD.
+
+MIPS64:
+ * AXPY on LOONGSON3A has been corrected to pass "zero increment" utest.
+ * DSDOT on LOONGSON3A has been fixed.
+ * the SGEMM microkernel has been hardened against potential data loss.
+
+ARMV8:
+ * DYNAMIC_ARCH support is now available for 64bit ARM
+ * cross-compiling for ARMV8 under iOS now works.
+ * cpu-specific code has been rearranged to make better use of both
+ hardware commonalities and model-specific compiler optimizations.
+ * XGENE1 has been removed as a TARGET, superseded by the improved generic
+ ARMV8 support.
+
+ARMV7:
+ * Older assembly mnemonics have been converted to UAL form to allow
+ building with clang 7.0
+ * Cross compiling LAPACKE for Android has been fixed again (broken by
+ update to LAPACK 3.7.0 some while ago).
+
+====================================================================
Version 0.3.3
31-Aug-2018
diff --git a/Makefile.rule b/Makefile.rule
index d97607f2e..0d5b83b39 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
#
# This library's version
-VERSION = 0.3.4.dev
+VERSION = 0.3.5.dev
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
diff --git a/Makefile.system b/Makefile.system
index 25ac38dc0..3987460ec 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -18,7 +18,7 @@ else ifeq ($(ARCH), i386)
override ARCH=x86
else ifeq ($(ARCH), aarch64)
override ARCH=arm64
-endif
+endif
NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
@@ -1042,6 +1042,8 @@ ifdef USE_TLS
CCOMMON_OPT += -DUSE_TLS
endif
+CCOMMON_OPT += -DVERSION=\"$(VERSION)\"
+
ifndef SYMBOLPREFIX
SYMBOLPREFIX =
endif
diff --git a/cmake/system.cmake b/cmake/system.cmake
index 61f96edb0..d803bb9eb 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -310,6 +310,8 @@ if (MIXED_MEMORY_ALLOCATION)
set(CCOMMON_OPT "${CCOMMON_OPT} -DMIXED_MEMORY_ALLOCATION")
endif ()
+set(CCOMMON_OPT "${CCOMMON_OPT} -DVERSION=\"\\\"${OpenBLAS_VERSION}\\\"\"")
+
set(REVISION "-r${OpenBLAS_VERSION}")
set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION})
diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake
index fe30c7600..6b602c1b0 100644
--- a/cmake/system_check.cmake
+++ b/cmake/system_check.cmake
@@ -10,6 +10,16 @@ if (${HOST_OS} STREQUAL "WINDOWS")
set(HOST_OS WINNT)
endif ()
+if ("${HOST_OS}" STREQUAL "LINUX")
+# Check if we're building natively on Android (TERMUX). Expansions are quoted so if() stays well-formed when a value is empty.
+ execute_process(COMMAND uname -o OUTPUT_VARIABLE OPERATING_SYSTEM OUTPUT_STRIP_TRAILING_WHITESPACE)
+ if ("${OPERATING_SYSTEM}" MATCHES "Android")
+ set(HOST_OS ANDROID)
+ endif ()
+endif ()
+
+
+
if(CMAKE_COMPILER_IS_GNUCC AND WIN32)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine
OUTPUT_VARIABLE OPENBLAS_GCC_TARGET_MACHINE
diff --git a/cpuid_power.c b/cpuid_power.c
index fc36f8e2c..23e98ebb0 100644
--- a/cpuid_power.c
+++ b/cpuid_power.c
@@ -175,9 +175,9 @@ int detect(void){
return CPUTYPE_PPC970;
#endif
-#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__)
+#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
int id;
-id = __asm __volatile("mfpvr %0" : "=r"(id));
+__asm __volatile("mfpvr %0" : "=r"(id));
switch ( id >> 16 ) {
case 0x4e: // POWER9
return CPUTYPE_POWER8;
diff --git a/driver/others/memory.c b/driver/others/memory.c
index 25f198623..36815a39c 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -2586,7 +2586,7 @@ void *blas_memory_alloc(int procpos){
printf("Alloc Start ...\n");
#endif
-#if defined(WHEREAMI) && !defined(USE_OPENMP)
+/* #if defined(WHEREAMI) && !defined(USE_OPENMP)
mypos = WhereAmI();
@@ -2596,12 +2596,12 @@ void *blas_memory_alloc(int procpos){
do {
if (!memory[position].used && (memory[position].pos == mypos)) {
LOCK_COMMAND(&alloc_lock);
-/* blas_lock(&memory[position].lock);*/
+// blas_lock(&memory[position].lock);
if (!memory[position].used) goto allocation;
UNLOCK_COMMAND(&alloc_lock);
-/* blas_unlock(&memory[position].lock);*/
+// blas_unlock(&memory[position].lock);
}
position ++;
@@ -2609,7 +2609,7 @@ void *blas_memory_alloc(int procpos){
} while (position < NUM_BUFFERS);
-#endif
+#endif */
position = 0;
diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c
index 3e87f2cc2..eca494dca 100644
--- a/driver/others/openblas_get_config.c
+++ b/driver/others/openblas_get_config.c
@@ -42,8 +42,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
static char* openblas_config_str=""
+"OpenBLAS "
+ VERSION
+" "
#ifdef USE64BITINT
- "USE64BITINT "
+ " USE64BITINT "
#endif
#ifdef NO_CBLAS
"NO_CBLAS "
diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL
index e257dcfc9..61da7445f 100644
--- a/kernel/mips64/KERNEL
+++ b/kernel/mips64/KERNEL
@@ -1,12 +1,13 @@
CAXPYKERNEL = ../mips/zaxpy.c
ZAXPYKERNEL = ../mips/zaxpy.c
-SROTKERNEL = ../mips/rot.c
-DROTKERNEL = ../mips/rot.c
-CROTKERNEL = ../mips/zrot.c
-ZROTKERNEL = ../mips/zrot.c
+SROTKERNEL = ../mips/rot.c
+DROTKERNEL = ../mips/rot.c
+CROTKERNEL = ../mips/zrot.c
+ZROTKERNEL = ../mips/zrot.c
CSWAPKERNEL = ../mips/zswap.c
ZSWAPKERNEL = ../mips/zswap.c
-
+
+
ifndef SNRM2KERNEL
SNRM2KERNEL = snrm2.S
endif
diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3A
index 2d03ad7fa..0298faaad 100644
--- a/kernel/mips64/KERNEL.LOONGSON3A
+++ b/kernel/mips64/KERNEL.LOONGSON3A
@@ -63,6 +63,7 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+DSDOTKERNEL = ../mips/dot.c
diff --git a/kernel/mips64/sgemm_kernel_8x4_ps.S b/kernel/mips64/sgemm_kernel_8x4_ps.S
index 37b20a880..82703ff5d 100644
--- a/kernel/mips64/sgemm_kernel_8x4_ps.S
+++ b/kernel/mips64/sgemm_kernel_8x4_ps.S
@@ -146,11 +146,11 @@
sd $21, 40($sp)
sd $22, 48($sp)
- ST $f24, 56($sp)
- ST $f25, 64($sp)
- ST $f26, 72($sp)
- ST $f27, 80($sp)
- ST $f28, 88($sp)
+ sdc1 $f24, 56($sp)
+ sdc1 $f25, 64($sp)
+ sdc1 $f26, 72($sp)
+ sdc1 $f27, 80($sp)
+ sdc1 $f28, 88($sp)
#if defined(TRMMKERNEL)
sd $23, 96($sp)
@@ -161,10 +161,10 @@
#endif
#ifndef __64BIT__
- ST $f20,120($sp)
- ST $f21,128($sp)
- ST $f22,136($sp)
- ST $f23,144($sp)
+ sdc1 $f20,120($sp)
+ sdc1 $f21,128($sp)
+ sdc1 $f22,136($sp)
+ sdc1 $f23,144($sp)
#endif
.align 4
@@ -7766,11 +7766,11 @@
ld $21, 40($sp)
ld $22, 48($sp)
- LD $f24, 56($sp)
- LD $f25, 64($sp)
- LD $f26, 72($sp)
- LD $f27, 80($sp)
- LD $f28, 88($sp)
+ ldc1 $f24, 56($sp)
+ ldc1 $f25, 64($sp)
+ ldc1 $f26, 72($sp)
+ ldc1 $f27, 80($sp)
+ ldc1 $f28, 88($sp)
#if defined(TRMMKERNEL)
ld $23, 96($sp)
@@ -7779,10 +7779,10 @@
#endif
#ifndef __64BIT__
- LD $f20,120($sp)
- LD $f21,128($sp)
- LD $f22,136($sp)
- LD $f23,144($sp)
+ ldc1 $f20,120($sp)
+ ldc1 $f21,128($sp)
+ ldc1 $f22,136($sp)
+ ldc1 $f23,144($sp)
#endif
daddiu $sp,$sp,STACKSIZE
diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c
index 4e40acadf..498c46f0d 100644
--- a/kernel/x86_64/sgemm_beta_skylakex.c
+++ b/kernel/x86_64/sgemm_beta_skylakex.c
@@ -56,7 +56,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
}
if (n == 0 || m == 0)
- return;
+ return 0;
c_offset = c;