/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ #ifndef COMMON_X86 #define COMMON_X86 #ifndef ASSEMBLER #ifdef C_SUN #define __asm__ __asm #define __volatile__ #endif #ifdef HAVE_SSE2 #define MB __asm__ __volatile__ ("mfence"); #define WMB __asm__ __volatile__ ("sfence"); #else #define MB #define WMB #endif static void __inline blas_lock(volatile BLASULONG *address){ int ret; do { while (*address) {YIELDING;}; __asm__ __volatile__( "xchgl %0, %1\n" : "=r"(ret), "=m"(*address) : "0"(1), "m"(*address) : "memory"); } while (ret); } static __inline BLASULONG rpcc(void){ BLASULONG a, d; __asm__ __volatile__ ("rdtsc" : "=a" (a), "=d" (d)); return ((BLASULONG)a + ((BLASULONG)d << 32)); } #define RPCC64BIT static __inline BLASULONG getstackaddr(void){ BLASULONG addr; __asm__ __volatile__ ("movq %%rsp, %0" : "=r"(addr) : : "memory"); return addr; } static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ __asm__ __volatile__("cpuid" : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "0" (op)); } #define WHEREAMI static inline int WhereAmI(void){ int eax, ebx, ecx, edx; int apicid; cpuid(1, &eax, &ebx, &ecx, &edx); apicid = BITMASK(ebx, 24, 0xff); return apicid; } #ifdef CORE_BARCELONA #define IFLUSH gotoblas_iflush() #define IFLUSH_HALF gotoblas_iflush_half() #endif #ifdef ENABLE_SSE_EXCEPTION #define IDEBUG_START \ { \ unsigned int fp_sse_mode, new_fp_mode; \ __asm__ __volatile__ ("stmxcsr %0" : "=m" (fp_sse_mode) : ); \ new_fp_mode = fp_sse_mode & ~0xd00; \ __asm__ __volatile__ ("ldmxcsr %0" : : "m" (new_fp_mode) ); #define IDEBUG_END \ __asm__ __volatile__ ("ldmxcsr %0" : : "m" (fp_sse_mode) ); \ } #endif #ifdef XDOUBLE #define GET_IMAGE(res) __asm__ __volatile__("fstpt %0" : "=m"(res) : : "memory") #elif defined(DOUBLE) #define GET_IMAGE(res) __asm__ __volatile__("movsd %%xmm1, %0" : "=m"(res) : : "memory") #else #define GET_IMAGE(res) __asm__ __volatile__("movss %%xmm1, %0" : "=m"(res) : : "memory") #endif #define GET_IMAGE_CANCEL #ifdef SMP #ifdef USE64BITINT static __inline blasint blas_quickdivide(blasint x, blasint y){ return x / y; } #else extern unsigned int blas_quick_divide_table[]; static __inline int blas_quickdivide(unsigned int x, unsigned int y){ unsigned int result; if (y <= 1) return x; y = blas_quick_divide_table[y]; __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); return result; } #endif #endif #endif #ifndef PAGESIZE #define PAGESIZE ( 4 << 10) #endif #define HUGE_PAGESIZE ( 2 << 20) #define BUFFER_SIZE (32 << 20) #define SEEK_ADDRESS #ifdef F_INTERFACE_G77 #define RETURN_BY_STACK #define NEED_F2CCONV #endif #ifdef F_INTERFACE_G95 #define RETURN_BY_PACKED #endif #ifdef F_INTERFACE_GFORT #ifdef OS_WINDOWS #ifndef DOUBLE #define RETURN_BY_REGS #else #define RETURN_BY_STACK #endif #else #define RETURN_BY_PACKED #endif #endif #ifdef F_INTERFACE_INTEL #define RETURN_BY_STACK #endif #ifdef F_INTERFACE_FUJITSU #define RETURN_BY_STACK #endif #ifdef F_INTERFACE_PGI #define RETURN_BY_STACK #endif #ifdef F_INTERFACE_PATHSCALE #define RETURN_BY_PACKED #endif #ifdef F_INTERFACE_SUN #define RETURN_BY_PACKED #endif #ifdef ASSEMBLER #if defined(PILEDRIVER) || defined(BULLDOZER) //Enable some optimazation for barcelona. #define BARCELONA_OPTIMIZATION #endif #if defined(HAVE_3DNOW) #define EMMS femms #elif defined(HAVE_MMX) #define EMMS emms #endif #ifndef EMMS #define EMMS #endif #define BRANCH .byte 0x3e #define NOBRANCH .byte 0x2e #define PADDING .byte 0x66 #ifdef OS_WINDOWS #define ARG1 %rcx #define ARG2 %rdx #define ARG3 %r8 #define ARG4 %r9 #else #define ARG1 %rdi #define ARG2 %rsi #define ARG3 %rdx #define ARG4 %rcx #define ARG5 %r8 #define ARG6 %r9 #endif #ifndef COMPLEX #ifdef XDOUBLE #define LOCAL_BUFFER_SIZE QLOCAL_BUFFER_SIZE #elif defined DOUBLE #define LOCAL_BUFFER_SIZE DLOCAL_BUFFER_SIZE #else #define LOCAL_BUFFER_SIZE SLOCAL_BUFFER_SIZE #endif #else #ifdef XDOUBLE #define LOCAL_BUFFER_SIZE XLOCAL_BUFFER_SIZE #elif defined DOUBLE #define LOCAL_BUFFER_SIZE ZLOCAL_BUFFER_SIZE #else #define LOCAL_BUFFER_SIZE CLOCAL_BUFFER_SIZE #endif #endif #if defined(OS_WINDOWS) #if LOCAL_BUFFER_SIZE > 16384 #define STACK_TOUCHING \ movl $0, 4096 * 4(%rsp);\ movl $0, 4096 * 3(%rsp);\ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif LOCAL_BUFFER_SIZE > 12288 #define STACK_TOUCHING \ movl $0, 4096 * 3(%rsp);\ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif LOCAL_BUFFER_SIZE > 8192 #define STACK_TOUCHING \ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif LOCAL_BUFFER_SIZE > 4096 #define STACK_TOUCHING \ movl $0, 4096 * 1(%rsp); #else #define STACK_TOUCHING #endif #else #define STACK_TOUCHING #endif #if defined(CORE2) #define movapd movaps #define andpd andps #define movlpd movlps #define movhpd movhps #endif #ifndef F_INTERFACE #define REALNAME ASMNAME #else #define REALNAME ASMFNAME #endif #ifdef OS_DARWIN #define PROLOGUE .text;.align 5; .globl REALNAME; REALNAME: #define EPILOGUE .subsections_via_symbols #define PROFCODE #endif #ifdef OS_WINDOWS #define SAVEREGISTERS \ subq $256, %rsp;\ movups %xmm6, 0(%rsp);\ movups %xmm7, 16(%rsp);\ movups %xmm8, 32(%rsp);\ movups %xmm9, 48(%rsp);\ movups %xmm10, 64(%rsp);\ movups %xmm11, 80(%rsp);\ movups %xmm12, 96(%rsp);\ movups %xmm13, 112(%rsp);\ movups %xmm14, 128(%rsp);\ movups %xmm15, 144(%rsp) #define RESTOREREGISTERS \ movups 0(%rsp), %xmm6;\ movups 16(%rsp), %xmm7;\ movups 32(%rsp), %xmm8;\ movups 48(%rsp), %xmm9;\ movups 64(%rsp), %xmm10;\ movups 80(%rsp), %xmm11;\ movups 96(%rsp), %xmm12;\ movups 112(%rsp), %xmm13;\ movups 128(%rsp), %xmm14;\ movups 144(%rsp), %xmm15;\ addq $256, %rsp #else #define SAVEREGISTERS #define RESTOREREGISTERS #endif #if defined(OS_WINDOWS) && !defined(C_PGI) #define PROLOGUE \ .text; \ .align 16; \ .globl REALNAME ;\ .def REALNAME;.scl 2;.type 32;.endef; \ REALNAME: #define PROFCODE #define EPILOGUE .end REALNAME #endif #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI) #define PROLOGUE \ .text; \ .align 512; \ .globl REALNAME ;\ .type REALNAME, @function; \ REALNAME: #ifdef PROFILE #define PROFCODE call *mcount@GOTPCREL(%rip) #else #define PROFCODE #endif #define EPILOGUE \ .size REALNAME, .-REALNAME; \ .section .note.GNU-stack,"",@progbits #endif #endif #ifdef XDOUBLE #define FLD fldt #define FST fstpt #define MOVQ movq #elif defined(DOUBLE) #define FLD fldl #define FST fstpl #define FSTU fstl #define FMUL fmull #define FADD faddl #define MOVSD movsd #define MULSD mulsd #define MULPD mulpd #define CMPEQPD cmpeqpd #define COMISD comisd #define PSRLQ psrlq #define ANDPD andpd #define ADDPD addpd #define ADDSD addsd #define SUBPD subpd #define SUBSD subsd #define MOVQ movq #define MOVUPD movupd #define XORPD xorpd #else #define FLD flds #define FST fstps #define FSTU fsts #define FMUL fmuls #define FADD fadds #define MOVSD movss #define MULSD mulss #define MULPD mulps #define CMPEQPD cmpeqps #define COMISD comiss #define PSRLQ psrld #define ANDPD andps #define ADDPD addps #define ADDSD addss #define SUBPD subps #define SUBSD subss #define MOVQ movd #define MOVUPD movups #define XORPD xorps #endif #define HALT hlt #ifdef OS_DARWIN #define ALIGN_2 .align 2 #define ALIGN_3 .align 3 #define ALIGN_4 .align 4 #define ALIGN_5 .align 5 #define ffreep fstp #endif #ifndef ALIGN_2 #define ALIGN_2 .align 4 #endif #ifndef ALIGN_3 #define ALIGN_3 .align 8 #endif #ifndef ALIGN_4 #define ALIGN_4 .align 16 #endif #ifndef ALIGN_5 #define ALIGN_5 .align 32 #endif #ifndef ALIGN_6 #define ALIGN_6 .align 64 #endif // ffreep %st(0). // Because Clang didn't support ffreep, we directly use the opcode. // Please check out http://www.sandpile.org/x86/opc_fpu.htm #ifndef ffreep #define ffreep .byte 0xdf, 0xc0 # #endif #endif