diff options
Diffstat (limited to 'src/pal/src/arch/amd64/optimizedtls.cpp')
-rw-r--r-- | src/pal/src/arch/amd64/optimizedtls.cpp | 237 |
1 files changed, 237 insertions, 0 deletions
diff --git a/src/pal/src/arch/amd64/optimizedtls.cpp b/src/pal/src/arch/amd64/optimizedtls.cpp new file mode 100644 index 0000000000..cd89db6b0a --- /dev/null +++ b/src/pal/src/arch/amd64/optimizedtls.cpp @@ -0,0 +1,237 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +/*++ + + + +Module Name: + + optimizedtls.cpp + +Abstract: + + Implementation of platform-specific Thread local storage functions. + + + +--*/ + +#include "pal/thread.hpp" +#include "pal/malloc.hpp" + +#include <pthread.h> + +#include "pal/dbgmsg.h" +#include "pal/misc.h" +#include "pal/debug.h" + +#include <stddef.h> + +using namespace CorUnix; + +SET_DEFAULT_DEBUG_CHANNEL(THREAD); + +#if defined(USE_OPTIMIZEDTLSGETTER) + +#define PAL_safe_offsetof(s,m) ((size_t)((ptrdiff_t)&(char&)(((s *)64)->m))-64) + +/*++ +Function: + CorUnix::TLSMakeOptimizedGetter + + Creates a platform-optimized version of TlsGetValue compiled + for a particular index. + + Generates the hot part of CorUnix::InternalGetCurrentThread + as a chunk of highly optimized machine-specific code at runtime. + + Check the difference between CorUnix::InternalGetCurrentThread and + CorUnix::InternalGetCurrentThreadSlow to see the C/C++ code that matches + the code generated by this function. +--*/ +PAL_POPTIMIZEDTLSGETTER +CorUnix::TLSMakeOptimizedGetter( + IN CPalThread* pThread, + IN DWORD dwTlsIndex) +{ +#ifdef BIT64 +#pragma unused(pThread, dwTlsIndex) + ERROR("TLSMakeOptimizedGetter not rewritten for amd64 yet."); + return NULL; +#else + PAL_POPTIMIZEDTLSGETTER Ret = NULL; + BYTE* p; + int i = 0; + +#ifdef __APPLE__ +#define TLS_OPTIMIZED_GETTER_SIZE 118 +#else +#define TLS_OPTIMIZED_GETTER_SIZE 115 +#endif + + p = (BYTE*)InternalMalloc(pThread, TLS_OPTIMIZED_GETTER_SIZE * sizeof(BYTE)); + + if (p == NULL) + { + return Ret; + } + + // Need to preserve %ecx, %edx, and %esi registers as specified in + // GetThreadGeneric(void) in vm/amd64/asmhelpers.s + p[i++] = 0x51; // push %ecx + p[i++] = 0x52; // push %edx + p[i++] = 0x89; // mov %esp,%eax // %eax = sp; + p[i++] = 0xe0; + p[i++] = 0xc1; // shr $0x11,%eax // sp >> 17; + p[i++] = 0xe8; + p[i++] = 0x11; + p[i++] = 0x89; // mov %eax,%edx // key = sp >> 17; + p[i++] = 0xc2; + p[i++] = 0xc1; // sar $0x7,%edx // key >> 7; + p[i++] = 0xfa; + p[i++] = 0x07; + p[i++] = 0x29; // sub %edx,%eax // key -= key >> 7; + p[i++] = 0xd0; + p[i++] = 0x89; // mov %eax,%edx + p[i++] = 0xc2; + p[i++] = 0xc1; // sar $0x5,%edx // key >> 5; + p[i++] = 0xfa; + p[i++] = 0x05; + p[i++] = 0x29; // sub %edx,%eax // key -= key >> 5; + p[i++] = 0xd0; + p[i++] = 0x89; // mov %eax,%edx + p[i++] = 0xc2; + p[i++] = 0xc1; // sar $0x3,%edx // key >> 3; + p[i++] = 0xfa; + p[i++] = 0x03; + p[i++] = 0x29; // sub %edx,%eax // key -= key >> 3; + p[i++] = 0xd0; + p[i++] = 0x25; // and $0xff,%eax // key &= 0xFF; + p[i++] = 0xff; + p[i++] = 0x00; + p[i++] = 0x00; + p[i++] = 0x00; + p[i++] = 0x8b; // mov (flush_counter),%ecx // %ecx = counter = flush_counter; + p[i++] = 0x0d; + *((DWORD*) &p[i]) = (DWORD)&flush_counter; + i += sizeof(DWORD); + p[i++] = 0x8b; // mov (thread_hints,%eax,4),%eax // %edx = pThread = thread_hints[key]; + p[i++] = 0x14; + p[i++] = 0x85; + *((DWORD*) &p[i]) = (DWORD)&thread_hints; + i += sizeof(DWORD); + p[i++] = 0x39; // cmp %esp,offsetof(CPalThread,tlsInfo)+offsetof(CThreadTLSInfo,minStack)(%edx) + // if ((size_t)pThread->tlsInfo.minStack <= sp) + p[i++] = 0xa2; + *((DWORD*) &p[i]) = (DWORD)(PAL_safe_offsetof(CPalThread,tlsInfo)+PAL_safe_offsetof(CThreadTLSInfo,minStack)); + i += sizeof(DWORD); + p[i++] = 0x77; // ja CallInternalGetCurrentThreadSlow: + p[i++] = 0x19; + p[i++] = 0x3b; // cmp offsetof(CPalThread,tlsInfo)+offsetof(CThreadTLSInfo,maxStack)(%edx),%esp + // if (sp < (size_t)pThread->tlsInfo.maxStack) + p[i++] = 0xa2; + *((DWORD*) &p[i]) = (DWORD)(PAL_safe_offsetof(CPalThread,tlsInfo)+PAL_safe_offsetof(CThreadTLSInfo,maxStack)); + i += sizeof(DWORD); + p[i++] = 0x73; // jae CallInternalGetCurrentThreadSlow: + p[i++] = 0x11; + p[i++] = 0x39; // cmp (flush_counter),%ecx // if (counter == flush_counter) + p[i++] = 0x0d; + *((DWORD*) &p[i]) = (DWORD)&flush_counter; + i += sizeof(DWORD); + p[i++] = 0x75; // jne CallInternalGetCurrentThreadSlow: + p[i++] = 0x09; + if (dwTlsIndex != THREAD_OBJECT_TLS_INDEX) + { + p[i++] = 0x8b; // mov offsetof(pThread->tlsSlots[dwTlsIndex])(%edx),%eax // %eax = pThread->tlsSlots[dwTlsIndex]; + p[i++] = 0x82; + *((DWORD*) &p[i]) = (DWORD)(PAL_safe_offsetof(CPalThread,tlsInfo)+PAL_safe_offsetof(CThreadTLSInfo,tlsSlots[dwTlsIndex])); + i += sizeof(DWORD); + } + else + { + p[i++] = 0x89; // mov %edx,%eax // %eax = pThread; + p[i++] = 0xd0; + p[i++] = 0x90; // nop + p[i++] = 0x90; // nop + p[i++] = 0x90; // nop + p[i++] = 0x90; // nop + } + p[i++] = 0x5a; // pop %edx + p[i++] = 0x59; // pop %ecx + p[i++] = 0xc3; // ret + // CallInternalGetCurrentThreadSlow: + p[i++] = 0x5a; // pop %edx + p[i++] = 0x59; // pop %ecx + p[i++] = 0x8d; // lea (thread_hints,%eax,4),%eax // %eax = &thread_hints[key]; + p[i++] = 0x04; + p[i++] = 0x85; + *((DWORD*) &p[i]) = (DWORD)&thread_hints; + i += sizeof(DWORD); + p[i++] = 0x55; // push %ebp + p[i++] = 0x89; // mov %esp,%ebp + p[i++] = 0xe5; + p[i++] = 0x51; // push %ecx + p[i++] = 0x89; // mov %esp,%ecx // this is the reference esp - need to match the reference esp used in the fast path. + p[i++] = 0xe1; + p[i++] = 0x52; // push %edx +#ifdef __APPLE__ + // establish 16-byte stack alignment + p[i++] = 0x83; // subl $8,%esp + p[i++] = 0xec; + p[i++] = 0x08; +#endif + p[i++] = 0x50; // push %eax // store &thread_hints[key] on stack as 2nd argument; + p[i++] = 0x51; // push %ecx // reference esp - The 1st argument for call to InternalGetCurrentThreadSlow. + p[i++] = 0xe8; // call InternalGetCurrentThreadSlow + *((DWORD*) &p[i]) = (DWORD)&InternalGetCurrentThreadSlow - (DWORD)(&p[i+sizeof(DWORD)]); + i += sizeof(DWORD); +#ifdef __APPLE__ + p[i++] = 0x83; // addl $16,%esp + p[i++] = 0xc4; + p[i++] = 0x10; +#else + p[i++] = 0x83; // addl $8,%esp + p[i++] = 0xc4; + p[i++] = 0x08; +#endif + if (dwTlsIndex != THREAD_OBJECT_TLS_INDEX) + { + p[i++] = 0x8b; // mov offsetof(pThread->tlsSlots[dwTlsIndex])(%eax),%eax // %eax = pThread->tlsSlots[dwTlsIndex]; + p[i++] = 0x80; + *((DWORD*) &p[i]) = (DWORD)(PAL_safe_offsetof(CPalThread,tlsInfo)+PAL_safe_offsetof(CThreadTLSInfo,tlsSlots[dwTlsIndex])); + i += sizeof(DWORD); + } + p[i++] = 0x5a; // pop %edx + p[i++] = 0x59; // pop %ecx + p[i++] = 0xc9; // leave + p[i++] = 0xc3; // ret + + if (i > TLS_OPTIMIZED_GETTER_SIZE) + { + ASSERT("Invalid TLS_OPTIMIZED_GETTER_SIZE %d\n", i); + } + + DBG_FlushInstructionCache(p, TLS_OPTIMIZED_GETTER_SIZE * sizeof(BYTE)); + + Ret = (PAL_POPTIMIZEDTLSGETTER)p; + + return Ret; +#endif // BIT64 else +} + +/*++ +Function: + TLSFreeOptimizedGetter + + Frees a function created by MakeOptimizedTlsGetter(). +--*/ +VOID +CorUnix::TLSFreeOptimizedGetter( + IN PAL_POPTIMIZEDTLSGETTER pOptimizedTlsGetter) +{ + InternalFree(InternalGetCurrentThread(), (void *)pOptimizedTlsGetter); +} + +#endif // USE_OPTIMIZEDTLSGETTER |