summaryrefslogtreecommitdiff
path: root/src/pal/src/arch/amd64/optimizedtls.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/pal/src/arch/amd64/optimizedtls.cpp')
-rw-r--r--src/pal/src/arch/amd64/optimizedtls.cpp237
1 files changed, 237 insertions, 0 deletions
diff --git a/src/pal/src/arch/amd64/optimizedtls.cpp b/src/pal/src/arch/amd64/optimizedtls.cpp
new file mode 100644
index 0000000000..cd89db6b0a
--- /dev/null
+++ b/src/pal/src/arch/amd64/optimizedtls.cpp
@@ -0,0 +1,237 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+/*++
+
+
+
+Module Name:
+
+ optimizedtls.cpp
+
+Abstract:
+
+ Implementation of platform-specific Thread local storage functions.
+
+
+
+--*/
+
+#include "pal/thread.hpp"
+#include "pal/malloc.hpp"
+
+#include <pthread.h>
+
+#include "pal/dbgmsg.h"
+#include "pal/misc.h"
+#include "pal/debug.h"
+
+#include <stddef.h>
+
+using namespace CorUnix;
+
+SET_DEFAULT_DEBUG_CHANNEL(THREAD);
+
+#if defined(USE_OPTIMIZEDTLSGETTER)
+
+#define PAL_safe_offsetof(s,m) ((size_t)((ptrdiff_t)&(char&)(((s *)64)->m))-64) // offsetof substitute: address of member m on a fake object of type s placed at address 64, minus 64 (NOTE(review): the non-zero base presumably avoids null-pointer diagnostics - confirm)
+
+/*++
+Function:
+ CorUnix::TLSMakeOptimizedGetter
+
+ Creates a platform-optimized version of TlsGetValue compiled
+ for a particular index.
+
+ Generates the hot part of CorUnix::InternalGetCurrentThread
+ as a chunk of highly optimized machine-specific code at runtime.
+
+ Check the difference between CorUnix::InternalGetCurrentThread and
+ CorUnix::InternalGetCurrentThreadSlow to see the C/C++ code that matches
+ the code generated by this function.
+
+Parameters:
+ pThread - thread object handed to InternalMalloc when the code
+ buffer is allocated.
+ dwTlsIndex - TLS index baked into the generated code. When it equals
+ THREAD_OBJECT_TLS_INDEX the generated code returns the
+ thread pointer itself; otherwise it returns
+ tlsInfo.tlsSlots[dwTlsIndex] of that thread.
+
+Return value:
+ Pointer to the generated code. NULL when BIT64 is defined (the
+ generator is not implemented there and only logs an ERROR) or when
+ InternalMalloc fails.
+
+Notes:
+ - The non-BIT64 path emits 32-bit x86 opcodes one byte at a time into
+ a buffer of TLS_OPTIMIZED_GETTER_SIZE bytes.
+ - The addresses of flush_counter and thread_hints, and the relative
+ displacement to InternalGetCurrentThreadSlow, are embedded in the
+ code as 32-bit values.
+ - The short-jump displacements (0x19, 0x11, 0x09) are hard-coded and
+ depend on the exact byte length of the instructions that follow
+ them. Any change to the emitted sequence must keep those lengths
+ (and TLS_OPTIMIZED_GETTER_SIZE) in sync.
+ - The size check at the end runs after all bytes have been written,
+ so it can only report an overrun, not prevent it.
+--*/
+PAL_POPTIMIZEDTLSGETTER
+CorUnix::TLSMakeOptimizedGetter(
+ IN CPalThread* pThread,
+ IN DWORD dwTlsIndex)
+{
+#ifdef BIT64
+#pragma unused(pThread, dwTlsIndex)
+ ERROR("TLSMakeOptimizedGetter not rewritten for amd64 yet.");
+ return NULL;
+#else
+ PAL_POPTIMIZEDTLSGETTER Ret = NULL;
+ BYTE* p; // start of the buffer receiving the generated code
+ int i = 0; // number of bytes emitted so far / next write offset in p
+
+#ifdef __APPLE__
+#define TLS_OPTIMIZED_GETTER_SIZE 118 // 115 + the 3-byte "subl $8,%esp" emitted only on __APPLE__
+#else
+#define TLS_OPTIMIZED_GETTER_SIZE 115 // longest sequence emitted below (dwTlsIndex != THREAD_OBJECT_TLS_INDEX)
+#endif
+
+ p = (BYTE*)InternalMalloc(pThread, TLS_OPTIMIZED_GETTER_SIZE * sizeof(BYTE));
+
+ if (p == NULL)
+ {
+ return Ret; // allocation failed: Ret is still NULL
+ }
+
+ // Need to preserve %ecx and %edx (saved/restored below) and %esi (never touched by the emitted code), as specified in
+ // GetThreadGeneric(void) in vm/amd64/asmhelpers.s
+ p[i++] = 0x51; // push %ecx
+ p[i++] = 0x52; // push %edx
+ p[i++] = 0x89; // mov %esp,%eax // %eax = sp; (reference sp: taken after the two pushes above)
+ p[i++] = 0xe0;
+ p[i++] = 0xc1; // shr $0x11,%eax // sp >> 17;
+ p[i++] = 0xe8;
+ p[i++] = 0x11;
+ p[i++] = 0x89; // mov %eax,%edx // key = sp >> 17;
+ p[i++] = 0xc2;
+ p[i++] = 0xc1; // sar $0x7,%edx // key >> 7;
+ p[i++] = 0xfa;
+ p[i++] = 0x07;
+ p[i++] = 0x29; // sub %edx,%eax // key -= key >> 7;
+ p[i++] = 0xd0;
+ p[i++] = 0x89; // mov %eax,%edx
+ p[i++] = 0xc2;
+ p[i++] = 0xc1; // sar $0x5,%edx // key >> 5;
+ p[i++] = 0xfa;
+ p[i++] = 0x05;
+ p[i++] = 0x29; // sub %edx,%eax // key -= key >> 5;
+ p[i++] = 0xd0;
+ p[i++] = 0x89; // mov %eax,%edx
+ p[i++] = 0xc2;
+ p[i++] = 0xc1; // sar $0x3,%edx // key >> 3;
+ p[i++] = 0xfa;
+ p[i++] = 0x03;
+ p[i++] = 0x29; // sub %edx,%eax // key -= key >> 3;
+ p[i++] = 0xd0;
+ p[i++] = 0x25; // and $0xff,%eax // key &= 0xFF;
+ p[i++] = 0xff;
+ p[i++] = 0x00;
+ p[i++] = 0x00;
+ p[i++] = 0x00;
+ p[i++] = 0x8b; // mov (flush_counter),%ecx // %ecx = counter = flush_counter;
+ p[i++] = 0x0d;
+ *((DWORD*) &p[i]) = (DWORD)&flush_counter; // absolute address of flush_counter as the 32-bit displacement
+ i += sizeof(DWORD);
+ p[i++] = 0x8b; // mov thread_hints(,%eax,4),%edx // %edx = pThread = thread_hints[key];
+ p[i++] = 0x14;
+ p[i++] = 0x85;
+ *((DWORD*) &p[i]) = (DWORD)&thread_hints; // absolute address of thread_hints as the 32-bit displacement
+ i += sizeof(DWORD);
+ p[i++] = 0x39; // cmp %esp,offsetof(CPalThread,tlsInfo)+offsetof(CThreadTLSInfo,minStack)(%edx)
+ // if ((size_t)pThread->tlsInfo.minStack <= sp)
+ p[i++] = 0xa2;
+ *((DWORD*) &p[i]) = (DWORD)(PAL_safe_offsetof(CPalThread,tlsInfo)+PAL_safe_offsetof(CThreadTLSInfo,minStack));
+ i += sizeof(DWORD);
+ p[i++] = 0x77; // ja CallInternalGetCurrentThreadSlow:
+ p[i++] = 0x19; // rel8 = 25: byte length of everything emitted between here and the slow-path label
+ p[i++] = 0x3b; // cmp offsetof(CPalThread,tlsInfo)+offsetof(CThreadTLSInfo,maxStack)(%edx),%esp
+ // if (sp < (size_t)pThread->tlsInfo.maxStack)
+ p[i++] = 0xa2;
+ *((DWORD*) &p[i]) = (DWORD)(PAL_safe_offsetof(CPalThread,tlsInfo)+PAL_safe_offsetof(CThreadTLSInfo,maxStack));
+ i += sizeof(DWORD);
+ p[i++] = 0x73; // jae CallInternalGetCurrentThreadSlow:
+ p[i++] = 0x11; // rel8 = 17: byte length of everything emitted between here and the slow-path label
+ p[i++] = 0x39; // cmp %ecx,(flush_counter) // if (counter == flush_counter)
+ p[i++] = 0x0d;
+ *((DWORD*) &p[i]) = (DWORD)&flush_counter;
+ i += sizeof(DWORD);
+ p[i++] = 0x75; // jne CallInternalGetCurrentThreadSlow:
+ p[i++] = 0x09; // rel8 = 9: the 6-byte result load below plus pop/pop/ret
+ if (dwTlsIndex != THREAD_OBJECT_TLS_INDEX)
+ {
+ p[i++] = 0x8b; // mov offsetof(pThread->tlsSlots[dwTlsIndex])(%edx),%eax // %eax = pThread->tlsSlots[dwTlsIndex];
+ p[i++] = 0x82;
+ *((DWORD*) &p[i]) = (DWORD)(PAL_safe_offsetof(CPalThread,tlsInfo)+PAL_safe_offsetof(CThreadTLSInfo,tlsSlots[dwTlsIndex]));
+ i += sizeof(DWORD);
+ }
+ else
+ {
+ p[i++] = 0x89; // mov %edx,%eax // %eax = pThread;
+ p[i++] = 0xd0;
+ p[i++] = 0x90; // nop // the four nops pad this branch to 6 bytes, the size of the other branch,
+ p[i++] = 0x90; // nop // so the hard-coded jump displacements above stay valid
+ p[i++] = 0x90; // nop
+ p[i++] = 0x90; // nop
+ }
+ p[i++] = 0x5a; // pop %edx
+ p[i++] = 0x59; // pop %ecx
+ p[i++] = 0xc3; // ret // end of the fast path
+ // CallInternalGetCurrentThreadSlow:
+ p[i++] = 0x5a; // pop %edx
+ p[i++] = 0x59; // pop %ecx
+ p[i++] = 0x8d; // lea (thread_hints,%eax,4),%eax // %eax = &thread_hints[key];
+ p[i++] = 0x04;
+ p[i++] = 0x85;
+ *((DWORD*) &p[i]) = (DWORD)&thread_hints;
+ i += sizeof(DWORD);
+ p[i++] = 0x55; // push %ebp
+ p[i++] = 0x89; // mov %esp,%ebp
+ p[i++] = 0xe5;
+ p[i++] = 0x51; // push %ecx
+ p[i++] = 0x89; // mov %esp,%ecx // this is the reference esp - need to match the reference esp used in the fast path (two pushes below the entry esp in both paths).
+ p[i++] = 0xe1;
+ p[i++] = 0x52; // push %edx
+#ifdef __APPLE__
+ // establish 16-byte stack alignment
+ p[i++] = 0x83; // subl $8,%esp
+ p[i++] = 0xec;
+ p[i++] = 0x08;
+#endif
+ p[i++] = 0x50; // push %eax // store &thread_hints[key] on stack as 2nd argument;
+ p[i++] = 0x51; // push %ecx // reference esp - The 1st argument for call to InternalGetCurrentThreadSlow.
+ p[i++] = 0xe8; // call InternalGetCurrentThreadSlow
+ *((DWORD*) &p[i]) = (DWORD)&InternalGetCurrentThreadSlow - (DWORD)(&p[i+sizeof(DWORD)]); // rel32: target minus the address of the byte following the call instruction
+ i += sizeof(DWORD);
+#ifdef __APPLE__
+ p[i++] = 0x83; // addl $16,%esp // drop the two arguments plus the 8 bytes of alignment padding
+ p[i++] = 0xc4;
+ p[i++] = 0x10;
+#else
+ p[i++] = 0x83; // addl $8,%esp // drop the two arguments
+ p[i++] = 0xc4;
+ p[i++] = 0x08;
+#endif
+ if (dwTlsIndex != THREAD_OBJECT_TLS_INDEX)
+ {
+ p[i++] = 0x8b; // mov offsetof(pThread->tlsSlots[dwTlsIndex])(%eax),%eax // %eax = pThread->tlsSlots[dwTlsIndex];
+ p[i++] = 0x80;
+ *((DWORD*) &p[i]) = (DWORD)(PAL_safe_offsetof(CPalThread,tlsInfo)+PAL_safe_offsetof(CThreadTLSInfo,tlsSlots[dwTlsIndex]));
+ i += sizeof(DWORD);
+ }
+ p[i++] = 0x5a; // pop %edx
+ p[i++] = 0x59; // pop %ecx
+ p[i++] = 0xc9; // leave
+ p[i++] = 0xc3; // ret // end of the slow path
+
+ if (i > TLS_OPTIMIZED_GETTER_SIZE) // post-hoc sanity check: the bytes have already been written at this point
+ {
+ ASSERT("Invalid TLS_OPTIMIZED_GETTER_SIZE %d\n", i);
+ }
+
+ DBG_FlushInstructionCache(p, TLS_OPTIMIZED_GETTER_SIZE * sizeof(BYTE));
+
+ Ret = (PAL_POPTIMIZEDTLSGETTER)p; // the buffer itself is returned as the callable getter
+
+ return Ret;
+#endif // BIT64 else
+}
+
+/*++
+Function:
+ CorUnix::TLSFreeOptimizedGetter
+
+ Releases a getter previously produced by
+ CorUnix::TLSMakeOptimizedGetter.
+
+Parameters:
+ pOptimizedTlsGetter - the getter to release; it is handed to
+ InternalFree together with the current thread.
+--*/
+VOID
+CorUnix::TLSFreeOptimizedGetter(
+ IN PAL_POPTIMIZEDTLSGETTER pOptimizedTlsGetter)
+{
+ CPalThread* pCurrentThread = InternalGetCurrentThread();
+ void* pGetterCode = (void *)pOptimizedTlsGetter;
+
+ InternalFree(pCurrentThread, pGetterCode);
+}
+
+#endif // USE_OPTIMIZEDTLSGETTER