// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

/* TLS.CPP:
 *
 * Encapsulates TLS access for maximum performance.
 *
 */

#include "stdafx.h"
#include "unsafe.h"
#include "tls.h"
#include "contract.h"
#include "corerror.h"
#include "ex.h"
#include "clrhost.h"

#ifndef SELF_NO_HOST
#include "clrconfig.h"
#endif

#include "clrnt.h"

#ifndef SELF_NO_HOST

//---------------------------------------------------------------------------
// Win95 and WinNT store the TLS in different places relative to the
// fs:[0]. This api reveals which. Can also return TLSACCESS_GENERIC if
// no info is available about the Thread location (you have to use the TlsGetValue
// api.) This is intended for use by stub generators that want to inline TLS
// access.
//
// Returns TLSACCESS_WNT when tlsIndex addresses one of the inline TEB slots
// (index < TLS_MINIMUM_AVAILABLE), TLSACCESS_GENERIC otherwise.
//---------------------------------------------------------------------------
TLSACCESSMODE GetTLSAccessMode(DWORD tlsIndex)
{
    // Static contracts because this is used by contract infrastructure
    STATIC_CONTRACT_NOTHROW;
    STATIC_CONTRACT_GC_NOTRIGGER;

    TLSACCESSMODE tlsAccessMode = TLSACCESS_GENERIC;

#ifdef _DEBUG
    // Debug builds allow user to throw a switch to force use of the generic
    // (non-optimized) Thread/AppDomain getters. Even if the user doesn't throw
    // the switch, force tests to go down the generic getter code path about 1% of the
    // time so it's exercised a couple dozen times during each devbvt run.
    if ((CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_UseGenericTlsGetters) != 0) || DbgRandomOnExe(.01))
        return TLSACCESS_GENERIC;
#endif

    if (tlsIndex < TLS_MINIMUM_AVAILABLE)
    {
        // Index lies within the TEB's inline TlsSlots array, so a direct
        // fs:/gs:-relative load can be generated.
        tlsAccessMode = TLSACCESS_WNT;
    }
    else if (tlsIndex < (TLS_MINIMUM_AVAILABLE + TLS_EXPANSION_SLOTS))
    {
        // Expansion slots are lazily created at the first call to
        // TlsGetValue on a thread, and the code we generate
        // assumes that the expansion slots will exist.
        //
        // On newer flavors of NT we could use the vectored
        // exception handler to take the AV, call TlsGetValue, and
        // resume execution at the start of the getter.
        tlsAccessMode = TLSACCESS_GENERIC;//TLSACCESS_WNT_HIGH;
    }
    else
    {
        //
        // If the app verifier is enabled, TLS indices
        // are faked to help detect invalid handle use.
        //
    }

    return tlsAccessMode;
}

//---------------------------------------------------------------------------
// Creates a platform-optimized version of TlsGetValue compiled
// for a particular index. Can return NULL.
//---------------------------------------------------------------------------
// A target for the optimized getter can be passed in, this is
// useful so that code can avoid an indirect call for the GetThread
// and GetAppDomain calls for instance. If NULL is passed then
// we will allocate from the executable heap.
//
// Parameters:
//   tlsIndex     - TLS slot the generated getter reads.
//   pBuffer      - optional caller-supplied executable buffer to patch in
//                  place (on ARM this is a Thumb code pointer); NULL means
//                  allocate a new executable buffer.
//   cbBuffer     - size of pBuffer; asserted (not checked in retail) to be
//                  large enough for the generated code.
//   pGenericImpl - fallback implementation to jump to when the access mode
//                  is TLSACCESS_GENERIC; if NULL in that mode, the function
//                  returns NULL.
//   fForceGeneric- when TRUE, skip GetTLSAccessMode and always emit the
//                  jump-to-generic stub.
//
// Returns a callable getter pointer (Thumb-encoded on ARM), or NULL.
POPTIMIZEDTLSGETTER MakeOptimizedTlsGetter(DWORD tlsIndex, LPVOID pBuffer, SIZE_T cbBuffer, POPTIMIZEDTLSGETTER pGenericImpl, BOOL fForceGeneric)
{
    // Static contracts because this is used by contract infrastructure
    STATIC_CONTRACT_NOTHROW;
    STATIC_CONTRACT_GC_NOTRIGGER;

    // On ARM, callers pass Thumb code pointers (low bit set); strip that to
    // get the raw data address we write machine code through.
    ARM_ONLY(pBuffer = ThumbCodeToDataPointer(pBuffer));

    // Buffer that should be big enough to encode the TLS getter on any reasonable platform.
    // The code is assembled on the stack first, then copied into executable
    // memory below. Debug builds need extra room for the last-error-trashing prolog.
    TADDR patch[4 INDEBUG(+4 /* last error trashing */)];

    PBYTE pPatch = (PBYTE)&patch;

    TLSACCESSMODE mode = fForceGeneric ? TLSACCESS_GENERIC : GetTLSAccessMode(tlsIndex);

#if defined(_DEBUG)
    if (mode != TLSACCESS_GENERIC)
    {
        //
        // Trash last error in debug builds
        //
#ifdef _TARGET_X86_
        // 3-byte opcode written as a DWORD; the imm32 operands written at +3
        // and +7 overwrite the surplus byte. Total instruction length: 11 bytes.
        *((DWORD*) (pPatch + 0)) = 0x05c764;                       // mov dword ptr fs:[offsetof(TEB, LastErrorValue)], LAST_ERROR_TRASH_VALUE
        *((DWORD*) (pPatch + 3)) = offsetof(TEB, LastErrorValue);
        *((DWORD*) (pPatch + 7)) = LAST_ERROR_TRASH_VALUE;
        pPatch += 11;
#endif // _TARGET_X86_

#ifdef _TARGET_AMD64_
        // iDNA doesn't like writing directly to gs:[nn], so load the TEB
        // self-pointer into rax first and store through it.
        *((UINT64*)(pPatch + 0))  = 0x25048b4865;                  // mov rax, gs:[offsetof(TEB, NtTib.Self)]
        *((DWORD*) (pPatch + 5))  = offsetof(TEB, NtTib.Self);
        *((WORD*)  (pPatch + 9))  = 0x80c7;                        // mov dword ptr [rax + offsetof(TEB, LastErrorValue)], LAST_ERROR_TRASH_VALUE
        *((DWORD*) (pPatch + 11)) = offsetof(TEB, LastErrorValue);
        *((DWORD*) (pPatch + 15)) = LAST_ERROR_TRASH_VALUE;
        pPatch += 19;
#endif
    }
#endif // _DEBUG

    switch (mode)
    {
#ifdef _TARGET_X86_
    case TLSACCESS_WNT:
        // Direct load from the TEB's inline TLS slot array.
        *((WORD*)  (pPatch + 0)) = 0xa164;          // mov eax, fs:[IMM32]
        *((DWORD*) (pPatch + 2)) = offsetof(TEB, TlsSlots) + tlsIndex * sizeof(void*);
        *((BYTE*)  (pPatch + 6)) = 0xc3;            // retn
        pPatch += 7;
        break;

    case TLSACCESS_GENERIC:
        if (pGenericImpl == NULL)
            return NULL;

        _ASSERTE(pBuffer != NULL);
        // rel32 is computed against pBuffer (the final executable location),
        // not against the stack-local patch buffer.
        *((BYTE*)  (pPatch + 0)) = 0xE9;            // jmp pGenericImpl
        TADDR rel32 = ((TADDR)pGenericImpl - ((TADDR)pBuffer + 1 + sizeof(INT32)));
        *((INT32*) (pPatch + 1)) = (INT32)rel32;
        pPatch += 5;
        break;
#endif // _TARGET_X86_

#ifdef _TARGET_AMD64_
    case TLSACCESS_WNT:
        // Direct load from the TEB's inline TLS slot array via gs:.
        *((UINT64*)(pPatch + 0)) = 0x25048b4865;    // mov rax, gs:[IMM32]
        *((DWORD*) (pPatch + 5)) = offsetof(TEB, TlsSlots) + (tlsIndex * sizeof(void*));
        *((BYTE*)  (pPatch + 9)) = 0xc3;            // return
        pPatch += 10;
        break;

    case TLSACCESS_GENERIC:
        if (pGenericImpl == NULL)
            return NULL;

        _ASSERTE(pBuffer != NULL);
        *((BYTE*)  (pPatch + 0)) = 0xE9;            // jmp pGenericImpl
        TADDR rel32 = ((TADDR)pGenericImpl - ((TADDR)pBuffer + 1 + sizeof(INT32)));
        // The jmp displacement is only 32 bits; verify the target is reachable.
        _ASSERTE((INT64)(INT32)rel32 == (INT64)rel32);
        *((INT32*) (pPatch + 1)) = (INT32)rel32;
        pPatch += 5;

        // Pad with int3 so the atomic TADDR-sized first store below always
        // writes fully-initialized bytes.
        *pPatch++ = 0xCC; // Make sure there is full 8 bytes worth of data
        *pPatch++ = 0xCC;
        *pPatch++ = 0xCC;
        break;
#endif // _TARGET_AMD64_

#ifdef _TARGET_ARM_
    case TLSACCESS_WNT:
        {
            WORD slotOffset = (WORD)(offsetof(TEB, TlsSlots) + tlsIndex * sizeof(void*));
            // ldr's immediate offset field is 12 bits, so the slot must be
            // within 4KB of the TEB base.
            _ASSERTE(slotOffset < 4096);

            WORD *pInstr = (WORD*)pPatch;

            *pInstr++ = 0xee1d;     // mrc p15, 0, r0, c13, c0, 2
            *pInstr++ = 0x0f50;
            *pInstr++ = 0xf8d0;     // ldr r0, [r0, #slotOffset]
            *pInstr++ = slotOffset;
            *pInstr++ = 0x4770;     // bx lr

            pPatch = (PBYTE)pInstr;
        }
        break;

    case TLSACCESS_GENERIC:
        {
            if (pGenericImpl == NULL)
                return NULL;

            _ASSERTE(pBuffer != NULL);

            // Emit the branch template, then fix up its 24-bit relative
            // displacement to reach pGenericImpl from pBuffer.
            *(DWORD *)pPatch = 0x9000F000; // b pGenericImpl
            PutThumb2BlRel24((WORD*)pPatch, (TADDR)pGenericImpl - ((TADDR)pBuffer + 4 + THUMB_CODE));

            pPatch += 4;
        }
        break;
#endif // _TARGET_ARM_
    }

    SIZE_T cbCode = (TADDR)pPatch - (TADDR)&patch;
    _ASSERTE(cbCode <= sizeof(patch));

    if (pBuffer != NULL)
    {
        _ASSERTE_ALL_BUILDS("clr/src/utilcode/tls.cpp", cbCode <= cbBuffer);

        // We assume that the first instruction of the buffer is a short jump to dummy helper
        // that can be atomically overwritten to avoid races with other threads executing the code.
        // It is the same basic technique as hot patching.

        // Assert on all builds to make sure that retail optimizations are not affecting the alignment.
        _ASSERTE_ALL_BUILDS("clr/src/utilcode/tls.cpp", IS_ALIGNED((void*)pBuffer, sizeof(TADDR)));

        // Size of short jump that gets patched last.
        // Copy everything past the first TADDR-sized chunk first; threads
        // racing through pBuffer still execute the old short jump until the
        // atomic store below publishes the new first instruction.
        if (cbCode > sizeof(TADDR))
        {
            memcpy((BYTE *)pBuffer + sizeof(TADDR), &patch[1], cbCode - sizeof(TADDR));
            FlushInstructionCache(GetCurrentProcess(), (BYTE *)pBuffer + sizeof(TADDR), cbCode - sizeof(TADDR));
        }

        // Make sure that the dummy implementation still works.
        _ASSERTE(((POPTIMIZEDTLSGETTER)ARM_ONLY(DataPointerToThumbCode)(pBuffer))() == NULL);

        // It is important for this write to happen atomically
        VolatileStore((TADDR *)pBuffer, patch[0]);

        FlushInstructionCache(GetCurrentProcess(), (BYTE *)pBuffer, sizeof(TADDR));
    }
    else
    {
        // No caller-supplied buffer: allocate from the executable heap.
        // No race to manage here, so a plain memcpy of the whole stub is fine.
        pBuffer = (BYTE*) new (executable, nothrow) BYTE[cbCode];
        if (pBuffer == NULL)
            return NULL;

        memcpy(pBuffer, &patch, cbCode);

        FlushInstructionCache(GetCurrentProcess(), pBuffer, cbCode);
    }

    // On ARM, hand back a Thumb code pointer (low bit set) so the result is
    // directly callable.
    return (POPTIMIZEDTLSGETTER)ARM_ONLY(DataPointerToThumbCode)(pBuffer);
}

//---------------------------------------------------------------------------
// Frees a function created by MakeOptimizedTlsGetter().
//
// Only valid for getters that MakeOptimizedTlsGetter allocated itself (the
// pBuffer == NULL path); the pointer is released back to the executable heap.
//---------------------------------------------------------------------------
VOID FreeOptimizedTlsGetter(POPTIMIZEDTLSGETTER pOptimizedTlsGetter)
{
    // Static contracts because this is used by contract infrastructure
    STATIC_CONTRACT_NOTHROW;
    STATIC_CONTRACT_GC_NOTRIGGER;

    BYTE* pGetter = (BYTE*)pOptimizedTlsGetter;
#ifdef _TARGET_ARM_
    // The getter was returned as a Thumb code pointer; convert back to the
    // raw allocation address before freeing.
    pGetter = ThumbCodeToDataPointer(pGetter);
#endif
    DeleteExecutable(pGetter);
}

#endif // !SELF_NO_HOST