diff options
Diffstat (limited to 'src/vm/amd64/JitHelpers_Slow.asm')
-rw-r--r-- | src/vm/amd64/JitHelpers_Slow.asm | 1830 |
1 file changed, 1830 insertions, 0 deletions
diff --git a/src/vm/amd64/JitHelpers_Slow.asm b/src/vm/amd64/JitHelpers_Slow.asm
new file mode 100644
index 0000000000..7deed49d98
--- /dev/null
+++ b/src/vm/amd64/JitHelpers_Slow.asm
@@ -0,0 +1,1830 @@
+; Licensed to the .NET Foundation under one or more agreements.
+; The .NET Foundation licenses this file to you under the MIT license.
+; See the LICENSE file in the project root for more information.
+
+; ==++==
+;
+
+;
+; ==--==
+; ***********************************************************************
+; File: JitHelpers_Slow.asm, see history in jithelp.asm
+;
+; Notes: These are ASM routines which we believe to be cold in normal
+;        AMD64 scenarios, mainly because more performant versions of them
+;        exist and are used in the common cases.
+; ***********************************************************************
+
+include AsmMacros.inc
+include asmconstants.inc
+
+; Min amount of stack space that a nested function should allocate.
+MIN_SIZE equ 28h
+
+EXTERN  g_ephemeral_low:QWORD
+EXTERN  g_ephemeral_high:QWORD
+EXTERN  g_lowest_address:QWORD
+EXTERN  g_highest_address:QWORD
+EXTERN  g_card_table:QWORD
+
+ifdef FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP
+EXTERN  g_sw_ww_table:QWORD
+EXTERN  g_sw_ww_enabled_for_gc_heap:BYTE
+endif
+
+ifdef WRITE_BARRIER_CHECK
+; Those global variables are always defined, but should be 0 for Server GC
+g_GCShadow      TEXTEQU <?g_GCShadow@@3PEAEEA>
+g_GCShadowEnd   TEXTEQU <?g_GCShadowEnd@@3PEAEEA>
+EXTERN  g_GCShadow:QWORD
+EXTERN  g_GCShadowEnd:QWORD
+endif
+
+JIT_NEW                    equ ?JIT_New@@YAPEAVObject@@PEAUCORINFO_CLASS_STRUCT_@@@Z
+Object__DEBUG_SetAppDomain equ ?DEBUG_SetAppDomain@Object@@QEAAXPEAVAppDomain@@@Z
+CopyValueClassUnchecked    equ ?CopyValueClassUnchecked@@YAXPEAX0PEAVMethodTable@@@Z
+JIT_Box                    equ ?JIT_Box@@YAPEAVObject@@PEAUCORINFO_CLASS_STRUCT_@@PEAX@Z
+g_pStringClass             equ ?g_pStringClass@@3PEAVMethodTable@@EA
+FramedAllocateString       equ ?FramedAllocateString@@YAPEAVStringObject@@K@Z
+JIT_NewArr1                equ ?JIT_NewArr1@@YAPEAVObject@@PEAUCORINFO_CLASS_STRUCT_@@_J@Z
+
+INVALIDGCVALUE             equ 0CCCCCCCDh
+
+extern JIT_NEW:proc
+extern CopyValueClassUnchecked:proc
+extern JIT_Box:proc
+extern g_pStringClass:QWORD
+extern FramedAllocateString:proc
+extern JIT_NewArr1:proc
+
+extern JIT_GetSharedNonGCStaticBase_Helper:proc
+extern JIT_GetSharedGCStaticBase_Helper:proc
+
+extern JIT_InternalThrow:proc
+
+ifdef _DEBUG
+; Version for when we're sure to be in the GC, checks whether or not the card
+; needs to be updated
+;
+; void JIT_WriteBarrier_Debug(Object** dst, Object* src)
+LEAF_ENTRY JIT_WriteBarrier_Debug, _TEXT
+
+ifdef WRITE_BARRIER_CHECK
+        ; **ALSO update the shadow GC heap if that is enabled**
+        ; Do not perform the work if g_GCShadow is 0
+        cmp     g_GCShadow, 0
+        je      NoShadow
+
+        ; If we end up outside of the heap don't corrupt random memory
+        mov     r10, rcx
+        sub     r10, [g_lowest_address]
+        jb      NoShadow
+
+        ; Check that our adjusted destination is somewhere in the shadow gc
+        add     r10, [g_GCShadow]
+        cmp     r10, [g_GCShadowEnd]
+        ja      NoShadow
+
+        ; Write ref into real GC; see comment below about possibility of AV
+        mov     [rcx], rdx
+        ; Write ref into shadow GC
+        mov     [r10], rdx
+
+        ; Ensure that the write to the shadow heap occurs before the read from
+        ; the GC heap so that race conditions are caught by INVALIDGCVALUE
+        mfence
+
+        ; Check that GC/ShadowGC values match
+        mov     r11, [rcx]
+        mov     rax, [r10]
+        cmp     rax, r11
+        je      DoneShadow
+        mov     r11, INVALIDGCVALUE
+        mov     [r10], r11
+
+        jmp     DoneShadow
+
+        ; If we don't have a shadow GC we won't have done the
write yet + NoShadow: +endif + + mov rax, rdx + + ; Do the move. It is correct to possibly take an AV here, the EH code + ; figures out that this came from a WriteBarrier and correctly maps it back + ; to the managed method which called the WriteBarrier (see setup in + ; InitializeExceptionHandling, vm\exceptionhandling.cpp). + mov [rcx], rax + +ifdef WRITE_BARRIER_CHECK + ; If we had a shadow GC then we already wrote to the real GC at the same time + ; as the shadow GC so we want to jump over the real write immediately above + DoneShadow: +endif + +ifdef FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP + ; Update the write watch table if necessary + cmp byte ptr [g_sw_ww_enabled_for_gc_heap], 0h + je CheckCardTable + mov r10, rcx + shr r10, 0Ch ; SoftwareWriteWatch::AddressToTableByteIndexShift + add r10, qword ptr [g_sw_ww_table] + cmp byte ptr [r10], 0h + jne CheckCardTable + mov byte ptr [r10], 0FFh +endif + + CheckCardTable: + ; See if we can just quick out + cmp rax, [g_ephemeral_low] + jb Exit + cmp rax, [g_ephemeral_high] + jnb Exit + + ; Check if we need to update the card table + ; Calc pCardByte + shr rcx, 0Bh + add rcx, [g_card_table] + + ; Check if this card is dirty + cmp byte ptr [rcx], 0FFh + jne UpdateCardTable + REPRET + + UpdateCardTable: + mov byte ptr [rcx], 0FFh + ret + + align 16 + Exit: + REPRET +LEAF_END_MARKED JIT_WriteBarrier_Debug, _TEXT +endif + +NESTED_ENTRY JIT_TrialAllocSFastMP, _TEXT + alloc_stack MIN_SIZE + END_PROLOGUE + + CALL_GETTHREAD + mov r11, rax + + mov r8d, [rcx + OFFSET__MethodTable__m_BaseSize] + + ; m_BaseSize is guaranteed to be a multiple of 8. + + mov r10, [r11 + OFFSET__Thread__m_alloc_context__alloc_limit] + mov rax, [r11 + OFFSET__Thread__m_alloc_context__alloc_ptr] + + add r8, rax + + cmp r8, r10 + ja AllocFailed + + mov [r11 + OFFSET__Thread__m_alloc_context__alloc_ptr], r8 + mov [rax], rcx + +ifdef _DEBUG + call DEBUG_TrialAllocSetAppDomain +endif ; _DEBUG + + ; epilog + add rsp, MIN_SIZE + ret + + AllocFailed: + add rsp, MIN_SIZE + jmp JIT_NEW +NESTED_END JIT_TrialAllocSFastMP, _TEXT + + +; HCIMPL2(Object*, JIT_Box, CORINFO_CLASS_HANDLE type, void* unboxedData) +NESTED_ENTRY JIT_BoxFastMP, _TEXT + alloc_stack MIN_SIZE + END_PROLOGUE + + mov rax, [rcx + OFFSETOF__MethodTable__m_pWriteableData] + + ; Check whether the class has not been initialized + test dword ptr [rax + OFFSETOF__MethodTableWriteableData__m_dwFlags], MethodTableWriteableData__enum_flag_Unrestored + jnz ClassNotInited + + CALL_GETTHREAD + mov r11, rax + + mov r8d, [rcx + OFFSET__MethodTable__m_BaseSize] + + ; m_BaseSize is guaranteed to be a multiple of 8. 
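+        ; Annotation: every *FastMP allocator in this file uses the same
+        ; thread-local bump-allocation fast path that follows. A rough C-style
+        ; sketch (names are illustrative, not the actual VM declarations):
+        ;
+        ;   BYTE* ptr   = thread->alloc_context.alloc_ptr;
+        ;   BYTE* limit = thread->alloc_context.alloc_limit;
+        ;   if (ptr + size > limit) return SlowPath(pMT);  // e.g. jmp JIT_Box
+        ;   thread->alloc_context.alloc_ptr = ptr + size;
+        ;   ((Object*)ptr)->methodTable = pMT;             // mov [rax], rcx
+        ;   return (Object*)ptr;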
+ + mov r10, [r11 + OFFSET__Thread__m_alloc_context__alloc_limit] + mov rax, [r11 + OFFSET__Thread__m_alloc_context__alloc_ptr] + + add r8, rax + + cmp r8, r10 + ja AllocFailed + + mov [r11 + OFFSET__Thread__m_alloc_context__alloc_ptr], r8 + mov [rax], rcx + +ifdef _DEBUG + call DEBUG_TrialAllocSetAppDomain +endif ; _DEBUG + + ; Check whether the object contains pointers + test dword ptr [rcx + OFFSETOF__MethodTable__m_dwFlags], MethodTable__enum_flag_ContainsPointers + jnz ContainsPointers + + ; We have no pointers - emit a simple inline copy loop + + mov ecx, [rcx + OFFSET__MethodTable__m_BaseSize] + sub ecx, 18h ; sizeof(ObjHeader) + sizeof(Object) + last slot + + CopyLoop: + mov r8, [rdx+rcx] + mov [rax+rcx+8], r8 + + sub ecx, 8 + jge CopyLoop + + add rsp, MIN_SIZE + ret + + ContainsPointers: + ; Do call to CopyValueClassUnchecked(object, data, pMT) + + mov [rsp+20h], rax + + mov r8, rcx + lea rcx, [rax + 8] + call CopyValueClassUnchecked + + mov rax, [rsp+20h] + + add rsp, MIN_SIZE + ret + + ClassNotInited: + AllocFailed: + add rsp, MIN_SIZE + jmp JIT_Box +NESTED_END JIT_BoxFastMP, _TEXT + + +NESTED_ENTRY AllocateStringFastMP, _TEXT + alloc_stack MIN_SIZE + END_PROLOGUE + + ; Instead of doing elaborate overflow checks, we just limit the number of elements + ; to (LARGE_OBJECT_SIZE - 256)/sizeof(WCHAR) or less. + ; This will avoid all overflow problems, as well as making sure + ; big string objects are correctly allocated in the big object heap. + + cmp ecx, (ASM_LARGE_OBJECT_SIZE - 256)/2 + jae OversizedString + + CALL_GETTHREAD + mov r11, rax + + mov rdx, [g_pStringClass] + mov r8d, [rdx + OFFSET__MethodTable__m_BaseSize] + + ; Calculate the final size to allocate. + ; We need to calculate baseSize + cnt*2, then round that up by adding 7 and anding ~7. + + lea r8d, [r8d + ecx*2 + 7] + and r8d, -8 + + mov r10, [r11 + OFFSET__Thread__m_alloc_context__alloc_limit] + mov rax, [r11 + OFFSET__Thread__m_alloc_context__alloc_ptr] + + add r8, rax + + cmp r8, r10 + ja AllocFailed + + mov [r11 + OFFSET__Thread__m_alloc_context__alloc_ptr], r8 + mov [rax], rdx + + mov [rax + OFFSETOF__StringObject__m_StringLength], ecx + +ifdef _DEBUG + call DEBUG_TrialAllocSetAppDomain +endif ; _DEBUG + + add rsp, MIN_SIZE + ret + + OversizedString: + AllocFailed: + add rsp, MIN_SIZE + jmp FramedAllocateString +NESTED_END AllocateStringFastMP, _TEXT + +FIX_INDIRECTION macro Reg +ifdef FEATURE_PREJIT + test Reg, 1 + jz @F + mov Reg, [Reg-1] + @@: +endif +endm + +; HCIMPL2(Object*, JIT_NewArr1, CORINFO_CLASS_HANDLE arrayTypeHnd_, INT_PTR size) +NESTED_ENTRY JIT_NewArr1VC_MP, _TEXT + alloc_stack MIN_SIZE + END_PROLOGUE + + ; We were passed a type descriptor in RCX, which contains the (shared) + ; array method table and the element type. + + ; The element count is in RDX + + ; NOTE: if this code is ported for CORINFO_HELP_NEWSFAST_ALIGN8, it will need + ; to emulate the double-specific behavior of JIT_TrialAlloc::GenAllocArray. + + ; Do a conservative check here. This is to avoid overflow while doing the calculations. We don't + ; have to worry about "large" objects, since the allocation quantum is never big enough for + ; LARGE_OBJECT_SIZE. + + ; For Value Classes, this needs to be 2^16 - slack (2^32 / max component size), + ; The slack includes the size for the array header and round-up ; for alignment. Use 256 for the + ; slack value out of laziness. + + ; In both cases we do a final overflow check after adding to the alloc_ptr. 
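+        ; Annotation: a worked example of the size computation that follows,
+        ; assuming (hypothetically) a component size of 6 and an m_BaseSize of
+        ; 24 for 100 elements:
+        ;
+        ;   size = 6 * 100 + 24 = 624            ; movzx/imul/add
+        ;   size = (624 + 7) AND -8 = 624        ; already a multiple of 8
+        ;
+        ; With the element count capped at 65535 - 256 and a component size of
+        ; at most 65535, the 32-bit product cannot overflow.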
+ + CALL_GETTHREAD + mov r11, rax + + ; we need to load the true method table from the type desc + mov r9, [rcx + OFFSETOF__ArrayTypeDesc__m_TemplateMT - 2] + + FIX_INDIRECTION r9 + + cmp rdx, (65535 - 256) + jae OversizedArray + + movzx r8d, word ptr [r9 + OFFSETOF__MethodTable__m_dwFlags] ; component size is low 16 bits + imul r8d, edx ; signed mul, but won't overflow due to length restriction above + add r8d, dword ptr [r9 + OFFSET__MethodTable__m_BaseSize] + + ; round the size to a multiple of 8 + + add r8d, 7 + and r8d, -8 + + mov r10, [r11 + OFFSET__Thread__m_alloc_context__alloc_limit] + mov rax, [r11 + OFFSET__Thread__m_alloc_context__alloc_ptr] + + add r8, rax + jc AllocFailed + + cmp r8, r10 + ja AllocFailed + + mov [r11 + OFFSET__Thread__m_alloc_context__alloc_ptr], r8 + mov [rax], r9 + + mov dword ptr [rax + OFFSETOF__ArrayBase__m_NumComponents], edx + +ifdef _DEBUG + call DEBUG_TrialAllocSetAppDomain +endif ; _DEBUG + + add rsp, MIN_SIZE + ret + + OversizedArray: + AllocFailed: + add rsp, MIN_SIZE + jmp JIT_NewArr1 +NESTED_END JIT_NewArr1VC_MP, _TEXT + + +; HCIMPL2(Object*, JIT_NewArr1, CORINFO_CLASS_HANDLE arrayTypeHnd_, INT_PTR size) +NESTED_ENTRY JIT_NewArr1OBJ_MP, _TEXT + alloc_stack MIN_SIZE + END_PROLOGUE + + ; We were passed a type descriptor in RCX, which contains the (shared) + ; array method table and the element type. + + ; The element count is in RDX + + ; NOTE: if this code is ported for CORINFO_HELP_NEWSFAST_ALIGN8, it will need + ; to emulate the double-specific behavior of JIT_TrialAlloc::GenAllocArray. + + ; Verifies that LARGE_OBJECT_SIZE fits in 32-bit. This allows us to do array size + ; arithmetic using 32-bit registers. + .erre ASM_LARGE_OBJECT_SIZE lt 100000000h + + cmp rdx, (ASM_LARGE_OBJECT_SIZE - 256)/8 + jae OversizedArray + + CALL_GETTHREAD + mov r11, rax + + ; we need to load the true method table from the type desc + mov r9, [rcx + OFFSETOF__ArrayTypeDesc__m_TemplateMT - 2] + + FIX_INDIRECTION r9 + + ; In this case we know the element size is sizeof(void *), or 8 for x64 + ; This helps us in two ways - we can shift instead of multiplying, and + ; there's no need to align the size either + + mov r8d, dword ptr [r9 + OFFSET__MethodTable__m_BaseSize] + lea r8d, [r8d + edx * 8] + + ; No need for rounding in this case - element size is 8, and m_BaseSize is guaranteed + ; to be a multiple of 8. + + mov r10, [r11 + OFFSET__Thread__m_alloc_context__alloc_limit] + mov rax, [r11 + OFFSET__Thread__m_alloc_context__alloc_ptr] + + add r8, rax + + cmp r8, r10 + ja AllocFailed + + mov [r11 + OFFSET__Thread__m_alloc_context__alloc_ptr], r8 + mov [rax], r9 + + mov dword ptr [rax + OFFSETOF__ArrayBase__m_NumComponents], edx + +ifdef _DEBUG + call DEBUG_TrialAllocSetAppDomain +endif ; _DEBUG + + add rsp, MIN_SIZE + ret + + OversizedArray: + AllocFailed: + add rsp, MIN_SIZE + jmp JIT_NewArr1 +NESTED_END JIT_NewArr1OBJ_MP, _TEXT + + + +; <TODO> this m_GCLock should be a size_t so we don't have a store-forwarding penalty in the code below. +; Unfortunately, the compiler intrinsic for InterlockedExchangePointer seems to be broken and we +; get bad code gen in gc.cpp on IA64. </TODO> + +M_GCLOCK equ ?m_GCLock@@3HC +extern M_GCLOCK:dword +extern generation_table:qword + +LEAF_ENTRY JIT_TrialAllocSFastSP, _TEXT + + mov r8d, [rcx + OFFSET__MethodTable__m_BaseSize] + + ; m_BaseSize is guaranteed to be a multiple of 8. 
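+        ; Annotation: m_GCLock is -1 when the allocator lock is free. The inc
+        ; below sets ZF only on the -1 -> 0 transition, so exactly one caller
+        ; proceeds and any other caller sees a nonzero result and tails off to
+        ; the framed helper; storing -1 releases the lock. Since these SP/UP
+        ; variants appear to be used only on uniprocessor machines, no lock
+        ; prefix is needed. In effect:
+        ;
+        ;   if (++m_GCLock != 0) return JIT_New(...);  // contended: slow path
+        ;   ...bump-allocate from generation_table...
+        ;   m_GCLock = -1;                             // release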
+ + inc [M_GCLOCK] + jnz JIT_NEW + + mov rax, [generation_table + 0] ; alloc_ptr + mov r10, [generation_table + 8] ; limit_ptr + + add r8, rax + + cmp r8, r10 + ja AllocFailed + + mov qword ptr [generation_table + 0], r8 ; update the alloc ptr + mov [rax], rcx + mov [M_GCLOCK], -1 + +ifdef _DEBUG + call DEBUG_TrialAllocSetAppDomain_NoScratchArea +endif ; _DEBUG + + ret + + AllocFailed: + mov [M_GCLOCK], -1 + jmp JIT_NEW +LEAF_END JIT_TrialAllocSFastSP, _TEXT + +; HCIMPL2(Object*, JIT_Box, CORINFO_CLASS_HANDLE type, void* unboxedData) +NESTED_ENTRY JIT_BoxFastUP, _TEXT + + mov rax, [rcx + OFFSETOF__MethodTable__m_pWriteableData] + + ; Check whether the class has not been initialized + test dword ptr [rax + OFFSETOF__MethodTableWriteableData__m_dwFlags], MethodTableWriteableData__enum_flag_Unrestored + jnz JIT_Box + + mov r8d, [rcx + OFFSET__MethodTable__m_BaseSize] + + ; m_BaseSize is guaranteed to be a multiple of 8. + + inc [M_GCLOCK] + jnz JIT_Box + + mov rax, [generation_table + 0] ; alloc_ptr + mov r10, [generation_table + 8] ; limit_ptr + + add r8, rax + + cmp r8, r10 + ja NoAlloc + + + mov qword ptr [generation_table + 0], r8 ; update the alloc ptr + mov [rax], rcx + mov [M_GCLOCK], -1 + +ifdef _DEBUG + call DEBUG_TrialAllocSetAppDomain_NoScratchArea +endif ; _DEBUG + + ; Check whether the object contains pointers + test dword ptr [rcx + OFFSETOF__MethodTable__m_dwFlags], MethodTable__enum_flag_ContainsPointers + jnz ContainsPointers + + ; We have no pointers - emit a simple inline copy loop + + mov ecx, [rcx + OFFSET__MethodTable__m_BaseSize] + sub ecx, 18h ; sizeof(ObjHeader) + sizeof(Object) + last slot + + CopyLoop: + mov r8, [rdx+rcx] + mov [rax+rcx+8], r8 + + sub ecx, 8 + jge CopyLoop + REPRET + + ContainsPointers: + + ; Do call to CopyValueClassUnchecked(object, data, pMT) + + push_vol_reg rax + alloc_stack 20h + END_PROLOGUE + + mov r8, rcx + lea rcx, [rax + 8] + call CopyValueClassUnchecked + + add rsp, 20h + pop rax + ret + + NoAlloc: + mov [M_GCLOCK], -1 + jmp JIT_Box +NESTED_END JIT_BoxFastUP, _TEXT + +LEAF_ENTRY AllocateStringFastUP, _TEXT + + ; We were passed the number of characters in ECX + + ; we need to load the method table for string from the global + + mov r11, [g_pStringClass] + + ; Instead of doing elaborate overflow checks, we just limit the number of elements + ; to (LARGE_OBJECT_SIZE - 256)/sizeof(WCHAR) or less. + ; This will avoid all overflow problems, as well as making sure + ; big string objects are correctly allocated in the big object heap. + + cmp ecx, (ASM_LARGE_OBJECT_SIZE - 256)/2 + jae FramedAllocateString + + mov r8d, [r11 + OFFSET__MethodTable__m_BaseSize] + + ; Calculate the final size to allocate. + ; We need to calculate baseSize + cnt*2, then round that up by adding 7 and anding ~7. 
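+        ; Annotation: a worked example of this rounding for a 5-character
+        ; string, assuming (hypothetically) a base size of 28 bytes:
+        ;
+        ;   size = 28 + 5*2 + 7 = 45             ; lea r8d, [r8d + ecx*2 + 7]
+        ;   size = 45 AND -8    = 40             ; and r8d, -8
+        ;
+        ; i.e. (baseSize + 2*cnt + 7) AND -8 rounds the true size up to a
+        ; multiple of 8.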
+ + lea r8d, [r8d + ecx*2 + 7] + and r8d, -8 + + inc [M_GCLOCK] + jnz FramedAllocateString + + mov rax, [generation_table + 0] ; alloc_ptr + mov r10, [generation_table + 8] ; limit_ptr + + add r8, rax + + cmp r8, r10 + ja AllocFailed + + mov qword ptr [generation_table + 0], r8 ; update the alloc ptr + mov [rax], r11 + mov [M_GCLOCK], -1 + + mov [rax + OFFSETOF__StringObject__m_StringLength], ecx + +ifdef _DEBUG + call DEBUG_TrialAllocSetAppDomain_NoScratchArea +endif ; _DEBUG + + ret + + AllocFailed: + mov [M_GCLOCK], -1 + jmp FramedAllocateString +LEAF_END AllocateStringFastUP, _TEXT + +; HCIMPL2(Object*, JIT_NewArr1, CORINFO_CLASS_HANDLE arrayTypeHnd_, INT_PTR size) +LEAF_ENTRY JIT_NewArr1VC_UP, _TEXT + + ; We were passed a type descriptor in RCX, which contains the (shared) + ; array method table and the element type. + + ; The element count is in RDX + + ; NOTE: if this code is ported for CORINFO_HELP_NEWSFAST_ALIGN8, it will need + ; to emulate the double-specific behavior of JIT_TrialAlloc::GenAllocArray. + + ; Do a conservative check here. This is to avoid overflow while doing the calculations. We don't + ; have to worry about "large" objects, since the allocation quantum is never big enough for + ; LARGE_OBJECT_SIZE. + + ; For Value Classes, this needs to be 2^16 - slack (2^32 / max component size), + ; The slack includes the size for the array header and round-up ; for alignment. Use 256 for the + ; slack value out of laziness. + + ; In both cases we do a final overflow check after adding to the alloc_ptr. + + ; we need to load the true method table from the type desc + mov r9, [rcx + OFFSETOF__ArrayTypeDesc__m_TemplateMT - 2] + + FIX_INDIRECTION r9 + + cmp rdx, (65535 - 256) + jae JIT_NewArr1 + + movzx r8d, word ptr [r9 + OFFSETOF__MethodTable__m_dwFlags] ; component size is low 16 bits + imul r8d, edx ; signed mul, but won't overflow due to length restriction above + add r8d, dword ptr [r9 + OFFSET__MethodTable__m_BaseSize] + + ; round the size to a multiple of 8 + + add r8d, 7 + and r8d, -8 + + inc [M_GCLOCK] + jnz JIT_NewArr1 + + mov rax, [generation_table + 0] ; alloc_ptr + mov r10, [generation_table + 8] ; limit_ptr + + add r8, rax + jc AllocFailed + + cmp r8, r10 + ja AllocFailed + + mov qword ptr [generation_table + 0], r8 ; update the alloc ptr + mov [rax], r9 + mov [M_GCLOCK], -1 + + mov dword ptr [rax + OFFSETOF__ArrayBase__m_NumComponents], edx + +ifdef _DEBUG + call DEBUG_TrialAllocSetAppDomain_NoScratchArea +endif ; _DEBUG + + ret + + AllocFailed: + mov [M_GCLOCK], -1 + jmp JIT_NewArr1 +LEAF_END JIT_NewArr1VC_UP, _TEXT + + +; HCIMPL2(Object*, JIT_NewArr1, CORINFO_CLASS_HANDLE arrayTypeHnd_, INT_PTR size) +LEAF_ENTRY JIT_NewArr1OBJ_UP, _TEXT + + ; We were passed a type descriptor in RCX, which contains the (shared) + ; array method table and the element type. + + ; The element count is in RDX + + ; NOTE: if this code is ported for CORINFO_HELP_NEWSFAST_ALIGN8, it will need + ; to emulate the double-specific behavior of JIT_TrialAlloc::GenAllocArray. + + ; Verifies that LARGE_OBJECT_SIZE fits in 32-bit. This allows us to do array size + ; arithmetic using 32-bit registers. 
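+        ; Annotation: with the element count capped at (LARGE_OBJECT_SIZE-256)/8,
+        ; edx*8 plus m_BaseSize stays well below 2^32, so the 32-bit lea below
+        ; cannot wrap; the .erre directive below checks that the cap itself
+        ; fits in 32 bits.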
+ .erre ASM_LARGE_OBJECT_SIZE lt 100000000h + + cmp rdx, (ASM_LARGE_OBJECT_SIZE - 256)/8 ; sizeof(void*) + jae OversizedArray + + ; we need to load the true method table from the type desc + mov r9, [rcx + OFFSETOF__ArrayTypeDesc__m_TemplateMT - 2] + + FIX_INDIRECTION r9 + + ; In this case we know the element size is sizeof(void *), or 8 for x64 + ; This helps us in two ways - we can shift instead of multiplying, and + ; there's no need to align the size either + + mov r8d, dword ptr [r9 + OFFSET__MethodTable__m_BaseSize] + lea r8d, [r8d + edx * 8] + + ; No need for rounding in this case - element size is 8, and m_BaseSize is guaranteed + ; to be a multiple of 8. + + inc [M_GCLOCK] + jnz JIT_NewArr1 + + mov rax, [generation_table + 0] ; alloc_ptr + mov r10, [generation_table + 8] ; limit_ptr + + add r8, rax + + cmp r8, r10 + ja AllocFailed + + mov qword ptr [generation_table + 0], r8 ; update the alloc ptr + mov [rax], r9 + mov [M_GCLOCK], -1 + + mov dword ptr [rax + OFFSETOF__ArrayBase__m_NumComponents], edx + +ifdef _DEBUG + call DEBUG_TrialAllocSetAppDomain_NoScratchArea +endif ; _DEBUG + + ret + + AllocFailed: + mov [M_GCLOCK], -1 + + OversizedArray: + jmp JIT_NewArr1 +LEAF_END JIT_NewArr1OBJ_UP, _TEXT + + +NESTED_ENTRY JIT_GetSharedNonGCStaticBase_Slow, _TEXT + alloc_stack MIN_SIZE + END_PROLOGUE + + ; Check if rcx (moduleDomainID) is not a moduleID + test rcx, 1 + jz HaveLocalModule + + CALL_GETAPPDOMAIN + + ; Get the LocalModule + mov rax, [rax + OFFSETOF__AppDomain__m_sDomainLocalBlock + OFFSETOF__DomainLocalBlock__m_pModuleSlots] + ; rcx will always be odd, so: rcx * 4 - 4 <=> (rcx >> 1) * 8 + mov rcx, [rax + rcx * 4 - 4] + + HaveLocalModule: + ; If class is not initialized, bail to C++ helper + test [rcx + OFFSETOF__DomainLocalModule__m_pDataBlob + rdx], 1 + jz CallHelper + + mov rax, rcx + add rsp, MIN_SIZE + ret + + align 16 + CallHelper: + ; Tail call Jit_GetSharedNonGCStaticBase_Helper + add rsp, MIN_SIZE + jmp JIT_GetSharedNonGCStaticBase_Helper +NESTED_END JIT_GetSharedNonGCStaticBase_Slow, _TEXT + +NESTED_ENTRY JIT_GetSharedNonGCStaticBaseNoCtor_Slow, _TEXT + alloc_stack MIN_SIZE + END_PROLOGUE + + ; Check if rcx (moduleDomainID) is not a moduleID + test rcx, 1 + jz HaveLocalModule + + CALL_GETAPPDOMAIN + + ; Get the LocalModule + mov rax, [rax + OFFSETOF__AppDomain__m_sDomainLocalBlock + OFFSETOF__DomainLocalBlock__m_pModuleSlots] + ; rcx will always be odd, so: rcx * 4 - 4 <=> (rcx >> 1) * 8 + mov rax, [rax + rcx * 4 - 4] + + add rsp, MIN_SIZE + ret + + align 16 + HaveLocalModule: + mov rax, rcx + add rsp, MIN_SIZE + ret +NESTED_END JIT_GetSharedNonGCStaticBaseNoCtor_Slow, _TEXT + +NESTED_ENTRY JIT_GetSharedGCStaticBase_Slow, _TEXT + alloc_stack MIN_SIZE + END_PROLOGUE + + ; Check if rcx (moduleDomainID) is not a moduleID + test rcx, 1 + jz HaveLocalModule + + CALL_GETAPPDOMAIN + + ; Get the LocalModule + mov rax, [rax + OFFSETOF__AppDomain__m_sDomainLocalBlock + OFFSETOF__DomainLocalBlock__m_pModuleSlots] + ; rcx will always be odd, so: rcx * 4 - 4 <=> (rcx >> 1) * 8 + mov rcx, [rax + rcx * 4 - 4] + + HaveLocalModule: + ; If class is not initialized, bail to C++ helper + test [rcx + OFFSETOF__DomainLocalModule__m_pDataBlob + rdx], 1 + jz CallHelper + + mov rax, [rcx + OFFSETOF__DomainLocalModule__m_pGCStatics] + + add rsp, MIN_SIZE + ret + + align 16 + CallHelper: + ; Tail call Jit_GetSharedGCStaticBase_Helper + add rsp, MIN_SIZE + jmp JIT_GetSharedGCStaticBase_Helper +NESTED_END JIT_GetSharedGCStaticBase_Slow, _TEXT + +NESTED_ENTRY JIT_GetSharedGCStaticBaseNoCtor_Slow, 
_TEXT
+        alloc_stack     MIN_SIZE
+        END_PROLOGUE
+
+        ; Check if rcx (moduleDomainID) is not a moduleID
+        test    rcx, 1
+        jz      HaveLocalModule
+
+        CALL_GETAPPDOMAIN
+
+        ; Get the LocalModule
+        mov     rax, [rax + OFFSETOF__AppDomain__m_sDomainLocalBlock + OFFSETOF__DomainLocalBlock__m_pModuleSlots]
+        ; rcx will always be odd, so: rcx * 4 - 4 <=> (rcx >> 1) * 8
+        mov     rcx, [rax + rcx * 4 - 4]
+
+    HaveLocalModule:
+        mov     rax, [rcx + OFFSETOF__DomainLocalModule__m_pGCStatics]
+
+        add     rsp, MIN_SIZE
+        ret
+NESTED_END JIT_GetSharedGCStaticBaseNoCtor_Slow, _TEXT
+
+
+MON_ENTER_STACK_SIZE                 equ 00000020h
+MON_EXIT_STACK_SIZE                  equ 00000068h
+
+ifdef MON_DEBUG
+ifdef TRACK_SYNC
+MON_ENTER_STACK_SIZE_INLINEGETTHREAD equ 00000020h
+MON_EXIT_STACK_SIZE_INLINEGETTHREAD  equ 00000068h
+endif
+endif
+
+BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX     equ 08000000h   ; syncblk.h
+BIT_SBLK_IS_HASHCODE                 equ 04000000h   ; syncblk.h
+BIT_SBLK_SPIN_LOCK                   equ 10000000h   ; syncblk.h
+
+SBLK_MASK_LOCK_THREADID              equ 000003FFh   ; syncblk.h
+SBLK_LOCK_RECLEVEL_INC               equ 00000400h   ; syncblk.h
+SBLK_MASK_LOCK_RECLEVEL              equ 0000FC00h   ; syncblk.h
+
+MASK_SYNCBLOCKINDEX                  equ 03FFFFFFh   ; syncblk.h
+STATE_CHECK                          equ 0FFFFFFFEh
+
+MT_CTX_PROXY_FLAG                    equ 10000000h
+
+g_pSyncTable    equ ?g_pSyncTable@@3PEAVSyncTableEntry@@EA
+g_SystemInfo    equ ?g_SystemInfo@@3U_SYSTEM_INFO@@A
+g_SpinConstants equ ?g_SpinConstants@@3USpinConstants@@A
+
+extern g_pSyncTable:QWORD
+extern g_SystemInfo:QWORD
+extern g_SpinConstants:QWORD
+
+; JITutil_MonEnterWorker(Object* obj, BYTE* pbLockTaken)
+extern JITutil_MonEnterWorker:proc
+; JITutil_MonTryEnter(Object* obj, INT32 timeout, BYTE* pbLockTaken)
+extern JITutil_MonTryEnter:proc
+; JITutil_MonExitWorker(Object* obj, BYTE* pbLockTaken)
+extern JITutil_MonExitWorker:proc
+; JITutil_MonSignal(AwareLock* lock, BYTE* pbLockTaken)
+extern JITutil_MonSignal:proc
+; JITutil_MonContention(AwareLock* lock, BYTE* pbLockTaken)
+extern JITutil_MonContention:proc
+
+ifdef _DEBUG
+MON_DEBUG equ 1
+endif
+
+ifdef MON_DEBUG
+ifdef TRACK_SYNC
+extern EnterSyncHelper:proc
+extern LeaveSyncHelper:proc
+endif
+endif
+
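+; Annotation: the object header dword the monitor helpers test below packs
+; several fields; a sketch of the layout implied by the constants above:
+;
+;   bits  0-9   thread id of the thin-lock owner  (SBLK_MASK_LOCK_THREADID)
+;   bits 10-15  thin-lock recursion level         (SBLK_MASK_LOCK_RECLEVEL)
+;   bit  26     header holds a hash code          (BIT_SBLK_IS_HASHCODE)
+;   bit  27     header holds hash/syncblock index (BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX)
+;   bit  28     header spin lock                  (BIT_SBLK_SPIN_LOCK)
+;
+; e.g. a header of 00000403h is a thin lock owned by thread id 3 with one
+; recursive acquisition (403h = SBLK_LOCK_RECLEVEL_INC + 3).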
+
+; This is a frameless helper for entering a monitor on an object.
+; The object is in ARGUMENT_REG1. This tries the normal case (no
+; blocking or object allocation) in line and calls a framed helper
+; for the other cases.
+;
+; EXTERN_C void JIT_MonEnterWorker_Slow(Object* obj, /*OUT*/ BYTE* pbLockTaken)
+NESTED_ENTRY JIT_MonEnterWorker_Slow, _TEXT
+        push_nonvol_reg rsi
+
+        alloc_stack     MON_ENTER_STACK_SIZE
+
+        save_reg_postrsp    rcx, MON_ENTER_STACK_SIZE + 10h + 0h
+        save_reg_postrsp    rdx, MON_ENTER_STACK_SIZE + 10h + 8h
+        save_reg_postrsp    r8,  MON_ENTER_STACK_SIZE + 10h + 10h
+        save_reg_postrsp    r9,  MON_ENTER_STACK_SIZE + 10h + 18h
+
+        END_PROLOGUE
+
+        ; Check if the instance is NULL
+        test    rcx, rcx
+        jz      FramedLockHelper
+
+        ; Put pbLockTaken in rsi, this can be null
+        mov     rsi, rdx
+
+        ; We store the thread object in r11
+        CALL_GETTHREAD
+        mov     r11, rax
+
+        ; Initialize delay value for retry with exponential backoff
+        mov     r10d, dword ptr [g_SpinConstants + OFFSETOF__g_SpinConstants__dwInitialDuration]
+
+        ; Check if we can abort here
+        mov     eax, dword ptr [r11 + OFFSETOF__Thread__m_State]
+        and     eax, THREAD_CATCHATSAFEPOINT_BITS
+        ; Go through the slow code path to initiate ThreadAbort
+        jnz     FramedLockHelper
+
+        ; r8 will hold the syncblockindex address
+        lea     r8, [rcx - OFFSETOF__ObjHeader__SyncBlkIndex]
+
+    RetryThinLock:
+        ; Fetch the syncblock dword
+        mov     eax, dword ptr [r8]
+
+        ; Check whether we have the "thin lock" layout, the lock is free and the spin lock bit is not set
+        test    eax, BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX + BIT_SBLK_SPIN_LOCK + SBLK_MASK_LOCK_THREADID + SBLK_MASK_LOCK_RECLEVEL
+        jnz     NeedMoreTests
+
+        ; Everything is fine - get the thread id to store in the lock
+        mov     edx, dword ptr [r11 + OFFSETOF__Thread__m_ThreadId]
+
+        ; If the thread id is too large, we need a syncblock for sure
+        cmp     edx, SBLK_MASK_LOCK_THREADID
+        ja      FramedLockHelper
+
+        ; We want to store a new value with the current thread id set in the low 10 bits
+        or      edx, eax
+        lock cmpxchg dword ptr [r8], edx
+        jnz     PrepareToWaitThinLock
+
+        ; Everything went fine and we're done
+        add     dword ptr [r11 + OFFSETOF__Thread__m_dwLockCount], 1
+
+        ; Done, leave and set pbLockTaken if we have it
+        jmp     LockTaken
+
+    NeedMoreTests:
+        ; OK, not the simple case, find out which case it is
+        test    eax, BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX
+        jnz     HaveHashOrSyncBlockIndex
+
+        ; The header is transitioning, treat this as if the lock was taken
+        test    eax, BIT_SBLK_SPIN_LOCK
+        jnz     PrepareToWaitThinLock
+
+        ; Here we know we have the "thin lock" layout, but the lock is not free.
+        ; It could still be the recursion case, compare the thread id to check
+        mov     edx, eax
+        and     edx, SBLK_MASK_LOCK_THREADID
+        cmp     edx, dword ptr [r11 + OFFSETOF__Thread__m_ThreadId]
+        jne     PrepareToWaitThinLock
+
+        ; Ok, the thread id matches, it's the recursion case.
+        ; Bump up the recursion level and check for overflow
+        lea     edx, [eax + SBLK_LOCK_RECLEVEL_INC]
+        test    edx, SBLK_MASK_LOCK_RECLEVEL
+        jz      FramedLockHelper
+
+        ; Try to put the new recursion level back. If the header was changed in the meantime
+        ; we need a full retry, because the layout could have changed
+        lock cmpxchg dword ptr [r8], edx
+        jnz     RetryHelperThinLock
+
+        ; Done, leave and set pbLockTaken if we have it
+        jmp     LockTaken
+
+    PrepareToWaitThinLock:
+        ; If we are on an MP system, we try spinning for a certain number of iterations
+        cmp     dword ptr [g_SystemInfo + OFFSETOF__g_SystemInfo__dwNumberOfProcessors], 1
+        jle     FramedLockHelper
+
+        ; Exponential backoff; delay by approximately 2*r10 clock cycles
+        mov     eax, r10d
+    delayLoopThinLock:
+        pause   ; indicate to the CPU that we are spin waiting
+        sub     eax, 1
+        jnz     delayLoopThinLock
+
+        ; Next time, wait a factor longer
+        imul    r10d, dword ptr [g_SpinConstants + OFFSETOF__g_SpinConstants__dwBackoffFactor]
+
+        cmp     r10d, dword ptr [g_SpinConstants + OFFSETOF__g_SpinConstants__dwMaximumDuration]
+        jle     RetryHelperThinLock
+
+        jmp     FramedLockHelper
+
+    RetryHelperThinLock:
+        jmp     RetryThinLock
+
+    HaveHashOrSyncBlockIndex:
+        ; If we have a hash code already, we need to create a sync block
+        test    eax, BIT_SBLK_IS_HASHCODE
+        jnz     FramedLockHelper
+
+        ; OK, we have a sync block index, so just mask ("and") out the top bits and grab the sync block index
+        and     eax, MASK_SYNCBLOCKINDEX
+
+        ; Get the sync block pointer
+        mov     rdx, qword ptr [g_pSyncTable]
+        shl     eax, 4h
+        mov     rdx, [rdx + rax + OFFSETOF__SyncTableEntry__m_SyncBlock]
+
+        ; Check if the sync block has been allocated
+        test    rdx, rdx
+        jz      FramedLockHelper
+
+        ; Get a pointer to the lock object
+        lea     rdx, [rdx + OFFSETOF__SyncBlock__m_Monitor]
+
+        ; Attempt to acquire the lock
+    RetrySyncBlock:
+        mov     eax, dword ptr [rdx + OFFSETOF__AwareLock__m_MonitorHeld]
+        test    eax, eax
+        jne     HaveWaiters
+
+        ; Common case, lock isn't held and there are no waiters. Attempt to
+        ; gain ownership ourselves
+        xor     ecx, ecx
+        inc     ecx
+        lock cmpxchg dword ptr [rdx + OFFSETOF__AwareLock__m_MonitorHeld], ecx
+        jnz     RetryHelperSyncBlock
+
+        ; Success. Save the thread object in the lock and increment the use count
+        mov     qword ptr [rdx + OFFSETOF__AwareLock__m_HoldingThread], r11
+        add     dword ptr [rdx + OFFSETOF__AwareLock__m_Recursion], 1
+        add     dword ptr [r11 + OFFSETOF__Thread__m_dwLockCount], 1
+
+ifdef MON_DEBUG
+ifdef TRACK_SYNC
+        mov     rcx, [rsp + MON_ENTER_STACK_SIZE + 8h]       ; return address
+        ; void EnterSyncHelper(UINT_PTR caller, AwareLock* lock)
+        call    EnterSyncHelper
+endif
+endif
+
+        ; Done, leave and set pbLockTaken if we have it
+        jmp     LockTaken
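+        ; Annotation: m_MonitorHeld appears to encode "lock owned" in bit 0 and
+        ; the count of waiting threads in the bits above it; that is why the
+        ; uncontended acquire is a cmpxchg of 0 -> 1, release subtracts 1, and
+        ; STATE_CHECK (0FFFFFFFEh) masks off the owned bit to see whether any
+        ; waiters remain to be signaled.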
+        ; It's possible to get here with waiters but no lock held, but in this
+        ; case a signal is about to be fired which will wake up the waiter. So
+        ; for fairness sake we should wait too.
+        ; Check first for recursive lock attempts on the same thread.
+    HaveWaiters:
+        ; Is mutex already owned by current thread?
+        cmp     [rdx + OFFSETOF__AwareLock__m_HoldingThread], r11
+        jne     PrepareToWait
+
+        ; Yes, bump our use count.
+        add     dword ptr [rdx + OFFSETOF__AwareLock__m_Recursion], 1
+
+ifdef MON_DEBUG
+ifdef TRACK_SYNC
+        mov     rcx, [rsp + MON_ENTER_STACK_SIZE + 8h]       ; return address
+        ; void EnterSyncHelper(UINT_PTR caller, AwareLock* lock)
+        call    EnterSyncHelper
+endif
+endif
+        ; Done, leave and set pbLockTaken if we have it
+        jmp     LockTaken
+
+    PrepareToWait:
+        ; If we are on a MP system we try spinning for a certain number of iterations
+        cmp     dword ptr [g_SystemInfo + OFFSETOF__g_SystemInfo__dwNumberOfProcessors], 1
+        jle     HaveWaiters1
+
+        ; Exponential backoff: delay by approximately 2*r10 clock cycles
+        mov     eax, r10d
+    delayLoop:
+        pause   ; indicate to the CPU that we are spin waiting
+        sub     eax, 1
+        jnz     delayLoop
+
+        ; Next time, wait a factor longer
+        imul    r10d, dword ptr [g_SpinConstants + OFFSETOF__g_SpinConstants__dwBackoffFactor]
+
+        cmp     r10d, dword ptr [g_SpinConstants + OFFSETOF__g_SpinConstants__dwMaximumDuration]
+        jle     RetrySyncBlock
+
+    HaveWaiters1:
+        mov     rcx, rdx
+        mov     rdx, rsi
+        add     rsp, MON_ENTER_STACK_SIZE
+        pop     rsi
+        ; void JITutil_MonContention(AwareLock* lock, BYTE* pbLockTaken)
+        jmp     JITutil_MonContention
+
+    RetryHelperSyncBlock:
+        jmp     RetrySyncBlock
+
+    FramedLockHelper:
+        mov     rdx, rsi
+        add     rsp, MON_ENTER_STACK_SIZE
+        pop     rsi
+        ; void JITutil_MonEnterWorker(Object* obj, BYTE* pbLockTaken)
+        jmp     JITutil_MonEnterWorker
+
+        align   16
+        ; This is sensitive to the potential that pbLockTaken is NULL
+    LockTaken:
+        test    rsi, rsi
+        jz      LockTaken_Exit
+        mov     byte ptr [rsi], 1
+    LockTaken_Exit:
+        add     rsp, MON_ENTER_STACK_SIZE
+        pop     rsi
+        ret
+NESTED_END JIT_MonEnterWorker_Slow, _TEXT
+
+; This is a frameless helper for exiting a monitor on an object.
+; The object is in ARGUMENT_REG1. This tries the normal case (no
+; blocking or object allocation) in line and calls a framed helper
+; for the other cases.
+;
+; void JIT_MonExitWorker_Slow(Object* obj, BYTE* pbLockTaken)
+NESTED_ENTRY JIT_MonExitWorker_Slow, _TEXT
+        alloc_stack     MON_EXIT_STACK_SIZE
+
+        save_reg_postrsp    rcx, MON_EXIT_STACK_SIZE + 8h + 0h
+        save_reg_postrsp    rdx, MON_EXIT_STACK_SIZE + 8h + 8h
+        save_reg_postrsp    r8,  MON_EXIT_STACK_SIZE + 8h + 10h
+        save_reg_postrsp    r9,  MON_EXIT_STACK_SIZE + 8h + 18h
+
+        END_PROLOGUE
+
+        ; pbLockTaken is stored in r10
+        mov     r10, rdx
+
+        ; if pbLockTaken is NULL then we got here without a state variable, avoid the
+        ; next comparison in that case as it will AV
+        test    rdx, rdx
+        jz      Null_pbLockTaken
+
+        ; If the lock wasn't taken then we bail quickly without doing anything
+        cmp     byte ptr [rdx], 0
+        je      LockNotTaken
+
+    Null_pbLockTaken:
+        ; Check if the instance is null
+        test    rcx, rcx
+        jz      FramedLockHelper
+
+        ; The Thread obj address is stored in r11
+        CALL_GETTHREAD
+        mov     r11, rax
+
+        ; r8 will hold the syncblockindex address
+        lea     r8, [rcx - OFFSETOF__ObjHeader__SyncBlkIndex]
+
+    RetryThinLock:
+        ; Fetch the syncblock dword
+        mov     eax, dword ptr [r8]
+        test    eax, BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX + BIT_SBLK_SPIN_LOCK
+        jnz     NeedMoreTests
+
+        ; Ok, we have a "thin lock" layout - check whether the thread id matches
+        mov     edx, eax
+        and     edx, SBLK_MASK_LOCK_THREADID
+        cmp     edx, dword ptr [r11 + OFFSETOF__Thread__m_ThreadId]
+        jne     FramedLockHelper
+
+        ; check the recursion level
+        test    eax, SBLK_MASK_LOCK_RECLEVEL
+        jne     DecRecursionLevel
+
+        ; It's zero -- we're leaving the lock.
+        ; So try to put back a zero thread id.
+        ; edx and eax match in the thread id bits, and edx is zero elsewhere, so the xor is sufficient
+        xor     edx, eax
+        lock cmpxchg dword ptr [r8], edx
+        jnz     RetryHelperThinLock
+
+        ; Dec the dwLockCount on the thread
+        sub     dword ptr [r11 + OFFSETOF__Thread__m_dwLockCount], 1
+
+        ; Done, leave and set pbLockTaken if we have it
+        jmp     LockReleased
+
+    DecRecursionLevel:
+        lea     edx, [eax - SBLK_LOCK_RECLEVEL_INC]
+        lock cmpxchg dword ptr [r8], edx
+        jnz     RetryHelperThinLock
+
+        ; We're done, leave and set pbLockTaken if we have it
+        jmp     LockReleased
+
+    NeedMoreTests:
+        ; Forward all special cases to the slow helper
+        test    eax, BIT_SBLK_IS_HASHCODE + BIT_SBLK_SPIN_LOCK
+        jnz     FramedLockHelper
+
+        ; Get the sync block index and use it to compute the sync block pointer
+        mov     rdx, qword ptr [g_pSyncTable]
+        and     eax, MASK_SYNCBLOCKINDEX
+        shl     eax, 4
+        mov     rdx, [rdx + rax + OFFSETOF__SyncTableEntry__m_SyncBlock]
+
+        ; Was there a sync block?
+        test    rdx, rdx
+        jz      FramedLockHelper
+
+        ; Get a pointer to the lock object.
+        lea     rdx, [rdx + OFFSETOF__SyncBlock__m_Monitor]
+
+        ; Check if the lock is held.
+        cmp     qword ptr [rdx + OFFSETOF__AwareLock__m_HoldingThread], r11
+        jne     FramedLockHelper
+
+ifdef MON_DEBUG
+ifdef TRACK_SYNC
+        mov     [rsp + 28h], rcx
+        mov     [rsp + 30h], rdx
+        mov     [rsp + 38h], r10
+        mov     [rsp + 40h], r11
+
+        mov     rcx, [rsp + MON_EXIT_STACK_SIZE ]       ; return address
+        ; void LeaveSyncHelper(UINT_PTR caller, AwareLock* lock)
+        call    LeaveSyncHelper
+
+        mov     rcx, [rsp + 28h]
+        mov     rdx, [rsp + 30h]
+        mov     r10, [rsp + 38h]
+        mov     r11, [rsp + 40h]
+endif
+endif
+
+        ; Reduce our recursion count
+        sub     dword ptr [rdx + OFFSETOF__AwareLock__m_Recursion], 1
+        jz      LastRecursion
+
+        ; Done, leave and set pbLockTaken if we have it
+        jmp     LockReleased
+
+    RetryHelperThinLock:
+        jmp     RetryThinLock
+
+    FramedLockHelper:
+        mov     rdx, r10
+        add     rsp, MON_EXIT_STACK_SIZE
+        ; void JITutil_MonExitWorker(Object* obj, BYTE* pbLockTaken)
+        jmp     JITutil_MonExitWorker
+
+    LastRecursion:
+ifdef MON_DEBUG
+ifdef TRACK_SYNC
+        mov     rax, [rdx + OFFSETOF__AwareLock__m_HoldingThread]
+endif
+endif
+
+        sub     dword ptr [r11 + OFFSETOF__Thread__m_dwLockCount], 1
+        mov     qword ptr [rdx + OFFSETOF__AwareLock__m_HoldingThread], 0
+
+    Retry:
+        mov     eax, dword ptr [rdx + OFFSETOF__AwareLock__m_MonitorHeld]
+        lea     r9d, [eax - 1]
+        lock cmpxchg dword ptr [rdx + OFFSETOF__AwareLock__m_MonitorHeld], r9d
+        jne     RetryHelper
+
+        test    eax, STATE_CHECK
+        jne     MustSignal
+
+        ; Done, leave and set pbLockTaken if we have it
+        jmp     LockReleased
+
+    MustSignal:
+        mov     rcx, rdx
+        mov     rdx, r10
+        add     rsp, MON_EXIT_STACK_SIZE
+        ; void JITutil_MonSignal(AwareLock* lock, BYTE* pbLockTaken)
+        jmp     JITutil_MonSignal
+
+    RetryHelper:
+        jmp     Retry
+
+    LockNotTaken:
+        add     rsp, MON_EXIT_STACK_SIZE
+        ret
+
+        align   16
+        ; This is sensitive to the potential that pbLockTaken is null
+    LockReleased:
+        test    r10, r10
+        jz      LockReleased_Exit
+        mov     byte ptr [r10], 0
+    LockReleased_Exit:
+        add     rsp, MON_EXIT_STACK_SIZE
+        ret
+NESTED_END JIT_MonExitWorker_Slow, _TEXT
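+
+; Annotation: the timeout convention in the try-enter helper below: -1 waits
+; forever, 0 fails immediately (the TimeoutZero path), anything smaller than -1
+; is rejected up front, and a positive timeout is forwarded to the framed
+; helper JITutil_MonTryEnter once the inline spinning gives up.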
+
+; This is a frameless helper for trying to enter a monitor on an object.
+; The object is in ARGUMENT_REG1 and a timeout in ARGUMENT_REG2. This tries the
+; normal case (no object allocation) in line and calls a framed helper for the
+; other cases.
+;
+; void JIT_MonTryEnter_Slow(Object* obj, INT32 timeOut, BYTE* pbLockTaken)
+NESTED_ENTRY JIT_MonTryEnter_Slow, _TEXT
+        push_nonvol_reg rsi
+
+        alloc_stack     MON_ENTER_STACK_SIZE
+
+        save_reg_postrsp    rcx, MON_ENTER_STACK_SIZE + 10h + 0h
+        save_reg_postrsp    rdx, MON_ENTER_STACK_SIZE + 10h + 8h
+        save_reg_postrsp    r8,  MON_ENTER_STACK_SIZE + 10h + 10h
+        save_reg_postrsp    r9,  MON_ENTER_STACK_SIZE + 10h + 18h
+
+        END_PROLOGUE
+
+        mov     rsi, rdx
+
+        ; Check if the instance is NULL
+        test    rcx, rcx
+        jz      FramedLockHelper
+
+        ; Check if the timeout looks valid
+        cmp     rdx, -1
+        jl      FramedLockHelper
+
+        ; We store the thread object in r11
+        CALL_GETTHREAD
+        mov     r11, rax
+
+        ; Initialize delay value for retry with exponential backoff
+        mov     r10d, dword ptr [g_SpinConstants + OFFSETOF__g_SpinConstants__dwInitialDuration]
+
+        ; Check if we can abort here
+        mov     eax, dword ptr [r11 + OFFSETOF__Thread__m_State]
+        and     eax, THREAD_CATCHATSAFEPOINT_BITS
+        ; Go through the slow code path to initiate ThreadAbort
+        jnz     FramedLockHelper
+
+        ; r9 will hold the syncblockindex address
+        lea     r9, [rcx - OFFSETOF__ObjHeader__SyncBlkIndex]
+
+    RetryThinLock:
+        ; Fetch the syncblock dword
+        mov     eax, dword ptr [r9]
+
+        ; Check whether we have the "thin lock" layout, the lock is free and the spin lock bit is not set
+        test    eax, BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX + BIT_SBLK_SPIN_LOCK + SBLK_MASK_LOCK_THREADID + SBLK_MASK_LOCK_RECLEVEL
+        jne     NeedMoreTests
+
+        ; Everything is fine - get the thread id to store in the lock
+        mov     edx, dword ptr [r11 + OFFSETOF__Thread__m_ThreadId]
+
+        ; If the thread id is too large, we need a syncblock for sure
+        cmp     edx, SBLK_MASK_LOCK_THREADID
+        ja      FramedLockHelper
+
+        ; We want to store a new value with the current thread id set in the low 10 bits
+        or      edx, eax
+        lock cmpxchg dword ptr [r9], edx
+        jnz     RetryHelperThinLock
+
+        ; Got the lock, everything is fine
+        add     dword ptr [r11 + OFFSETOF__Thread__m_dwLockCount], 1
+        ; Return TRUE
+        mov     byte ptr [r8], 1
+        add     rsp, MON_ENTER_STACK_SIZE
+        pop     rsi
+        ret
+
+    NeedMoreTests:
+        ; OK, not the simple case, find out which case it is
+        test    eax, BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX
+        jnz     HaveHashOrSyncBlockIndex
+
+        ; The header is transitioning, retry the thin lock
+        test    eax, BIT_SBLK_SPIN_LOCK
+        jnz     RetryHelperThinLock
+
+        ; Here we know we have the "thin lock" layout, but the lock is not free.
+        ; It could still be the recursion case, compare the thread id to check
+        mov     edx, eax
+        and     edx, SBLK_MASK_LOCK_THREADID
+        cmp     edx, dword ptr [r11 + OFFSETOF__Thread__m_ThreadId]
+        jne     PrepareToWaitThinLock
+
+        ; Ok, the thread id matches, it's the recursion case.
+        ; Bump up the recursion level and check for overflow
+        lea     edx, [eax + SBLK_LOCK_RECLEVEL_INC]
+        test    edx, SBLK_MASK_LOCK_RECLEVEL
+        jz      FramedLockHelper
+
+        ; Try to put the new recursion level back. If the header was changed in the meantime
+        ; we need a full retry, because the layout could have changed
+        lock cmpxchg dword ptr [r9], edx
+        jnz     RetryHelperThinLock
+
+        ; Everything went fine and we're done, return TRUE
+        mov     byte ptr [r8], 1
+        add     rsp, MON_ENTER_STACK_SIZE
+        pop     rsi
+        ret
+
+    PrepareToWaitThinLock:
+        ; Return failure if timeout is zero
+        test    rsi, rsi
+        jz      TimeoutZero
+
+        ; If we are on an MP system, we try spinning for a certain number of iterations
+        cmp     dword ptr [g_SystemInfo + OFFSETOF__g_SystemInfo__dwNumberOfProcessors], 1
+        jle     FramedLockHelper
+
+        ; Exponential backoff; delay by approximately 2*r10d clock cycles
+        mov     eax, r10d
+    DelayLoopThinLock:
+        pause   ; indicate to the CPU that we are spin waiting
+        sub     eax, 1
+        jnz     DelayLoopThinLock
+
+        ; Next time, wait a factor longer
+        imul    r10d, dword ptr [g_SpinConstants + OFFSETOF__g_SpinConstants__dwBackoffFactor]
+
+        cmp     r10d, dword ptr [g_SpinConstants + OFFSETOF__g_SpinConstants__dwMaximumDuration]
+        jle     RetryHelperThinLock
+
+        jmp     FramedLockHelper
+
+    RetryHelperThinLock:
+        jmp     RetryThinLock
+
+    HaveHashOrSyncBlockIndex:
+        ; If we have a hash code already, we need to create a sync block
+        test    eax, BIT_SBLK_IS_HASHCODE
+        jnz     FramedLockHelper
+
+        ; OK, we have a sync block index, so just mask ("and") out the top bits and grab the sync block index
+        and     eax, MASK_SYNCBLOCKINDEX
+
+        ; Get the sync block pointer
+        mov     rdx, qword ptr [g_pSyncTable]
+        shl     eax, 4
+        mov     rdx, [rdx + rax + OFFSETOF__SyncTableEntry__m_SyncBlock]
+
+        ; Check if the sync block has been allocated
+        test    rdx, rdx
+        jz      FramedLockHelper
+
+        ; Get a pointer to the lock object
+        lea     rdx, [rdx + OFFSETOF__SyncBlock__m_Monitor]
+
+    RetrySyncBlock:
+        ; Attempt to acquire the lock
+        mov     eax, dword ptr [rdx + OFFSETOF__AwareLock__m_MonitorHeld]
+        test    eax, eax
+        jne     HaveWaiters
+
+        ; Common case, lock isn't held and there are no waiters. Attempt to
+        ; gain ownership ourselves
+        xor     ecx, ecx
+        inc     ecx
+        lock cmpxchg dword ptr [rdx + OFFSETOF__AwareLock__m_MonitorHeld], ecx
+        jnz     RetryHelperSyncBlock
+
+        ; Success. Save the thread object in the lock and increment the use count
+        mov     qword ptr [rdx + OFFSETOF__AwareLock__m_HoldingThread], r11
+        add     dword ptr [rdx + OFFSETOF__AwareLock__m_Recursion], 1
+        add     dword ptr [r11 + OFFSETOF__Thread__m_dwLockCount], 1
+
+ifdef MON_DEBUG
+ifdef TRACK_SYNC
+        mov     rcx, [rsp + MON_ENTER_STACK_SIZE + 8h]       ; return address
+        ; void EnterSyncHelper(UINT_PTR caller, AwareLock* lock)
+        call    EnterSyncHelper
+endif
+endif
+
+        ; Return TRUE
+        mov     byte ptr [r8], 1
+        add     rsp, MON_ENTER_STACK_SIZE
+        pop     rsi
+        ret
+
+        ; It's possible to get here with waiters but no lock held, but in this
+        ; case a signal is about to be fired which will wake up the waiter. So
+        ; for fairness sake we should wait too.
+        ; Check first for recursive lock attempts on the same thread.
+    HaveWaiters:
+        ; Is mutex already owned by current thread?
+        cmp     [rdx + OFFSETOF__AwareLock__m_HoldingThread], r11
+        jne     PrepareToWait
+
+        ; Yes, bump our use count.
+        add     dword ptr [rdx + OFFSETOF__AwareLock__m_Recursion], 1
+
+ifdef MON_DEBUG
+ifdef TRACK_SYNC
+        mov     rcx, [rsp + MON_ENTER_STACK_SIZE + 8h]       ; return address
+        ; void EnterSyncHelper(UINT_PTR caller, AwareLock* lock)
+        call    EnterSyncHelper
+endif
+endif
+
+        ; Return TRUE
+        mov     byte ptr [r8], 1
+        add     rsp, MON_ENTER_STACK_SIZE
+        pop     rsi
+        ret
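+        ; Annotation: the spin loops in these monitor helpers implement
+        ; exponential backoff driven by g_SpinConstants; in C-style pseudocode:
+        ;
+        ;   for (delay = dwInitialDuration;
+        ;        delay <= dwMaximumDuration;
+        ;        delay *= dwBackoffFactor)
+        ;   {
+        ;       for (i = delay; i != 0; i--) pause();
+        ;       if (try_acquire()) return;
+        ;   }
+        ;   // still contended: hand off to the blocking framed helper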
+    PrepareToWait:
+        ; Return failure if timeout is zero
+        test    rsi, rsi
+        jz      TimeoutZero
+
+        ; If we are on an MP system, we try spinning for a certain number of iterations
+        cmp     dword ptr [g_SystemInfo + OFFSETOF__g_SystemInfo__dwNumberOfProcessors], 1
+        jle     Block
+
+        ; Exponential backoff; delay by approximately 2*r10d clock cycles
+        mov     eax, r10d
+    DelayLoop:
+        pause   ; indicate to the CPU that we are spin waiting
+        sub     eax, 1
+        jnz     DelayLoop
+
+        ; Next time, wait a factor longer
+        imul    r10d, dword ptr [g_SpinConstants + OFFSETOF__g_SpinConstants__dwBackoffFactor]
+
+        cmp     r10d, dword ptr [g_SpinConstants + OFFSETOF__g_SpinConstants__dwMaximumDuration]
+        jle     RetrySyncBlock
+
+        jmp     Block
+
+    TimeoutZero:
+        ; Return FALSE
+        mov     byte ptr [r8], 0
+        add     rsp, MON_ENTER_STACK_SIZE
+        pop     rsi
+        ret
+
+    RetryHelperSyncBlock:
+        jmp     RetrySyncBlock
+
+    Block:
+        ; In the Block case we've trashed RCX, restore it
+        mov     rcx, [rsp + MON_ENTER_STACK_SIZE + 10h]
+    FramedLockHelper:
+        mov     rdx, rsi
+        add     rsp, MON_ENTER_STACK_SIZE
+        pop     rsi
+        ; void JITutil_MonTryEnter(Object* obj, INT32 timeout, BYTE* pbLockTaken)
+        jmp     JITutil_MonTryEnter
+
+NESTED_END JIT_MonTryEnter_Slow, _TEXT
+
+MON_ENTER_STATIC_RETURN_SUCCESS macro
+        ; pbLockTaken is never null for static helpers
+        mov     byte ptr [rdx], 1
+        add     rsp, MIN_SIZE
+        ret
+
+        endm
+
+MON_EXIT_STATIC_RETURN_SUCCESS macro
+        ; pbLockTaken is never null for static helpers
+        mov     byte ptr [rdx], 0
+        add     rsp, MIN_SIZE
+        ret
+
+        endm
+
+
+; This is a frameless helper for entering a static monitor on a class.
+; The methoddesc is in ARGUMENT_REG1. This tries the normal case (no
+; blocking or object allocation) in line and calls a framed helper
+; for the other cases.
+;
+; void JIT_MonEnterStatic_Slow(AwareLock *lock, BYTE *pbLockTaken)
+NESTED_ENTRY JIT_MonEnterStatic_Slow, _TEXT
+        alloc_stack     MIN_SIZE
+        END_PROLOGUE
+
+        ; Attempt to acquire the lock
+    Retry:
+        mov     eax, dword ptr [rcx + OFFSETOF__AwareLock__m_MonitorHeld]
+        test    eax, eax
+        jne     HaveWaiters
+
+        ; Common case; lock isn't held and there are no waiters. Attempt to
+        ; gain ownership by ourselves.
+        mov     r10d, 1
+        lock cmpxchg dword ptr [rcx + OFFSETOF__AwareLock__m_MonitorHeld], r10d
+        jnz     RetryHelper
+
+        ; Success. Save the thread object in the lock and increment the use count.
+        CALL_GETTHREAD
+
+        mov     qword ptr [rcx + OFFSETOF__AwareLock__m_HoldingThread], rax
+        add     dword ptr [rcx + OFFSETOF__AwareLock__m_Recursion], 1
+        add     dword ptr [rax + OFFSETOF__Thread__m_dwLockCount], 1
+
+ifdef MON_DEBUG
+ifdef TRACK_SYNC
+        add     rsp, MIN_SIZE
+        mov     rdx, rcx
+        mov     rcx, [rsp]
+        ; void EnterSyncHelper(UINT_PTR caller, AwareLock* lock)
+        jmp     EnterSyncHelper
+endif
+endif
+        MON_ENTER_STATIC_RETURN_SUCCESS
+
+        ; It's possible to get here with waiters but with no lock held, in this
+        ; case a signal is about to be fired which will wake up a waiter. So
+        ; for fairness sake we should wait too.
+        ; Check first for recursive lock attempts on the same thread.
+    HaveWaiters:
+        CALL_GETTHREAD
+
+        ; Is mutex already owned by current thread?
+        cmp     [rcx + OFFSETOF__AwareLock__m_HoldingThread], rax
+        jne     PrepareToWait
+
+        ; Yes, bump our use count.
+        add     dword ptr [rcx + OFFSETOF__AwareLock__m_Recursion], 1
+ifdef MON_DEBUG
+ifdef TRACK_SYNC
+        mov     rdx, rcx
+        mov     rcx, [rsp]
+        ; void EnterSyncHelper(UINT_PTR caller, AwareLock* lock)
+        add     rsp, MIN_SIZE
+        jmp     EnterSyncHelper
+endif
+endif
+        MON_ENTER_STATIC_RETURN_SUCCESS
+
+    PrepareToWait:
+        add     rsp, MIN_SIZE
+        ; void JITutil_MonContention(AwareLock* obj, BYTE* pbLockTaken)
+        jmp     JITutil_MonContention
+
+    RetryHelper:
+        jmp     Retry
+NESTED_END JIT_MonEnterStatic_Slow, _TEXT
+
+; A frameless helper for exiting a static monitor on a class.
+; The methoddesc is in ARGUMENT_REG1. This tries the normal case (no
+; blocking or object allocation) in line and calls a framed helper
+; for the other cases.
+;
+; void JIT_MonExitStatic_Slow(AwareLock *lock, BYTE *pbLockTaken)
+NESTED_ENTRY JIT_MonExitStatic_Slow, _TEXT
+        alloc_stack     MIN_SIZE
+        END_PROLOGUE
+
+ifdef MON_DEBUG
+ifdef TRACK_SYNC
+        push    rsi
+        push    rdi
+        mov     rsi, rcx
+        mov     rdi, rdx
+        mov     rdx, [rsp + 8]
+        call    LeaveSyncHelper
+        mov     rcx, rsi
+        mov     rdx, rdi
+        pop     rdi
+        pop     rsi
+endif
+endif
+
+        ; Check if lock is held
+        CALL_GETTHREAD
+
+        cmp     [rcx + OFFSETOF__AwareLock__m_HoldingThread], rax
+        jne     LockError
+
+        ; Reduce our recursion count
+        sub     dword ptr [rcx + OFFSETOF__AwareLock__m_Recursion], 1
+        jz      LastRecursion
+
+        MON_EXIT_STATIC_RETURN_SUCCESS
+
+        ; This is the last count we held on this lock, so release the lock
+    LastRecursion:
+        ; Thread* is in rax
+        sub     dword ptr [rax + OFFSETOF__Thread__m_dwLockCount], 1
+        mov     qword ptr [rcx + OFFSETOF__AwareLock__m_HoldingThread], 0
+
+    Retry:
+        mov     eax, dword ptr [rcx + OFFSETOF__AwareLock__m_MonitorHeld]
+        lea     r10d, [eax - 1]
+        lock cmpxchg dword ptr [rcx + OFFSETOF__AwareLock__m_MonitorHeld], r10d
+        jne     RetryHelper
+        test    eax, STATE_CHECK
+        jne     MustSignal
+
+        MON_EXIT_STATIC_RETURN_SUCCESS
+
+    MustSignal:
+        add     rsp, MIN_SIZE
+        ; void JITutil_MonSignal(AwareLock* lock, BYTE* pbLockTaken)
+        jmp     JITutil_MonSignal
+
+    RetryHelper:
+        jmp     Retry
+
+    LockError:
+        mov     rcx, CORINFO_SynchronizationLockException_ASM
+        add     rsp, MIN_SIZE
+        ; void JIT_InternalThrow(unsigned exceptNum)
+        jmp     JIT_InternalThrow
+NESTED_END JIT_MonExitStatic_Slow, _TEXT
+
+
+ifdef _DEBUG
+
+extern Object__DEBUG_SetAppDomain:proc
+
+;
+; IN:  rax: new object needing the AppDomain ID set.
+; OUT: rax: returns original value at entry
+;
+; all integer register state is preserved
+;
+DEBUG_TrialAllocSetAppDomain_STACK_SIZE equ MIN_SIZE + 10h
+NESTED_ENTRY DEBUG_TrialAllocSetAppDomain, _TEXT
+        push_vol_reg    rax
+        push_vol_reg    rcx
+        push_vol_reg    rdx
+        push_vol_reg    r8
+        push_vol_reg    r9
+        push_vol_reg    r10
+        push_vol_reg    r11
+        push_nonvol_reg rbx
+        alloc_stack     MIN_SIZE
+        END_PROLOGUE
+
+        mov     rbx, rax
+
+        ; get the app domain ptr
+        CALL_GETAPPDOMAIN
+
+        ; set the sync block app domain ID
+        mov     rcx, rbx
+        mov     rdx, rax
+        call    Object__DEBUG_SetAppDomain
+
+        ; epilog
+        add     rsp, MIN_SIZE
+        pop     rbx
+        pop     r11
+        pop     r10
+        pop     r9
+        pop     r8
+        pop     rdx
+        pop     rcx
+        pop     rax
+        ret
+NESTED_END DEBUG_TrialAllocSetAppDomain, _TEXT
+
+NESTED_ENTRY DEBUG_TrialAllocSetAppDomain_NoScratchArea, _TEXT
+
+        push_nonvol_reg rbp
+        set_frame       rbp, 0
+        END_PROLOGUE
+
+        sub     rsp, 20h
+        and     rsp, -16
+
+        call    DEBUG_TrialAllocSetAppDomain
+
+        lea     rsp, [rbp+0]
+        pop     rbp
+        ret
+NESTED_END DEBUG_TrialAllocSetAppDomain_NoScratchArea, _TEXT
+
+endif
+
+
+        end