Diffstat (limited to 'src/vm/i386/jithelp.asm')
-rw-r--r--  src/vm/i386/jithelp.asm  2574
1 files changed, 2574 insertions, 0 deletions
diff --git a/src/vm/i386/jithelp.asm b/src/vm/i386/jithelp.asm
new file mode 100644
index 0000000000..ac767287ee
--- /dev/null
+++ b/src/vm/i386/jithelp.asm
@@ -0,0 +1,2574 @@
+; Licensed to the .NET Foundation under one or more agreements.
+; The .NET Foundation licenses this file to you under the MIT license.
+; See the LICENSE file in the project root for more information.
+
+; ==++==
+;
+
+;
+; ==--==
+; ***********************************************************************
+; File: JIThelp.asm
+;
+; ***********************************************************************
+;
+; *** NOTE: If you make changes to this file, propagate the changes to
+; jithelp.s in this directory
+
+; This contains JITinterface routines that are 100% x86 assembly
+
+ .586
+ .model flat
+
+ include asmconstants.inc
+
+ option casemap:none
+ .code
+;
+; <TODO>@TODO Switch to g_ephemeral_low and g_ephemeral_high
+; @TODO instead of g_lowest_address, g_highest address</TODO>
+;
+
+ARGUMENT_REG1 equ ecx
+ARGUMENT_REG2 equ edx
+g_ephemeral_low TEXTEQU <_g_ephemeral_low>
+g_ephemeral_high TEXTEQU <_g_ephemeral_high>
+g_lowest_address TEXTEQU <_g_lowest_address>
+g_highest_address TEXTEQU <_g_highest_address>
+g_card_table TEXTEQU <_g_card_table>
+WriteBarrierAssert TEXTEQU <_WriteBarrierAssert@8>
+JIT_LLsh TEXTEQU <_JIT_LLsh@0>
+JIT_LRsh TEXTEQU <_JIT_LRsh@0>
+JIT_LRsz TEXTEQU <_JIT_LRsz@0>
+JIT_LMul TEXTEQU <@JIT_LMul@16>
+JIT_Dbl2LngOvf TEXTEQU <@JIT_Dbl2LngOvf@8>
+JIT_Dbl2Lng TEXTEQU <@JIT_Dbl2Lng@8>
+JIT_Dbl2IntSSE2 TEXTEQU <@JIT_Dbl2IntSSE2@8>
+JIT_Dbl2LngP4x87 TEXTEQU <@JIT_Dbl2LngP4x87@8>
+JIT_Dbl2LngSSE3 TEXTEQU <@JIT_Dbl2LngSSE3@8>
+JIT_InternalThrowFromHelper TEXTEQU <@JIT_InternalThrowFromHelper@4>
+JIT_WriteBarrierReg_PreGrow TEXTEQU <_JIT_WriteBarrierReg_PreGrow@0>
+JIT_WriteBarrierReg_PostGrow TEXTEQU <_JIT_WriteBarrierReg_PostGrow@0>
+JIT_TailCall TEXTEQU <_JIT_TailCall@0>
+JIT_TailCallLeave TEXTEQU <_JIT_TailCallLeave@0>
+JIT_TailCallVSDLeave TEXTEQU <_JIT_TailCallVSDLeave@0>
+JIT_TailCallHelper TEXTEQU <_JIT_TailCallHelper@4>
+JIT_TailCallReturnFromVSD TEXTEQU <_JIT_TailCallReturnFromVSD@0>
+
+EXTERN g_ephemeral_low:DWORD
+EXTERN g_ephemeral_high:DWORD
+EXTERN g_lowest_address:DWORD
+EXTERN g_highest_address:DWORD
+EXTERN g_card_table:DWORD
+ifdef _DEBUG
+EXTERN WriteBarrierAssert:PROC
+endif ; _DEBUG
+EXTERN JIT_InternalThrowFromHelper:PROC
+ifdef FEATURE_HIJACK
+EXTERN JIT_TailCallHelper:PROC
+endif
+EXTERN _g_TailCallFrameVptr:DWORD
+EXTERN @JIT_FailFast@0:PROC
+EXTERN _s_gsCookie:DWORD
+EXTERN @JITutil_IsInstanceOfInterface@8:PROC
+EXTERN @JITutil_ChkCastInterface@8:PROC
+EXTERN @JITutil_IsInstanceOfAny@8:PROC
+EXTERN @JITutil_ChkCastAny@8:PROC
+ifdef FEATURE_IMPLICIT_TLS
+EXTERN _GetThread@0:PROC
+endif
+
+ifdef WRITE_BARRIER_CHECK
+; Those global variables are always defined, but should be 0 for Server GC
+g_GCShadow TEXTEQU <?g_GCShadow@@3PAEA>
+g_GCShadowEnd TEXTEQU <?g_GCShadowEnd@@3PAEA>
+EXTERN g_GCShadow:DWORD
+EXTERN g_GCShadowEnd:DWORD
+INVALIDGCVALUE equ 0CCCCCCCDh
+endif
+
+ifdef FEATURE_REMOTING
+EXTERN _TransparentProxyStub_CrossContext@0:PROC
+EXTERN _InContextTPQuickDispatchAsmStub@0:PROC
+endif
+
+.686P
+.XMM
+; The following macro is needed because of a MASM issue with the
+; movsd mnemonic
+;
+$movsd MACRO op1, op2
+ LOCAL begin_movsd, end_movsd
+begin_movsd:
+ movupd op1, op2
+end_movsd:
+ org begin_movsd
+ db 0F2h
+ org end_movsd
+ENDM
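+; Note: the trick above assembles MOVUPD (66 0F 10 /r) between the two labels and
+; then overwrites its 66h prefix byte with F2h, which yields the encoding of the
+; SSE2 scalar MOVSD (F2 0F 10 /r) that MASM would otherwise parse as the string
+; instruction of the same name.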
+.586
+
+; The following macro is used to match the JIT's
+; multi-byte NOP sequence
+$nop3 MACRO
+ db 090h
+ db 090h
+ db 090h
+ENDM
+
+
+
+;***
+;JIT_WriteBarrier* - GC write barrier helper
+;
+;Purpose:
+; Helper calls in order to assign an object to a field
+; Enables book-keeping of the GC.
+;
+;Entry:
+; EDX - address of ref-field (assigned to)
+; the resp. other reg - RHS of assignment
+;
+;Exit:
+;
+;Uses:
+; EDX is destroyed.
+;
+;Exceptions:
+;
+;*******************************************************************************
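+; A rough C sketch of the fast path generated by the macro below (illustrative
+; only - the function name is hypothetical, and the real helper is the assembly
+; that follows):
+;
+;   void CheckedWriteBarrier_sketch(Object** dst /*EDX*/, Object* ref /*the reg*/)
+;   {
+;       if ((BYTE*)dst < g_lowest_address || (BYTE*)dst >= g_highest_address) {
+;           *dst = ref;                        // not in the GC heap: plain store
+;           return;
+;       }
+;       *dst = ref;                            // store into the GC heap
+;       if ((BYTE*)ref >= g_ephemeral_low && (BYTE*)ref < g_ephemeral_high) {
+;           BYTE* card = g_card_table + ((size_t)dst >> 10);
+;           if (*card != 0xFF)
+;               *card = 0xFF;                  // mark the card as dirty
+;       }
+;   }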
+
+; The code here is tightly coupled with AdjustContextForWriteBarrier, if you change
+; anything here, you might need to change AdjustContextForWriteBarrier as well
+WriteBarrierHelper MACRO rg
+ ALIGN 4
+
+    ;; The entry point is the fully 'safe' one in which we check if EDX (the REF
+    ;; being updated) is actually in the GC heap
+
+PUBLIC _JIT_CheckedWriteBarrier&rg&@0
+_JIT_CheckedWriteBarrier&rg&@0 PROC
+    ;; check if the REF being updated is in the GC heap
+ cmp edx, g_lowest_address
+ jb WriteBarrier_NotInHeap_&rg
+ cmp edx, g_highest_address
+ jae WriteBarrier_NotInHeap_&rg
+
+ ;; fall through to unchecked routine
+ ;; note that its entry point also happens to be aligned
+
+ifdef WRITE_BARRIER_CHECK
+ ;; This entry point is used when you know the REF pointer being updated
+ ;; is in the GC heap
+PUBLIC _JIT_DebugWriteBarrier&rg&@0
+_JIT_DebugWriteBarrier&rg&@0:
+endif
+
+ifdef _DEBUG
+ push edx
+ push ecx
+ push eax
+
+ push rg
+ push edx
+ call WriteBarrierAssert
+
+ pop eax
+ pop ecx
+ pop edx
+endif ;_DEBUG
+
+ ; in the !WRITE_BARRIER_CHECK case this will be the move for all
+ ; addresses in the GCHeap, addresses outside the GCHeap will get
+ ; taken care of below at WriteBarrier_NotInHeap_&rg
+
+ifndef WRITE_BARRIER_CHECK
+ mov DWORD PTR [edx], rg
+endif
+
+ifdef WRITE_BARRIER_CHECK
+    ; Test dest here so that if it is bad, an AV happens before we change register/stack
+    ; state. This makes the job of AdjustContextForWriteBarrier easier.
+ cmp [edx], 0
+ ;; ALSO update the shadow GC heap if that is enabled
+ ; Make ebp into the temporary src register. We need to do this so that we can use ecx
+ ; in the calculation of the shadow GC address, but still have access to the src register
+ push ecx
+ push ebp
+ mov ebp, rg
+
+ ; if g_GCShadow is 0, don't perform the check
+ cmp g_GCShadow, 0
+ je WriteBarrier_NoShadow_&rg
+
+ mov ecx, edx
+ sub ecx, g_lowest_address ; U/V
+ jb WriteBarrier_NoShadow_&rg
+ add ecx, [g_GCShadow]
+ cmp ecx, [g_GCShadowEnd]
+ ja WriteBarrier_NoShadow_&rg
+
+ ; TODO: In Orcas timeframe if we move to P4+ only on X86 we should enable
+ ; mfence barriers on either side of these two writes to make sure that
+ ; they stay as close together as possible
+
+ ; edx contains address in GC
+ ; ecx contains address in ShadowGC
+    ; ebp temporarily becomes the src register
+
+ ;; When we're writing to the shadow GC heap we want to be careful to minimize
+ ;; the risk of a race that can occur here where the GC and ShadowGC don't match
+ mov DWORD PTR [edx], ebp
+ mov DWORD PTR [ecx], ebp
+
+ ;; We need a scratch register to verify the shadow heap. We also need to
+ ;; construct a memory barrier so that the write to the shadow heap happens
+ ;; before the read from the GC heap. We can do both by using SUB/XCHG
+ ;; rather than PUSH.
+ ;;
+ ;; TODO: Should be changed to a push if the mfence described above is added.
+ ;;
+ sub esp, 4
+ xchg [esp], eax
+
+ ;; As part of our race avoidance (see above) we will now check whether the values
+ ;; in the GC and ShadowGC match. There is a possibility that we're wrong here but
+ ;; being overaggressive means we might mask a case where someone updates GC refs
+    ;; without going to a write barrier, but by its nature it will be indeterminate
+    ;; and we will find real bugs whereas the current implementation is indeterminate
+ ;; but only leads to investigations that find that this code is fundamentally flawed
+ mov eax, [edx]
+ cmp [ecx], eax
+ je WriteBarrier_CleanupShadowCheck_&rg
+ mov [ecx], INVALIDGCVALUE
+
+WriteBarrier_CleanupShadowCheck_&rg:
+ pop eax
+
+ jmp WriteBarrier_ShadowCheckEnd_&rg
+
+WriteBarrier_NoShadow_&rg:
+ ; If we come here then we haven't written the value to the GC and need to.
+ ; ebp contains rg
+ ; We restore ebp/ecx immediately after this, and if either of them is the src
+ ; register it will regain its value as the src register.
+ mov DWORD PTR [edx], ebp
+WriteBarrier_ShadowCheckEnd_&rg:
+ pop ebp
+ pop ecx
+endif
+ cmp rg, g_ephemeral_low
+ jb WriteBarrier_NotInEphemeral_&rg
+ cmp rg, g_ephemeral_high
+ jae WriteBarrier_NotInEphemeral_&rg
+
+ shr edx, 10
+ add edx, [g_card_table]
+ cmp BYTE PTR [edx], 0FFh
+ jne WriteBarrier_UpdateCardTable_&rg
+ ret
+
+WriteBarrier_UpdateCardTable_&rg:
+ mov BYTE PTR [edx], 0FFh
+ ret
+
+WriteBarrier_NotInHeap_&rg:
+ ; If it wasn't in the heap then we haven't updated the dst in memory yet
+ mov DWORD PTR [edx], rg
+WriteBarrier_NotInEphemeral_&rg:
+ ; If it is in the GC Heap but isn't in the ephemeral range we've already
+ ; updated the Heap with the Object*.
+ ret
+_JIT_CheckedWriteBarrier&rg&@0 ENDP
+
+ENDM
+
+
+;***
+;JIT_ByRefWriteBarrier* - GC write barrier helper
+;
+;Purpose:
+; Helper calls in order to assign an object to a byref field
+; Enables book-keeping of the GC.
+;
+;Entry:
+; EDI - address of ref-field (assigned to)
+; ESI - address of the data (source)
+; ECX can be trashed
+;
+;Exit:
+;
+;Uses:
+; EDI and ESI are incremented by a DWORD
+;
+;Exceptions:
+;
+;*******************************************************************************
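+; Rough C sketch of the fast path below (illustrative only; note that the helper
+; advances both pointers by one slot on every path):
+;
+;   void ByRefWriteBarrier_sketch(Object*** pDst /*EDI*/, Object*** pSrc /*ESI*/)
+;   {
+;       Object* ref = **pSrc;
+;       Object** dst = *pDst;
+;       *dst = ref;
+;       if ((BYTE*)dst >= g_lowest_address && (BYTE*)dst < g_highest_address &&
+;           (BYTE*)ref >= g_ephemeral_low  && (BYTE*)ref < g_ephemeral_high) {
+;           BYTE* card = g_card_table + ((size_t)dst >> 10);
+;           if (*card != 0xFF)
+;               *card = 0xFF;
+;       }
+;       ++*pSrc;                               // ESI += 4
+;       ++*pDst;                               // EDI += 4
+;   }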
+
+; The code here is tightly coupled with AdjustContextForWriteBarrier, if you change
+; anything here, you might need to change AdjustContextForWriteBarrier as well
+
+ByRefWriteBarrierHelper MACRO
+ ALIGN 4
+PUBLIC _JIT_ByRefWriteBarrier@0
+_JIT_ByRefWriteBarrier@0 PROC
+ ;;test for dest in range
+ mov ecx, [esi]
+ cmp edi, g_lowest_address
+ jb ByRefWriteBarrier_NotInHeap
+ cmp edi, g_highest_address
+ jae ByRefWriteBarrier_NotInHeap
+
+ifndef WRITE_BARRIER_CHECK
+ ;;write barrier
+ mov [edi],ecx
+endif
+
+ifdef WRITE_BARRIER_CHECK
+    ; Test dest here so that if it is bad, an AV happens before we change register/stack
+    ; state. This makes the job of AdjustContextForWriteBarrier easier.
+ cmp [edi], 0
+
+ ;; ALSO update the shadow GC heap if that is enabled
+
+ ; use edx for address in GC Shadow,
+ push edx
+
+ ;if g_GCShadow is 0, don't do the update
+ cmp g_GCShadow, 0
+ je ByRefWriteBarrier_NoShadow
+
+ mov edx, edi
+ sub edx, g_lowest_address ; U/V
+ jb ByRefWriteBarrier_NoShadow
+ add edx, [g_GCShadow]
+ cmp edx, [g_GCShadowEnd]
+ ja ByRefWriteBarrier_NoShadow
+
+ ; TODO: In Orcas timeframe if we move to P4+ only on X86 we should enable
+ ; mfence barriers on either side of these two writes to make sure that
+ ; they stay as close together as possible
+
+ ; edi contains address in GC
+ ; edx contains address in ShadowGC
+ ; ecx is the value to assign
+
+ ;; When we're writing to the shadow GC heap we want to be careful to minimize
+ ;; the risk of a race that can occur here where the GC and ShadowGC don't match
+ mov DWORD PTR [edi], ecx
+ mov DWORD PTR [edx], ecx
+
+ ;; We need a scratch register to verify the shadow heap. We also need to
+ ;; construct a memory barrier so that the write to the shadow heap happens
+ ;; before the read from the GC heap. We can do both by using SUB/XCHG
+ ;; rather than PUSH.
+ ;;
+ ;; TODO: Should be changed to a push if the mfence described above is added.
+ ;;
+ sub esp, 4
+ xchg [esp], eax
+
+ ;; As part of our race avoidance (see above) we will now check whether the values
+ ;; in the GC and ShadowGC match. There is a possibility that we're wrong here but
+ ;; being overaggressive means we might mask a case where someone updates GC refs
+    ;; without going to a write barrier, but by its nature it will be indeterminate
+    ;; and we will find real bugs whereas the current implementation is indeterminate
+ ;; but only leads to investigations that find that this code is fundamentally flawed
+
+ mov eax, [edi]
+ cmp [edx], eax
+ je ByRefWriteBarrier_CleanupShadowCheck
+ mov [edx], INVALIDGCVALUE
+ByRefWriteBarrier_CleanupShadowCheck:
+ pop eax
+ jmp ByRefWriteBarrier_ShadowCheckEnd
+
+ByRefWriteBarrier_NoShadow:
+ ; If we come here then we haven't written the value to the GC and need to.
+ mov DWORD PTR [edi], ecx
+
+ByRefWriteBarrier_ShadowCheckEnd:
+ pop edx
+endif
+    ;;test for *src in ephemeral segment
+ cmp ecx, g_ephemeral_low
+ jb ByRefWriteBarrier_NotInEphemeral
+ cmp ecx, g_ephemeral_high
+ jae ByRefWriteBarrier_NotInEphemeral
+
+ mov ecx, edi
+ add esi,4
+ add edi,4
+
+ shr ecx, 10
+ add ecx, [g_card_table]
+ cmp byte ptr [ecx], 0FFh
+ jne ByRefWriteBarrier_UpdateCardTable
+ ret
+ByRefWriteBarrier_UpdateCardTable:
+ mov byte ptr [ecx], 0FFh
+ ret
+
+ByRefWriteBarrier_NotInHeap:
+ ; If it wasn't in the heap then we haven't updated the dst in memory yet
+ mov [edi],ecx
+ByRefWriteBarrier_NotInEphemeral:
+ ; If it is in the GC Heap but isn't in the ephemeral range we've already
+ ; updated the Heap with the Object*.
+ add esi,4
+ add edi,4
+ ret
+_JIT_ByRefWriteBarrier@0 ENDP
+ENDM
+
+;*******************************************************************************
+; Write barrier wrappers with fcall calling convention
+;
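+; The fastcall entry points receive the destination address in ECX and the
+; reference being stored in EDX; the macro below shuffles them into the
+; convention used by the register-specific helpers (EDX = destination, value in
+; EAX) and tail-jumps to the corresponding *EAX helper.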
+UniversalWriteBarrierHelper MACRO name
+ ALIGN 4
+PUBLIC @JIT_&name&@8
+@JIT_&name&@8 PROC
+ mov eax,edx
+ mov edx,ecx
+ jmp _JIT_&name&EAX@0
+@JIT_&name&@8 ENDP
+ENDM
+
+; WriteBarrierStart and WriteBarrierEnd are used to determine bounds of
+; WriteBarrier functions so we can determine if we got an AV in them.
+;
+PUBLIC _JIT_WriteBarrierStart@0
+_JIT_WriteBarrierStart@0 PROC
+ret
+_JIT_WriteBarrierStart@0 ENDP
+
+ifdef FEATURE_USE_ASM_GC_WRITE_BARRIERS
+; Only define these if we're using the ASM GC write barriers; if this flag is not defined,
+; we'll use C++ versions of these write barriers.
+UniversalWriteBarrierHelper <CheckedWriteBarrier>
+UniversalWriteBarrierHelper <WriteBarrier>
+endif
+
+WriteBarrierHelper <EAX>
+WriteBarrierHelper <EBX>
+WriteBarrierHelper <ECX>
+WriteBarrierHelper <ESI>
+WriteBarrierHelper <EDI>
+WriteBarrierHelper <EBP>
+
+ByRefWriteBarrierHelper
+
+PUBLIC _JIT_WriteBarrierLast@0
+_JIT_WriteBarrierLast@0 PROC
+ret
+_JIT_WriteBarrierLast@0 ENDP
+
+; This is the first function outside the "keep together range". Used by BBT scripts.
+PUBLIC _JIT_WriteBarrierEnd@0
+_JIT_WriteBarrierEnd@0 PROC
+ret
+_JIT_WriteBarrierEnd@0 ENDP
+
+;*********************************************************************/
+; In cases where we support it we have an optimized GC Poll callback. Normally (when we're not trying to
+; suspend for GC), the CORINFO_HELP_POLL_GC helper points to this nop routine. When we're ready to suspend
+; for GC, we whack the Jit Helper table entry to point to the real helper. When we're done with GC we
+; whack it back.
+PUBLIC @JIT_PollGC_Nop@0
+@JIT_PollGC_Nop@0 PROC
+ret
+@JIT_PollGC_Nop@0 ENDP
+
+;*********************************************************************/
+;llshl - long shift left
+;
+;Purpose:
+; Does a Long Shift Left (signed and unsigned are identical)
+; Shifts a long left any number of bits.
+;
+; NOTE: This routine has been adapted from the Microsoft CRTs.
+;
+;Entry:
+; EDX:EAX - long value to be shifted
+; ECX - number of bits to shift by
+;
+;Exit:
+; EDX:EAX - shifted value
+;
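+; Equivalent C sketch (shift count is expected to be in the range 0..63):
+;
+;   UINT64 LLsh_sketch(UINT64 value /*EDX:EAX*/, UINT32 count /*ECX*/)
+;   {
+;       if (count < 32)
+;           return value << count;                             // SHLD/SHL pair
+;       return (UINT64)((UINT32)value << (count & 31)) << 32;  // low word becomes the high word
+;   }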
+ ALIGN 16
+PUBLIC JIT_LLsh
+JIT_LLsh PROC
+; Handle shifts of between bits 0 and 31
+ cmp ecx, 32
+ jae short LLshMORE32
+ shld edx,eax,cl
+ shl eax,cl
+ ret
+; Handle shifts of between bits 32 and 63
+LLshMORE32:
+ ; The x86 shift instructions only use the lower 5 bits.
+ mov edx,eax
+ xor eax,eax
+ shl edx,cl
+ ret
+JIT_LLsh ENDP
+
+
+;*********************************************************************/
+;LRsh - long shift right
+;
+;Purpose:
+; Does a signed Long Shift Right
+; Shifts a long right any number of bits.
+;
+; NOTE: This routine has been adapted from the Microsoft CRTs.
+;
+;Entry:
+; EDX:EAX - long value to be shifted
+; ECX - number of bits to shift by
+;
+;Exit:
+; EDX:EAX - shifted value
+;
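+; Equivalent C sketch (arithmetic shift, count expected in 0..63, assuming the
+; usual arithmetic behaviour of >> on signed values):
+;
+;   INT64 LRsh_sketch(INT64 value /*EDX:EAX*/, UINT32 count /*ECX*/)
+;   {
+;       if (count < 32)
+;           return value >> count;                               // SHRD/SAR pair
+;       return (INT64)(INT32)(value >> 32) >> (count & 31);      // high half filled with the sign
+;   }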
+ ALIGN 16
+PUBLIC JIT_LRsh
+JIT_LRsh PROC
+; Handle shifts of between bits 0 and 31
+ cmp ecx, 32
+ jae short LRshMORE32
+ shrd eax,edx,cl
+ sar edx,cl
+ ret
+; Handle shifts of between bits 32 and 63
+LRshMORE32:
+ ; The x86 shift instructions only use the lower 5 bits.
+ mov eax,edx
+ sar edx, 31
+ sar eax,cl
+ ret
+JIT_LRsh ENDP
+
+
+;*********************************************************************/
+; LRsz:
+;Purpose:
+; Does an unsigned Long Shift Right
+; Shifts a long right any number of bits.
+;
+; NOTE: This routine has been adapted from the Microsoft CRTs.
+;
+;Entry:
+; EDX:EAX - long value to be shifted
+; ECX - number of bits to shift by
+;
+;Exit:
+; EDX:EAX - shifted value
+;
+ ALIGN 16
+PUBLIC JIT_LRsz
+JIT_LRsz PROC
+; Handle shifts of between bits 0 and 31
+ cmp ecx, 32
+ jae short LRszMORE32
+ shrd eax,edx,cl
+ shr edx,cl
+ ret
+; Handle shifts of between bits 32 and 63
+LRszMORE32:
+ ; The x86 shift instructions only use the lower 5 bits.
+ mov eax,edx
+ xor edx,edx
+ shr eax,cl
+ ret
+JIT_LRsz ENDP
+
+;*********************************************************************/
+; LMul:
+;Purpose:
+; Does a long multiply (same for signed/unsigned)
+;
+; NOTE: This routine has been adapted from the Microsoft CRTs.
+;
+;Entry:
+; Parameters are passed on the stack:
+; 1st pushed: multiplier (QWORD)
+; 2nd pushed: multiplicand (QWORD)
+;
+;Exit:
+; EDX:EAX - product of multiplier and multiplicand
+;
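+; Equivalent C sketch (the truncated 64-bit product is the same for signed and
+; unsigned operands):
+;
+;   UINT64 LMul_sketch(UINT64 a, UINT64 b)
+;   {
+;       UINT32 alo = (UINT32)a, ahi = (UINT32)(a >> 32);
+;       UINT32 blo = (UINT32)b, bhi = (UINT32)(b >> 32);
+;       UINT64 result = (UINT64)alo * blo;                 // full 32x32 -> 64 product
+;       result += (UINT64)(alo * bhi + ahi * blo) << 32;   // cross terms land in the high dword
+;       return result;
+;   }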
+ ALIGN 16
+PUBLIC JIT_LMul
+JIT_LMul PROC
+
+; AHI, BHI : upper 32 bits of A and B
+; ALO, BLO : lower 32 bits of A and B
+;
+; ALO * BLO
+; ALO * BHI
+; + BLO * AHI
+; ---------------------
+
+ mov eax,[esp + 8] ; AHI
+ mov ecx,[esp + 16] ; BHI
+ or ecx,eax ;test for both hiwords zero.
+ mov ecx,[esp + 12] ; BLO
+    jnz     LMul_hard       ;at least one hiword is non-zero - take the long path
+
+    mov     eax,[esp + 4]   ; ALO (both hiwords are zero, just mult ALO and BLO)
+ mul ecx
+
+ ret 16 ; callee restores the stack
+
+LMul_hard:
+ push ebx
+
+ mul ecx ;eax has AHI, ecx has BLO, so AHI * BLO
+ mov ebx,eax ;save result
+
+ mov eax,[esp + 8] ; ALO
+ mul dword ptr [esp + 20] ;ALO * BHI
+ add ebx,eax ;ebx = ((ALO * BHI) + (AHI * BLO))
+
+ mov eax,[esp + 8] ; ALO ;ecx = BLO
+ mul ecx ;so edx:eax = ALO*BLO
+ add edx,ebx ;now edx has all the LO*HI stuff
+
+ pop ebx
+
+ ret 16 ; callee restores the stack
+
+JIT_LMul ENDP
+
+;*********************************************************************/
+; JIT_Dbl2LngOvf
+
+;Purpose:
+; converts a double to a long truncating toward zero (C semantics)
+; with check for overflow
+;
+; uses stdcall calling conventions
+;
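+; Rough C sketch of the overflow check (the fenv calls and the throw helper are
+; hypothetical stand-ins for the x87 status-word test and JIT_InternalThrowFromHelper
+; used below):
+;
+;   #include <fenv.h>
+;   INT64 Dbl2LngOvf_sketch(double d)
+;   {
+;       feclearexcept(FE_ALL_EXCEPT);          // fnclex
+;       INT64 result = Dbl2Lng_sketch(d);      // see JIT_Dbl2Lng below
+;       if (fetestexcept(FE_INVALID))          // IE bit set => value out of range
+;           ThrowOverflowException();          // hypothetical throw helper
+;       return result;
+;   }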
+PUBLIC JIT_Dbl2LngOvf
+JIT_Dbl2LngOvf PROC
+ fnclex
+ fld qword ptr [esp+4]
+ push ecx
+ push ecx
+ fstp qword ptr [esp]
+ call JIT_Dbl2Lng
+ mov ecx,eax
+ fnstsw ax
+ test ax,01h
+ jnz Dbl2LngOvf_throw
+ mov eax,ecx
+ ret 8
+
+Dbl2LngOvf_throw:
+ mov ECX, CORINFO_OverflowException_ASM
+ call JIT_InternalThrowFromHelper
+ ret 8
+JIT_Dbl2LngOvf ENDP
+
+;*********************************************************************/
+; JIT_Dbl2Lng
+
+;Purpose:
+; converts a double to a long truncating toward zero (C semantics)
+;
+; uses stdcall calling conventions
+;
+; note that changing the rounding mode is very expensive. This
+; routine basically does the truncation semantics without changing
+; the rounding mode, resulting in a win.
+;
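+; Rough C sketch of the trick (assumes the FPU is left in its default
+; round-to-nearest mode, which is what the code below relies on; the asm actually
+; tests the sign of the rounded integer and special-cases zero, but the effect is
+; the same):
+;
+;   #include <math.h>
+;   INT64 Dbl2Lng_sketch(double d)
+;   {
+;       INT64 i = llrint(d);                   // FISTP: rounds with the current mode
+;       double diff = d - (double)i;
+;       if (d < 0 && diff > 0) i++;            // rounded below a negative value - fix up
+;       if (d > 0 && diff < 0) i--;            // rounded above a positive value - fix up
+;       return i;                              // == truncation toward zero
+;   }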
+PUBLIC JIT_Dbl2Lng
+JIT_Dbl2Lng PROC
+ fld qword ptr[ESP+4] ; fetch arg
+ lea ecx,[esp-8]
+ sub esp,16 ; allocate frame
+ and ecx,-8 ; align pointer on boundary of 8
+    fld     st(0)           ; duplicate top of stack
+ fistp qword ptr[ecx] ; leave arg on stack, also save in temp
+ fild qword ptr[ecx] ; arg, round(arg) now on stack
+ mov edx,[ecx+4] ; high dword of integer
+ mov eax,[ecx] ; low dword of integer
+ test eax,eax
+ je integer_QNaN_or_zero
+
+arg_is_not_integer_QNaN:
+ fsubp st(1),st ; TOS=d-round(d),
+ ; { st(1)=st(1)-st & pop ST }
+ test edx,edx ; what's sign of integer
+ jns positive
+ ; number is negative
+ ; dead cycle
+ ; dead cycle
+ fstp dword ptr[ecx] ; result of subtraction
+ mov ecx,[ecx] ; dword of difference(single precision)
+ add esp,16
+ xor ecx,80000000h
+ add ecx,7fffffffh ; if difference>0 then increment integer
+ adc eax,0 ; inc eax (add CARRY flag)
+ adc edx,0 ; propagate carry flag to upper bits
+ ret 8
+
+positive:
+ fstp dword ptr[ecx] ;17-18 ; result of subtraction
+ mov ecx,[ecx] ; dword of difference (single precision)
+ add esp,16
+ add ecx,7fffffffh ; if difference<0 then decrement integer
+ sbb eax,0 ; dec eax (subtract CARRY flag)
+ sbb edx,0 ; propagate carry flag to upper bits
+ ret 8
+
+integer_QNaN_or_zero:
+ test edx,7fffffffh
+ jnz arg_is_not_integer_QNaN
+ fstp st(0) ;; pop round(arg)
+ fstp st(0) ;; arg
+ add esp,16
+ ret 8
+JIT_Dbl2Lng ENDP
+
+;*********************************************************************/
+; JIT_Dbl2LngP4x87
+
+;Purpose:
+; converts a double to a long truncating toward zero (C semantics)
+;
+; uses stdcall calling conventions
+;
+; This code is faster on a P4 than the Dbl2Lng code above, but is
+; slower on a PIII. Hence we choose this code when on a P4 or above.
+;
+PUBLIC JIT_Dbl2LngP4x87
+JIT_Dbl2LngP4x87 PROC
+arg1 equ <[esp+0Ch]>
+
+ sub esp, 8 ; get some local space
+
+ fld qword ptr arg1 ; fetch arg
+ fnstcw word ptr arg1 ; store FPCW
+ movzx eax, word ptr arg1 ; zero extend - wide
+    or      ah, 0Ch             ; set rounding control to 11b (truncate toward zero)
+ mov dword ptr [esp], eax ; store new FPCW bits
+ fldcw word ptr [esp] ; reload FPCW with new bits
+ fistp qword ptr [esp] ; convert
+ mov eax, dword ptr [esp] ; reload FP result
+ mov edx, dword ptr [esp+4] ;
+ fldcw word ptr arg1 ; reload original FPCW value
+
+ add esp, 8 ; restore stack
+
+ ret 8
+JIT_Dbl2LngP4x87 ENDP
+
+;*********************************************************************/
+; JIT_Dbl2LngSSE3
+
+;Purpose:
+; converts a double to a long truncating toward zero (C semantics)
+;
+; uses stdcall calling conventions
+;
+; This code is faster than the above P4 x87 code for Intel processors
+; equal or later than Core2 and Atom that have SSE3 support
+;
+.686P
+.XMM
+PUBLIC JIT_Dbl2LngSSE3
+JIT_Dbl2LngSSE3 PROC
+arg1 equ <[esp+0Ch]>
+
+ sub esp, 8 ; get some local space
+
+ fld qword ptr arg1 ; fetch arg
+ fisttp qword ptr [esp] ; convert
+ mov eax, dword ptr [esp] ; reload FP result
+ mov edx, dword ptr [esp+4]
+
+ add esp, 8 ; restore stack
+
+ ret 8
+JIT_Dbl2LngSSE3 ENDP
+.586
+
+;*********************************************************************/
+; JIT_Dbl2IntSSE2
+
+;Purpose:
+; converts a double to a long truncating toward zero (C semantics)
+;
+; uses stdcall calling conventions
+;
+; This code is even faster than the P4 x87 code in JIT_Dbl2LngP4x87 above,
+; but only returns a 32-bit value (only good for int).
+;
+.686P
+.XMM
+PUBLIC JIT_Dbl2IntSSE2
+JIT_Dbl2IntSSE2 PROC
+ $movsd xmm0, [esp+4]
+ cvttsd2si eax, xmm0
+ ret 8
+JIT_Dbl2IntSSE2 ENDP
+.586
+
+
+;*********************************************************************/
+; This is the small write barrier thunk we use when we know the
+; ephemeral generation is higher in memory than older generations.
+; The 0x0F0F0F0F values are bashed by the two functions above.
+; This is the generic version - wherever the code says ECX,
+; the specific register is patched later into a copy
+; Note: do not replace ECX by EAX - there is a smaller encoding for
+; the compares just for EAX, which won't work for other registers.
+;
+; READ THIS!!!!!!
+; it is imperative that the addresses of the values that we overwrite
+; (card table, ephemeral region ranges, etc) are naturally aligned since
+; there are codepaths that will overwrite these values while the EE is running.
+;
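+; Rough C sketch of the patched-up fast path (the 0F0F0F0F0h constants below are
+; placeholders that get overwritten with the real ephemeral bound and card table
+; address; only the lower ephemeral bound is needed in the pre-grow case):
+;
+;   void WriteBarrierReg_PreGrow_sketch(Object** dst /*EDX*/, Object* ref /*ECX*/)
+;   {
+;       *dst = ref;
+;       if ((BYTE*)ref < g_ephemeral_low)                      // patched constant
+;           return;
+;       BYTE* card = g_card_table + ((size_t)dst >> 10);       // patched constant
+;       if (*card != 0xFF)
+;           *card = 0xFF;
+;   }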
+PUBLIC JIT_WriteBarrierReg_PreGrow
+JIT_WriteBarrierReg_PreGrow PROC
+ mov DWORD PTR [edx], ecx
+ cmp ecx, 0F0F0F0F0h
+ jb NoWriteBarrierPre
+
+ shr edx, 10
+ nop ; padding for alignment of constant
+ cmp byte ptr [edx+0F0F0F0F0h], 0FFh
+ jne WriteBarrierPre
+NoWriteBarrierPre:
+ ret
+ nop ; padding for alignment of constant
+ nop ; padding for alignment of constant
+WriteBarrierPre:
+ mov byte ptr [edx+0F0F0F0F0h], 0FFh
+ ret
+JIT_WriteBarrierReg_PreGrow ENDP
+
+;*********************************************************************/
+; This is the larger write barrier thunk we use when we know that older
+; generations may be higher in memory than the ephemeral generation
+; The 0x0F0F0F0F values are bashed by the two functions above.
+; This is the generic version - wherever the code says ECX,
+; the specific register is patched later into a copy
+; Note: do not replace ECX by EAX - there is a smaller encoding for
+; the compares just for EAX, which won't work for other registers.
+; NOTE: we need this aligned for our validation to work properly
+ ALIGN 4
+PUBLIC JIT_WriteBarrierReg_PostGrow
+JIT_WriteBarrierReg_PostGrow PROC
+ mov DWORD PTR [edx], ecx
+    cmp     ecx, 0F0F0F0F0h         ; patched with the lower ephemeral bound
+    jb      NoWriteBarrierPost
+    cmp     ecx, 0F0F0F0F0h         ; patched with the upper ephemeral bound
+    jae     NoWriteBarrierPost
+
+ shr edx, 10
+ nop ; padding for alignment of constant
+ cmp byte ptr [edx+0F0F0F0F0h], 0FFh
+ jne WriteBarrierPost
+NoWriteBarrierPost:
+ ret
+ nop ; padding for alignment of constant
+ nop ; padding for alignment of constant
+WriteBarrierPost:
+ mov byte ptr [edx+0F0F0F0F0h], 0FFh
+ ret
+JIT_WriteBarrierReg_PostGrow ENDP
+
+;*********************************************************************/
+;
+
+ ; a fake virtual stub dispatch register indirect callsite
+ $nop3
+ call dword ptr [eax]
+
+
+PUBLIC JIT_TailCallReturnFromVSD
+JIT_TailCallReturnFromVSD:
+ifdef _DEBUG
+ nop ; blessed callsite
+endif
+ call VSDHelperLabel ; keep call-ret count balanced.
+VSDHelperLabel:
+
+; Stack at this point :
+; ...
+; m_ReturnAddress
+; m_regs
+; m_CallerAddress
+; m_pThread
+; vtbl
+; GSCookie
+; &VSDHelperLabel
+OffsetOfTailCallFrame = 8
+
+; ebx = pThread
+
+ifdef _DEBUG
+ mov esi, _s_gsCookie ; GetProcessGSCookie()
+ cmp dword ptr [esp+OffsetOfTailCallFrame-SIZEOF_GSCookie], esi
+ je TailCallFrameGSCookieIsValid
+ call @JIT_FailFast@0
+ TailCallFrameGSCookieIsValid:
+endif
+ ; remove the padding frame from the chain
+ mov esi, dword ptr [esp+OffsetOfTailCallFrame+4] ; esi = TailCallFrame::m_Next
+ mov dword ptr [ebx + Thread_m_pFrame], esi
+
+ ; skip the frame
+ add esp, 20 ; &VSDHelperLabel, GSCookie, vtbl, m_Next, m_CallerAddress
+
+ pop edi ; restore callee saved registers
+ pop esi
+ pop ebx
+ pop ebp
+
+ ret ; return to m_ReturnAddress
+
+;------------------------------------------------------------------------------
+;
+
+PUBLIC JIT_TailCall
+JIT_TailCall PROC
+
+; the stack layout at this point is:
+;
+; ebp+8+4*nOldStackArgs <- end of argument destination
+; ... ...
+; ebp+8+ old args (size is nOldStackArgs)
+; ... ...
+; ebp+8 <- start of argument destination
+; ebp+4 ret addr
+; ebp+0 saved ebp
+; ebp-c saved ebx, esi, edi (if have callee saved regs = 1)
+;
+; other stuff (local vars) in the jitted callers' frame
+;
+; esp+20+4*nNewStackArgs <- end of argument source
+; ... ...
+; esp+20+ new args (size is nNewStackArgs) to be passed to the target of the tail-call
+; ... ...
+; esp+20 <- start of argument source
+; esp+16 nOldStackArgs
+; esp+12 nNewStackArgs
+; esp+8 flags (1 = have callee saved regs, 2 = virtual stub dispatch)
+; esp+4 target addr
+; esp+0 retaddr
+;
+; If you change this function, make sure you update code:TailCallStubManager as well.
+
+RetAddr equ 0
+TargetAddr equ 4
+nNewStackArgs equ 12
+nOldStackArgs equ 16
+NewArgs equ 20
+
+; extra space is incremented as we push things on the stack along the way
+ExtraSpace = 0
+
+ call _GetThread@0; eax = Thread*
+ push eax ; Thread*
+
+ ; save ArgumentRegisters
+ push ecx
+ push edx
+
+ExtraSpace = 12 ; pThread, ecx, edx
+
+ifdef FEATURE_HIJACK
+    ; If the EE has hijacked (patched) the return address, undo that first so we can move it around.
+ test dword ptr [eax+Thread_m_State], TS_Hijacked_ASM
+ jz NoHijack
+
+ ; JIT_TailCallHelper(Thread *)
+ push eax
+ call JIT_TailCallHelper ; this is __stdcall
+
+NoHijack:
+endif
+
+ mov edx, dword ptr [esp+ExtraSpace+JIT_TailCall_StackOffsetToFlags] ; edx = flags
+
+ mov eax, dword ptr [esp+ExtraSpace+nOldStackArgs] ; eax = nOldStackArgs
+ mov ecx, dword ptr [esp+ExtraSpace+nNewStackArgs] ; ecx = nNewStackArgs
+
+ ; restore callee saved registers
+    ; <TODO>@TODO : esp based - doesn't work with localloc</TODO>
+ test edx, 1
+ jz NoCalleeSaveRegisters
+
+ mov edi, dword ptr [ebp-4] ; restore edi
+ mov esi, dword ptr [ebp-8] ; restore esi
+ mov ebx, dword ptr [ebp-12] ; restore ebx
+
+NoCalleeSaveRegisters:
+
+ push dword ptr [ebp+4] ; save the original return address for later
+ push edi
+ push esi
+
+ExtraSpace = 24 ; pThread, ecx, edx, orig retaddr, edi, esi
+CallersEsi = 0
+CallersEdi = 4
+OrigRetAddr = 8
+pThread = 20
+
+ lea edi, [ebp+8+4*eax] ; edi = the end of argument destination
+ lea esi, [esp+ExtraSpace+NewArgs+4*ecx] ; esi = the end of argument source
+
+ mov ebp, dword ptr [ebp] ; restore ebp (do not use ebp as scratch register to get a good stack trace in debugger)
+
+ test edx, 2
+ jnz VSDTailCall
+
+ ; copy the arguments to the final destination
+ test ecx, ecx
+ jz ArgumentsCopied
+ArgumentCopyLoop:
+ ; At this point, this is the value of the registers :
+ ; edi = end of argument dest
+ ; esi = end of argument source
+ ; ecx = nNewStackArgs
+ mov eax, dword ptr [esi-4]
+ sub edi, 4
+ sub esi, 4
+ mov dword ptr [edi], eax
+ dec ecx
+ jnz ArgumentCopyLoop
+ArgumentsCopied:
+
+ ; edi = the start of argument destination
+
+ mov eax, dword ptr [esp+4+4] ; return address
+ mov ecx, dword ptr [esp+ExtraSpace+TargetAddr] ; target address
+
+ mov dword ptr [edi-4], eax ; return address
+ mov dword ptr [edi-8], ecx ; target address
+
+ lea eax, [edi-8] ; new value for esp
+
+ pop esi
+ pop edi
+ pop ecx ; skip original return address
+ pop edx
+ pop ecx
+
+ mov esp, eax
+
+PUBLIC JIT_TailCallLeave ; add a label here so that TailCallStubManager can access it
+JIT_TailCallLeave:
+ retn ; Will branch to targetAddr. This matches the
+ ; "call" done by JITted code, keeping the
+ ; call-ret count balanced.
+
+ ;----------------------------------------------------------------------
+VSDTailCall:
+ ;----------------------------------------------------------------------
+
+ ; For the Virtual Stub Dispatch, we create a fake callsite to fool
+ ; the callsite probes. In order to create the call site, we need to insert TailCallFrame
+ ; if we do not have one already.
+ ;
+ ; ecx = nNewStackArgs
+ ; esi = the end of argument source
+ ; edi = the end of argument destination
+ ;
+ ; The stub has pushed the following onto the stack at this point :
+ ; pThread, ecx, edx, orig retaddr, edi, esi
+
+
+ cmp dword ptr [esp+OrigRetAddr], JIT_TailCallReturnFromVSD
+    jz VSDTailCallFrameInserted_DoSlideUpArgs ; There is an existing TailCallFrame that can be reused
+
+ ; try to allocate space for the frame / check whether there is enough space
+ ; If there is sufficient space, we will setup the frame and then slide
+ ; the arguments up the stack. Else, we first need to slide the arguments
+ ; down the stack to make space for the TailCallFrame
+ sub edi, (SIZEOF_GSCookie + SIZEOF_TailCallFrame)
+ cmp edi, esi
+ jae VSDSpaceForFrameChecked
+
+ ; There is not sufficient space to wedge in the TailCallFrame without
+ ; overwriting the new arguments.
+ ; We need to allocate the extra space on the stack,
+ ; and slide down the new arguments
+
+ mov eax, esi
+ sub eax, edi
+ sub esp, eax
+
+ mov eax, ecx ; to subtract the size of arguments
+ mov edx, ecx ; for counter
+
+ neg eax
+
+ ; copy down the arguments to the final destination, need to copy all temporary storage as well
+ add edx, (ExtraSpace+NewArgs)/4
+
+ lea esi, [esi+4*eax-(ExtraSpace+NewArgs)]
+ lea edi, [edi+4*eax-(ExtraSpace+NewArgs)]
+
+VSDAllocFrameCopyLoop:
+ mov eax, dword ptr [esi]
+ mov dword ptr [edi], eax
+ add esi, 4
+ add edi, 4
+ dec edx
+ jnz VSDAllocFrameCopyLoop
+
+    ; the argument source and destination are the same now
+ mov esi, edi
+
+VSDSpaceForFrameChecked:
+
+ ; At this point, we have enough space on the stack for the TailCallFrame,
+    ; and we may already have slid down the arguments
+
+ mov eax, _s_gsCookie ; GetProcessGSCookie()
+ mov dword ptr [edi], eax ; set GSCookie
+ mov eax, _g_TailCallFrameVptr ; vptr
+ mov edx, dword ptr [esp+OrigRetAddr] ; orig return address
+ mov dword ptr [edi+SIZEOF_GSCookie], eax ; TailCallFrame::vptr
+ mov dword ptr [edi+SIZEOF_GSCookie+28], edx ; TailCallFrame::m_ReturnAddress
+
+ mov eax, dword ptr [esp+CallersEdi] ; restored edi
+ mov edx, dword ptr [esp+CallersEsi] ; restored esi
+ mov dword ptr [edi+SIZEOF_GSCookie+12], eax ; TailCallFrame::m_regs::edi
+ mov dword ptr [edi+SIZEOF_GSCookie+16], edx ; TailCallFrame::m_regs::esi
+ mov dword ptr [edi+SIZEOF_GSCookie+20], ebx ; TailCallFrame::m_regs::ebx
+ mov dword ptr [edi+SIZEOF_GSCookie+24], ebp ; TailCallFrame::m_regs::ebp
+
+ mov ebx, dword ptr [esp+pThread] ; ebx = pThread
+
+ mov eax, dword ptr [ebx+Thread_m_pFrame]
+ lea edx, [edi+SIZEOF_GSCookie]
+ mov dword ptr [edi+SIZEOF_GSCookie+4], eax ; TailCallFrame::m_pNext
+ mov dword ptr [ebx+Thread_m_pFrame], edx ; hook the new frame into the chain
+
+ ; setup ebp chain
+ lea ebp, [edi+SIZEOF_GSCookie+24] ; TailCallFrame::m_regs::ebp
+
+ ; Do not copy arguments again if they are in place already
+ ; Otherwise, we will need to slide the new arguments up the stack
+ cmp esi, edi
+ jne VSDTailCallFrameInserted_DoSlideUpArgs
+
+    ; At this point, we must have already slid down the new arguments,
+ ; or the TailCallFrame is a perfect fit
+ ; set the caller address
+ mov edx, dword ptr [esp+ExtraSpace+RetAddr] ; caller address
+ mov dword ptr [edi+SIZEOF_GSCookie+8], edx ; TailCallFrame::m_CallerAddress
+
+ ; adjust edi as it would by copying
+ neg ecx
+ lea edi, [edi+4*ecx]
+
+ jmp VSDArgumentsCopied
+
+VSDTailCallFrameInserted_DoSlideUpArgs:
+ ; set the caller address
+ mov edx, dword ptr [esp+ExtraSpace+RetAddr] ; caller address
+ mov dword ptr [edi+SIZEOF_GSCookie+8], edx ; TailCallFrame::m_CallerAddress
+
+ ; copy the arguments to the final destination
+ test ecx, ecx
+ jz VSDArgumentsCopied
+VSDArgumentCopyLoop:
+ mov eax, dword ptr [esi-4]
+ sub edi, 4
+ sub esi, 4
+ mov dword ptr [edi], eax
+ dec ecx
+ jnz VSDArgumentCopyLoop
+VSDArgumentsCopied:
+
+ ; edi = the start of argument destination
+
+ mov ecx, dword ptr [esp+ExtraSpace+TargetAddr] ; target address
+
+ mov dword ptr [edi-4], JIT_TailCallReturnFromVSD ; return address
+ mov dword ptr [edi-12], ecx ; address of indirection cell
+ mov ecx, [ecx]
+ mov dword ptr [edi-8], ecx ; target address
+
+ ; skip original return address and saved esi, edi
+ add esp, 12
+
+ pop edx
+ pop ecx
+
+ lea esp, [edi-12] ; new value for esp
+ pop eax
+
+PUBLIC JIT_TailCallVSDLeave ; add a label here so that TailCallStubManager can access it
+JIT_TailCallVSDLeave:
+ retn ; Will branch to targetAddr. This matches the
+ ; "call" done by JITted code, keeping the
+ ; call-ret count balanced.
+
+JIT_TailCall ENDP
+
+
+;------------------------------------------------------------------------------
+
+; HCIMPL2_VV(float, JIT_FltRem, float dividend, float divisor)
+@JIT_FltRem@8 proc public
+ fld dword ptr [esp+4] ; divisor
+ fld dword ptr [esp+8] ; dividend
+fremloop:
+ fprem
+ fstsw ax
+ fwait
+ sahf
+ jp fremloop ; Continue while the FPU status bit C2 is set
+ fxch ; swap, so divisor is on top and result is in st(1)
+ fstp ST(0) ; Pop the divisor from the FP stack
+ retn 8 ; Return value is in st(0)
+@JIT_FltRem@8 endp
+
+; HCIMPL2_VV(float, JIT_DblRem, float dividend, float divisor)
+@JIT_DblRem@16 proc public
+ fld qword ptr [esp+4] ; divisor
+ fld qword ptr [esp+12] ; dividend
+fremloopd:
+ fprem
+ fstsw ax
+ fwait
+ sahf
+ jp fremloopd ; Continue while the FPU status bit C2 is set
+ fxch ; swap, so divisor is on top and result is in st(1)
+ fstp ST(0) ; Pop the divisor from the FP stack
+ retn 16 ; Return value is in st(0)
+@JIT_DblRem@16 endp
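+; Both remainder helpers above implement C fmod semantics (truncating division,
+; the result carries the sign of the dividend); a one-line C sketch of each:
+;
+;   float  FltRem_sketch(float dividend, float divisor)   { return fmodf(dividend, divisor); }
+;   double DblRem_sketch(double dividend, double divisor) { return fmod(dividend, divisor); }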
+
+;------------------------------------------------------------------------------
+
+g_SystemInfo TEXTEQU <?g_SystemInfo@@3U_SYSTEM_INFO@@A>
+g_SpinConstants TEXTEQU <?g_SpinConstants@@3USpinConstants@@A>
+g_pSyncTable TEXTEQU <?g_pSyncTable@@3PAVSyncTableEntry@@A>
+JITutil_MonEnterWorker TEXTEQU <@JITutil_MonEnterWorker@4>
+JITutil_MonReliableEnter TEXTEQU <@JITutil_MonReliableEnter@8>
+JITutil_MonTryEnter TEXTEQU <@JITutil_MonTryEnter@12>
+JITutil_MonExitWorker TEXTEQU <@JITutil_MonExitWorker@4>
+JITutil_MonContention TEXTEQU <@JITutil_MonContention@4>
+JITutil_MonReliableContention TEXTEQU <@JITutil_MonReliableContention@8>
+JITutil_MonSignal TEXTEQU <@JITutil_MonSignal@4>
+JIT_InternalThrow TEXTEQU <@JIT_InternalThrow@4>
+EXTRN g_SystemInfo:BYTE
+EXTRN g_SpinConstants:BYTE
+EXTRN g_pSyncTable:DWORD
+EXTRN JITutil_MonEnterWorker:PROC
+EXTRN JITutil_MonReliableEnter:PROC
+EXTRN JITutil_MonTryEnter:PROC
+EXTRN JITutil_MonExitWorker:PROC
+EXTRN JITutil_MonContention:PROC
+EXTRN JITutil_MonReliableContention:PROC
+EXTRN JITutil_MonSignal:PROC
+EXTRN JIT_InternalThrow:PROC
+
+ifdef MON_DEBUG
+ifdef TRACK_SYNC
+EnterSyncHelper TEXTEQU <_EnterSyncHelper@8>
+LeaveSyncHelper TEXTEQU <_LeaveSyncHelper@8>
+EXTRN EnterSyncHelper:PROC
+EXTRN LeaveSyncHelper:PROC
+endif ;TRACK_SYNC
+endif ;MON_DEBUG
+
+; The following macro is needed because MASM returns
+; "instruction prefix not allowed" error message for
+; rep nop mnemonic
+$repnop MACRO
+ db 0F3h
+ db 090h
+ENDM
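+; The monitor helpers below spin with exponential backoff on multiprocessor
+; machines before falling back to a blocking helper. A rough C sketch of that
+; spin loop (field names are taken from the asmconstants offsets used below):
+;
+;   DWORD delay = g_SpinConstants.dwInitialDuration;
+;   while (g_SystemInfo.dwNumberOfProcessors > 1)
+;   {
+;       for (DWORD i = delay; i != 0; i--)
+;           YieldProcessor();                          // the $repnop (PAUSE) above
+;       delay *= g_SpinConstants.dwBackoffFactor;
+;       if (delay > g_SpinConstants.dwMaximumDuration)
+;           break;                                     // give up and block
+;       /* ...retry the lock acquisition, loop again only if it still fails... */
+;   }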
+
+; Safe ThreadAbort does not abort a thread if it is running a finally block or has lock counts.
+; At the time we call Monitor.Enter, we initiate the abort if we can.
+; We do not need to do the same for Monitor.Leave, since most of the time Monitor.Leave is called
+; during a finally block.
+
+;**********************************************************************
+; This is a frameless helper for entering a monitor on an object.
+; The object is in ARGUMENT_REG1. This tries the normal case (no
+; blocking or object allocation) in line and calls a framed helper
+; for the other cases.
+; ***** NOTE: if you make any changes to this routine, build with MON_DEBUG undefined
+; to make sure you don't break the non-debug build. This is very fragile code.
+; Also, propagate the changes to jithelp.s which contains the same helper and assembly code
+; (in AT&T syntax) for gnu assembler.
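+; Rough C sketch of the thin-lock fast path attempted below (constants and field
+; names come from asmconstants.inc; the function name is hypothetical):
+;
+;   bool ThinLockTryEnter_sketch(Object* obj, Thread* self)
+;   {
+;       DWORD volatile* header = (DWORD*)((BYTE*)obj - SyncBlockIndexOffset);
+;       DWORD old = *header;
+;       if (old & SBLK_COMBINED_MASK)                  // hash/syncblock/spinlock/owned?
+;           return false;                              // fall back to the slower paths
+;       DWORD tid = self->m_ThreadId;
+;       if (tid > SBLK_MASK_LOCK_THREADID)
+;           return false;                              // thread id does not fit: need a sync block
+;       if (InterlockedCompareExchange((LONG volatile*)header, old | tid, old) != (LONG)old)
+;           return false;                              // lost the race: retry or back off
+;       self->m_dwLockCount++;
+;       return true;                                   // lock acquired
+;   }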
+@JIT_MonEnterWorker@4 proc public
+ ; Initialize delay value for retry with exponential backoff
+ push ebx
+ mov ebx, dword ptr g_SpinConstants+SpinConstants_dwInitialDuration
+
+ ; We need yet another register to avoid refetching the thread object
+ push esi
+
+ ; Check if the instance is NULL.
+ test ARGUMENT_REG1, ARGUMENT_REG1
+ jz MonEnterFramedLockHelper
+
+ call _GetThread@0
+ mov esi,eax
+
+ ; Check if we can abort here
+ mov eax, [esi+Thread_m_State]
+ and eax, TS_CatchAtSafePoint_ASM
+ jz MonEnterRetryThinLock
+ ; go through the slow code path to initiate ThreadAbort.
+ jmp MonEnterFramedLockHelper
+
+MonEnterRetryThinLock:
+ ; Fetch the object header dword
+ mov eax, [ARGUMENT_REG1-SyncBlockIndexOffset_ASM]
+
+ ; Check whether we have the "thin lock" layout, the lock is free and the spin lock bit not set
+ ; SBLK_COMBINED_MASK_ASM = BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX + BIT_SBLK_SPIN_LOCK + SBLK_MASK_LOCK_THREADID + SBLK_MASK_LOCK_RECLEVEL
+ test eax, SBLK_COMBINED_MASK_ASM
+ jnz MonEnterNeedMoreTests
+
+ ; Everything is fine - get the thread id to store in the lock
+ mov edx, [esi+Thread_m_ThreadId]
+
+ ; If the thread id is too large, we need a syncblock for sure
+ cmp edx, SBLK_MASK_LOCK_THREADID_ASM
+ ja MonEnterFramedLockHelper
+
+ ; We want to store a new value with the current thread id set in the low 10 bits
+ or edx,eax
+ lock cmpxchg dword ptr [ARGUMENT_REG1-SyncBlockIndexOffset_ASM], edx
+ jnz MonEnterPrepareToWaitThinLock
+
+ ; Everything went fine and we're done
+ add [esi+Thread_m_dwLockCount],1
+ pop esi
+ pop ebx
+ ret
+
+MonEnterNeedMoreTests:
+ ; Ok, it's not the simple case - find out which case it is
+ test eax, BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX_ASM
+ jnz MonEnterHaveHashOrSyncBlockIndex
+
+    ; The header is transitioning (the spin lock bit is set) - treat this as if the lock was taken
+ test eax, BIT_SBLK_SPIN_LOCK_ASM
+ jnz MonEnterPrepareToWaitThinLock
+
+ ; Here we know we have the "thin lock" layout, but the lock is not free.
+ ; It could still be the recursion case - compare the thread id to check
+ mov edx,eax
+ and edx, SBLK_MASK_LOCK_THREADID_ASM
+ cmp edx, [esi+Thread_m_ThreadId]
+ jne MonEnterPrepareToWaitThinLock
+
+ ; Ok, the thread id matches, it's the recursion case.
+ ; Bump up the recursion level and check for overflow
+ lea edx, [eax+SBLK_LOCK_RECLEVEL_INC_ASM]
+ test edx, SBLK_MASK_LOCK_RECLEVEL_ASM
+ jz MonEnterFramedLockHelper
+
+ ; Try to put the new recursion level back. If the header was changed in the meantime,
+ ; we need a full retry, because the layout could have changed.
+ lock cmpxchg [ARGUMENT_REG1-SyncBlockIndexOffset_ASM], edx
+ jnz MonEnterRetryHelperThinLock
+
+ ; Everything went fine and we're done
+ pop esi
+ pop ebx
+ ret
+
+MonEnterPrepareToWaitThinLock:
+ ; If we are on an MP system, we try spinning for a certain number of iterations
+ cmp dword ptr g_SystemInfo+SYSTEM_INFO_dwNumberOfProcessors,1
+ jle MonEnterFramedLockHelper
+
+ ; exponential backoff: delay by approximately 2*ebx clock cycles (on a PIII)
+ mov eax, ebx
+MonEnterdelayLoopThinLock:
+ $repnop ; indicate to the CPU that we are spin waiting (useful for some Intel P4 multiprocs)
+ dec eax
+ jnz MonEnterdelayLoopThinLock
+
+ ; next time, wait a factor longer
+ imul ebx, dword ptr g_SpinConstants+SpinConstants_dwBackoffFactor
+
+ cmp ebx, dword ptr g_SpinConstants+SpinConstants_dwMaximumDuration
+ jle MonEnterRetryHelperThinLock
+
+ jmp MonEnterFramedLockHelper
+
+MonEnterRetryHelperThinLock:
+ jmp MonEnterRetryThinLock
+
+MonEnterHaveHashOrSyncBlockIndex:
+ ; If we have a hash code already, we need to create a sync block
+ test eax, BIT_SBLK_IS_HASHCODE_ASM
+ jnz MonEnterFramedLockHelper
+
+    ; Ok, we have a sync block index - just AND out the top bits and grab the syncblock index
+ and eax, MASK_SYNCBLOCKINDEX_ASM
+
+ ; Get the sync block pointer.
+ mov ARGUMENT_REG2, dword ptr g_pSyncTable
+ mov ARGUMENT_REG2, [ARGUMENT_REG2+eax*SizeOfSyncTableEntry_ASM+SyncTableEntry_m_SyncBlock]
+
+ ; Check if the sync block has been allocated.
+ test ARGUMENT_REG2, ARGUMENT_REG2
+ jz MonEnterFramedLockHelper
+
+ ; Get a pointer to the lock object.
+ lea ARGUMENT_REG2, [ARGUMENT_REG2+SyncBlock_m_Monitor]
+
+ ; Attempt to acquire the lock.
+MonEnterRetrySyncBlock:
+ mov eax, [ARGUMENT_REG2+AwareLock_m_MonitorHeld]
+ test eax,eax
+ jne MonEnterHaveWaiters
+
+ ; Common case, lock isn't held and there are no waiters. Attempt to
+ ; gain ownership ourselves.
+ mov ARGUMENT_REG1,1
+ lock cmpxchg [ARGUMENT_REG2+AwareLock_m_MonitorHeld], ARGUMENT_REG1
+ jnz MonEnterRetryHelperSyncBlock
+
+ ; Success. Save the thread object in the lock and increment the use count.
+ mov dword ptr [ARGUMENT_REG2+AwareLock_m_HoldingThread],esi
+ inc dword ptr [esi+Thread_m_dwLockCount]
+ inc dword ptr [ARGUMENT_REG2+AwareLock_m_Recursion]
+
+ifdef MON_DEBUG
+ifdef TRACK_SYNC
+ push ARGUMENT_REG2 ; AwareLock
+ push [esp+4] ; return address
+ call EnterSyncHelper
+endif ;TRACK_SYNC
+endif ;MON_DEBUG
+ pop esi
+ pop ebx
+ ret
+
+ ; It's possible to get here with waiters but no lock held, but in this
+ ; case a signal is about to be fired which will wake up a waiter. So
+ ; for fairness sake we should wait too.
+ ; Check first for recursive lock attempts on the same thread.
+MonEnterHaveWaiters:
+ ; Is mutex already owned by current thread?
+ cmp [ARGUMENT_REG2+AwareLock_m_HoldingThread],esi
+ jne MonEnterPrepareToWait
+
+ ; Yes, bump our use count.
+ inc dword ptr [ARGUMENT_REG2+AwareLock_m_Recursion]
+ifdef MON_DEBUG
+ifdef TRACK_SYNC
+ push ARGUMENT_REG2 ; AwareLock
+ push [esp+4] ; return address
+ call EnterSyncHelper
+endif ;TRACK_SYNC
+endif ;MON_DEBUG
+ pop esi
+ pop ebx
+ ret
+
+MonEnterPrepareToWait:
+ ; If we are on an MP system, we try spinning for a certain number of iterations
+ cmp dword ptr g_SystemInfo+SYSTEM_INFO_dwNumberOfProcessors,1
+ jle MonEnterHaveWaiters1
+
+ ; exponential backoff: delay by approximately 2*ebx clock cycles (on a PIII)
+ mov eax,ebx
+MonEnterdelayLoop:
+ $repnop ; indicate to the CPU that we are spin waiting (useful for some Intel P4 multiprocs)
+ dec eax
+ jnz MonEnterdelayLoop
+
+ ; next time, wait a factor longer
+ imul ebx, dword ptr g_SpinConstants+SpinConstants_dwBackoffFactor
+
+ cmp ebx, dword ptr g_SpinConstants+SpinConstants_dwMaximumDuration
+ jle MonEnterRetrySyncBlock
+
+MonEnterHaveWaiters1:
+
+ pop esi
+ pop ebx
+
+ ; Place AwareLock in arg1 then call contention helper.
+ mov ARGUMENT_REG1, ARGUMENT_REG2
+ jmp JITutil_MonContention
+
+MonEnterRetryHelperSyncBlock:
+ jmp MonEnterRetrySyncBlock
+
+ ; ECX has the object to synchronize on
+MonEnterFramedLockHelper:
+ pop esi
+ pop ebx
+ jmp JITutil_MonEnterWorker
+
+@JIT_MonEnterWorker@4 endp
+
+;**********************************************************************
+; This is a frameless helper for entering a monitor on an object, and
+; setting a flag to indicate that the lock was taken.
+; The object is in ARGUMENT_REG1. The flag is in ARGUMENT_REG2.
+; This tries the normal case (no blocking or object allocation) in line
+; and calls a framed helper for the other cases.
+; ***** NOTE: if you make any changes to this routine, build with MON_DEBUG undefined
+; to make sure you don't break the non-debug build. This is very fragile code.
+; Also, propagate the changes to jithelp.s which contains the same helper and assembly code
+; (in AT&T syntax) for gnu assembler.
+@JIT_MonReliableEnter@8 proc public
+ ; Initialize delay value for retry with exponential backoff
+ push ebx
+ mov ebx, dword ptr g_SpinConstants+SpinConstants_dwInitialDuration
+
+ ; Put pbLockTaken in edi
+ push edi
+ mov edi, ARGUMENT_REG2
+
+ ; We need yet another register to avoid refetching the thread object
+ push esi
+
+ ; Check if the instance is NULL.
+ test ARGUMENT_REG1, ARGUMENT_REG1
+ jz MonReliableEnterFramedLockHelper
+
+ call _GetThread@0
+ mov esi,eax
+
+ ; Check if we can abort here
+ mov eax, [esi+Thread_m_State]
+ and eax, TS_CatchAtSafePoint_ASM
+ jz MonReliableEnterRetryThinLock
+ ; go through the slow code path to initiate ThreadAbort.
+ jmp MonReliableEnterFramedLockHelper
+
+MonReliableEnterRetryThinLock:
+ ; Fetch the object header dword
+ mov eax, [ARGUMENT_REG1-SyncBlockIndexOffset_ASM]
+
+ ; Check whether we have the "thin lock" layout, the lock is free and the spin lock bit not set
+ ; SBLK_COMBINED_MASK_ASM = BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX + BIT_SBLK_SPIN_LOCK + SBLK_MASK_LOCK_THREADID + SBLK_MASK_LOCK_RECLEVEL
+ test eax, SBLK_COMBINED_MASK_ASM
+ jnz MonReliableEnterNeedMoreTests
+
+ ; Everything is fine - get the thread id to store in the lock
+ mov edx, [esi+Thread_m_ThreadId]
+
+ ; If the thread id is too large, we need a syncblock for sure
+ cmp edx, SBLK_MASK_LOCK_THREADID_ASM
+ ja MonReliableEnterFramedLockHelper
+
+ ; We want to store a new value with the current thread id set in the low 10 bits
+ or edx,eax
+ lock cmpxchg dword ptr [ARGUMENT_REG1-SyncBlockIndexOffset_ASM], edx
+ jnz MonReliableEnterPrepareToWaitThinLock
+
+ ; Everything went fine and we're done
+ add [esi+Thread_m_dwLockCount],1
+ ; Set *pbLockTaken=true
+ mov byte ptr [edi],1
+ pop esi
+ pop edi
+ pop ebx
+ ret
+
+MonReliableEnterNeedMoreTests:
+ ; Ok, it's not the simple case - find out which case it is
+ test eax, BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX_ASM
+ jnz MonReliableEnterHaveHashOrSyncBlockIndex
+
+    ; The header is transitioning (the spin lock bit is set) - treat this as if the lock was taken
+ test eax, BIT_SBLK_SPIN_LOCK_ASM
+ jnz MonReliableEnterPrepareToWaitThinLock
+
+ ; Here we know we have the "thin lock" layout, but the lock is not free.
+ ; It could still be the recursion case - compare the thread id to check
+ mov edx,eax
+ and edx, SBLK_MASK_LOCK_THREADID_ASM
+ cmp edx, [esi+Thread_m_ThreadId]
+ jne MonReliableEnterPrepareToWaitThinLock
+
+ ; Ok, the thread id matches, it's the recursion case.
+ ; Bump up the recursion level and check for overflow
+ lea edx, [eax+SBLK_LOCK_RECLEVEL_INC_ASM]
+ test edx, SBLK_MASK_LOCK_RECLEVEL_ASM
+ jz MonReliableEnterFramedLockHelper
+
+ ; Try to put the new recursion level back. If the header was changed in the meantime,
+ ; we need a full retry, because the layout could have changed.
+ lock cmpxchg [ARGUMENT_REG1-SyncBlockIndexOffset_ASM], edx
+ jnz MonReliableEnterRetryHelperThinLock
+
+ ; Everything went fine and we're done
+ ; Set *pbLockTaken=true
+ mov byte ptr [edi],1
+ pop esi
+ pop edi
+ pop ebx
+ ret
+
+MonReliableEnterPrepareToWaitThinLock:
+ ; If we are on an MP system, we try spinning for a certain number of iterations
+ cmp dword ptr g_SystemInfo+SYSTEM_INFO_dwNumberOfProcessors,1
+ jle MonReliableEnterFramedLockHelper
+
+ ; exponential backoff: delay by approximately 2*ebx clock cycles (on a PIII)
+ mov eax, ebx
+MonReliableEnterdelayLoopThinLock:
+ $repnop ; indicate to the CPU that we are spin waiting (useful for some Intel P4 multiprocs)
+ dec eax
+ jnz MonReliableEnterdelayLoopThinLock
+
+ ; next time, wait a factor longer
+ imul ebx, dword ptr g_SpinConstants+SpinConstants_dwBackoffFactor
+
+ cmp ebx, dword ptr g_SpinConstants+SpinConstants_dwMaximumDuration
+ jle MonReliableEnterRetryHelperThinLock
+
+ jmp MonReliableEnterFramedLockHelper
+
+MonReliableEnterRetryHelperThinLock:
+ jmp MonReliableEnterRetryThinLock
+
+MonReliableEnterHaveHashOrSyncBlockIndex:
+ ; If we have a hash code already, we need to create a sync block
+ test eax, BIT_SBLK_IS_HASHCODE_ASM
+ jnz MonReliableEnterFramedLockHelper
+
+    ; Ok, we have a sync block index - just AND out the top bits and grab the syncblock index
+ and eax, MASK_SYNCBLOCKINDEX_ASM
+
+ ; Get the sync block pointer.
+ mov ARGUMENT_REG2, dword ptr g_pSyncTable
+ mov ARGUMENT_REG2, [ARGUMENT_REG2+eax*SizeOfSyncTableEntry_ASM+SyncTableEntry_m_SyncBlock]
+
+ ; Check if the sync block has been allocated.
+ test ARGUMENT_REG2, ARGUMENT_REG2
+ jz MonReliableEnterFramedLockHelper
+
+ ; Get a pointer to the lock object.
+ lea ARGUMENT_REG2, [ARGUMENT_REG2+SyncBlock_m_Monitor]
+
+ ; Attempt to acquire the lock.
+MonReliableEnterRetrySyncBlock:
+ mov eax, [ARGUMENT_REG2+AwareLock_m_MonitorHeld]
+ test eax,eax
+ jne MonReliableEnterHaveWaiters
+
+ ; Common case, lock isn't held and there are no waiters. Attempt to
+ ; gain ownership ourselves.
+ mov ARGUMENT_REG1,1
+ lock cmpxchg [ARGUMENT_REG2+AwareLock_m_MonitorHeld], ARGUMENT_REG1
+ jnz MonReliableEnterRetryHelperSyncBlock
+
+ ; Success. Save the thread object in the lock and increment the use count.
+ mov dword ptr [ARGUMENT_REG2+AwareLock_m_HoldingThread],esi
+ inc dword ptr [esi+Thread_m_dwLockCount]
+ inc dword ptr [ARGUMENT_REG2+AwareLock_m_Recursion]
+ ; Set *pbLockTaken=true
+ mov byte ptr [edi],1
+
+ifdef MON_DEBUG
+ifdef TRACK_SYNC
+ push ARGUMENT_REG2 ; AwareLock
+ push [esp+4] ; return address
+ call EnterSyncHelper
+endif ;TRACK_SYNC
+endif ;MON_DEBUG
+ pop esi
+ pop edi
+ pop ebx
+ ret
+
+ ; It's possible to get here with waiters but no lock held, but in this
+ ; case a signal is about to be fired which will wake up a waiter. So
+ ; for fairness sake we should wait too.
+ ; Check first for recursive lock attempts on the same thread.
+MonReliableEnterHaveWaiters:
+ ; Is mutex already owned by current thread?
+ cmp [ARGUMENT_REG2+AwareLock_m_HoldingThread],esi
+ jne MonReliableEnterPrepareToWait
+
+ ; Yes, bump our use count.
+ inc dword ptr [ARGUMENT_REG2+AwareLock_m_Recursion]
+ ; Set *pbLockTaken=true
+ mov byte ptr [edi],1
+ifdef MON_DEBUG
+ifdef TRACK_SYNC
+ push ARGUMENT_REG2 ; AwareLock
+ push [esp+4] ; return address
+ call EnterSyncHelper
+endif ;TRACK_SYNC
+endif ;MON_DEBUG
+ pop esi
+ pop edi
+ pop ebx
+ ret
+
+MonReliableEnterPrepareToWait:
+ ; If we are on an MP system, we try spinning for a certain number of iterations
+ cmp dword ptr g_SystemInfo+SYSTEM_INFO_dwNumberOfProcessors,1
+ jle MonReliableEnterHaveWaiters1
+
+ ; exponential backoff: delay by approximately 2*ebx clock cycles (on a PIII)
+ mov eax,ebx
+MonReliableEnterdelayLoop:
+ $repnop ; indicate to the CPU that we are spin waiting (useful for some Intel P4 multiprocs)
+ dec eax
+ jnz MonReliableEnterdelayLoop
+
+ ; next time, wait a factor longer
+ imul ebx, dword ptr g_SpinConstants+SpinConstants_dwBackoffFactor
+
+ cmp ebx, dword ptr g_SpinConstants+SpinConstants_dwMaximumDuration
+ jle MonReliableEnterRetrySyncBlock
+
+MonReliableEnterHaveWaiters1:
+
+ ; Place AwareLock in arg1, pbLockTaken in arg2, then call contention helper.
+ mov ARGUMENT_REG1, ARGUMENT_REG2
+ mov ARGUMENT_REG2, edi
+
+ pop esi
+ pop edi
+ pop ebx
+
+ jmp JITutil_MonReliableContention
+
+MonReliableEnterRetryHelperSyncBlock:
+ jmp MonReliableEnterRetrySyncBlock
+
+ ; ECX has the object to synchronize on
+MonReliableEnterFramedLockHelper:
+ mov ARGUMENT_REG2, edi
+ pop esi
+ pop edi
+ pop ebx
+ jmp JITutil_MonReliableEnter
+
+@JIT_MonReliableEnter@8 endp
+
+;************************************************************************
+; This is a frameless helper for trying to enter a monitor on an object.
+; The object is in ARGUMENT_REG1 and a timeout in ARGUMENT_REG2. This tries the
+; normal case (no object allocation) in line and calls a framed helper for the
+; other cases.
+; ***** NOTE: if you make any changes to this routine, build with MON_DEBUG undefined
+; to make sure you don't break the non-debug build. This is very fragile code.
+; Also, propagate the changes to jithelp.s which contains the same helper and assembly code
+; (in AT&T syntax) for gnu assembler.
+@JIT_MonTryEnter@12 proc public
+ ; Save the timeout parameter.
+ push ARGUMENT_REG2
+
+ ; Initialize delay value for retry with exponential backoff
+ push ebx
+ mov ebx, dword ptr g_SpinConstants+SpinConstants_dwInitialDuration
+
+ ; The thin lock logic needs another register to store the thread
+ push esi
+
+ ; Check if the instance is NULL.
+ test ARGUMENT_REG1, ARGUMENT_REG1
+ jz MonTryEnterFramedLockHelper
+
+ ; Check if the timeout looks valid
+ cmp ARGUMENT_REG2,-1
+ jl MonTryEnterFramedLockHelper
+
+ ; Get the thread right away, we'll need it in any case
+ call _GetThread@0
+ mov esi,eax
+
+ ; Check if we can abort here
+ mov eax, [esi+Thread_m_State]
+ and eax, TS_CatchAtSafePoint_ASM
+ jz MonTryEnterRetryThinLock
+ ; go through the slow code path to initiate ThreadAbort.
+ jmp MonTryEnterFramedLockHelper
+
+MonTryEnterRetryThinLock:
+ ; Get the header dword and check its layout
+ mov eax, [ARGUMENT_REG1-SyncBlockIndexOffset_ASM]
+
+ ; Check whether we have the "thin lock" layout, the lock is free and the spin lock bit not set
+ ; SBLK_COMBINED_MASK_ASM = BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX + BIT_SBLK_SPIN_LOCK + SBLK_MASK_LOCK_THREADID + SBLK_MASK_LOCK_RECLEVEL
+ test eax, SBLK_COMBINED_MASK_ASM
+ jnz MonTryEnterNeedMoreTests
+
+ ; Ok, everything is fine. Fetch the thread id and make sure it's small enough for thin locks
+ mov edx, [esi+Thread_m_ThreadId]
+ cmp edx, SBLK_MASK_LOCK_THREADID_ASM
+ ja MonTryEnterFramedLockHelper
+
+ ; Try to put our thread id in there
+ or edx,eax
+ lock cmpxchg [ARGUMENT_REG1-SyncBlockIndexOffset_ASM],edx
+ jnz MonTryEnterRetryHelperThinLock
+
+    ; Got the lock - everything is fine
+ add [esi+Thread_m_dwLockCount],1
+ pop esi
+
+ ; Delay value no longer needed
+ pop ebx
+
+ ; Timeout parameter not needed, ditch it from the stack.
+ add esp,4
+
+ mov eax, [esp+4]
+ mov byte ptr [eax], 1
+ ret 4
+
+MonTryEnterNeedMoreTests:
+ ; Ok, it's not the simple case - find out which case it is
+ test eax, BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX_ASM
+ jnz MonTryEnterHaveSyncBlockIndexOrHash
+
+ ; The header is transitioning or the lock is taken
+ test eax, BIT_SBLK_SPIN_LOCK_ASM
+ jnz MonTryEnterRetryHelperThinLock
+
+ mov edx, eax
+ and edx, SBLK_MASK_LOCK_THREADID_ASM
+ cmp edx, [esi+Thread_m_ThreadId]
+ jne MonTryEnterPrepareToWaitThinLock
+
+ ; Ok, the thread id matches, it's the recursion case.
+ ; Bump up the recursion level and check for overflow
+ lea edx, [eax+SBLK_LOCK_RECLEVEL_INC_ASM]
+ test edx, SBLK_MASK_LOCK_RECLEVEL_ASM
+ jz MonTryEnterFramedLockHelper
+
+ ; Try to put the new recursion level back. If the header was changed in the meantime,
+ ; we need a full retry, because the layout could have changed.
+ lock cmpxchg [ARGUMENT_REG1-SyncBlockIndexOffset_ASM],edx
+ jnz MonTryEnterRetryHelperThinLock
+
+ ; Everything went fine and we're done
+ pop esi
+ pop ebx
+
+ ; Timeout parameter not needed, ditch it from the stack.
+ add esp, 4
+ mov eax, [esp+4]
+ mov byte ptr [eax], 1
+ ret 4
+
+MonTryEnterPrepareToWaitThinLock:
+ ; If we are on an MP system, we try spinning for a certain number of iterations
+ cmp dword ptr g_SystemInfo+SYSTEM_INFO_dwNumberOfProcessors,1
+ jle MonTryEnterFramedLockHelper
+
+ ; exponential backoff: delay by approximately 2*ebx clock cycles (on a PIII)
+ mov eax, ebx
+MonTryEnterdelayLoopThinLock:
+ $repnop ; indicate to the CPU that we are spin waiting (useful for some Intel P4 multiprocs)
+ dec eax
+ jnz MonTryEnterdelayLoopThinLock
+
+ ; next time, wait a factor longer
+ imul ebx, dword ptr g_SpinConstants+SpinConstants_dwBackoffFactor
+
+ cmp ebx, dword ptr g_SpinConstants+SpinConstants_dwMaximumDuration
+ jle MonTryEnterRetryHelperThinLock
+
+ jmp MonTryEnterWouldBlock
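+ ; The spin-wait above amounts to the following loop (illustrative C-style
+ ; sketch; the g_SpinConstants fields are the ones referenced above):
+ ;
+ ;   for (delay = g_SpinConstants.dwInitialDuration;
+ ;        delay <= g_SpinConstants.dwMaximumDuration;
+ ;        delay *= g_SpinConstants.dwBackoffFactor)
+ ;   {
+ ;       for (i = delay; i != 0; i--)
+ ;           pause;                        // the $repnop above
+ ;       retry the thin-lock acquisition;
+ ;   }
+ ;   // spinning exhausted - give up and take the blocking path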
+
+MonTryEnterRetryHelperThinLock:
+ jmp MonTryEnterRetryThinLock
+
+
+MonTryEnterHaveSyncBlockIndexOrHash:
+ ; If we have a hash code already, we need to create a sync block
+ test eax, BIT_SBLK_IS_HASHCODE_ASM
+ jnz MonTryEnterFramedLockHelper
+
+ ; Just AND out the top bits and grab the syncblock index
+ and eax, MASK_SYNCBLOCKINDEX_ASM
+
+ ; Get the sync block pointer.
+ mov ARGUMENT_REG2, dword ptr g_pSyncTable
+ mov ARGUMENT_REG2, [ARGUMENT_REG2+eax*SizeOfSyncTableEntry_ASM+SyncTableEntry_m_SyncBlock]
+
+ ; Check if the sync block has been allocated.
+ test ARGUMENT_REG2, ARGUMENT_REG2
+ jz MonTryEnterFramedLockHelper
+
+ ; Get a pointer to the lock object.
+ lea ARGUMENT_REG2, [ARGUMENT_REG2+SyncBlock_m_Monitor]
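+ ; Locating the AwareLock above is roughly (illustrative C-style sketch;
+ ; names are the _ASM aliases used in this file):
+ ;
+ ;   index     = header & MASK_SYNCBLOCKINDEX;
+ ;   syncBlock = g_pSyncTable[index].m_SyncBlock;    // NULL -> framed helper
+ ;   lock      = &syncBlock->m_Monitor;              // the AwareLock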
+
+MonTryEnterRetrySyncBlock:
+ ; Attempt to acquire the lock.
+ mov eax, [ARGUMENT_REG2+AwareLock_m_MonitorHeld]
+ test eax,eax
+ jne MonTryEnterHaveWaiters
+
+ ; We need another scratch register for what follows, so save EBX now so
+ ; we can use it for that purpose.
+ push ebx
+
+ ; Common case, lock isn't held and there are no waiters. Attempt to
+ ; gain ownership ourselves.
+ mov ebx,1
+ lock cmpxchg [ARGUMENT_REG2+AwareLock_m_MonitorHeld],ebx
+
+ pop ebx
+
+ jnz MonTryEnterRetryHelperSyncBlock
+
+ ; Success. Save the thread object in the lock and increment the use count.
+ mov dword ptr [ARGUMENT_REG2+AwareLock_m_HoldingThread],esi
+ inc dword ptr [ARGUMENT_REG2+AwareLock_m_Recursion]
+ inc dword ptr [esi+Thread_m_dwLockCount]
+
+ifdef MON_DEBUG
+ifdef TRACK_SYNC
+ push ARGUMENT_REG2 ; AwareLock
+ push [esp+4] ; return address
+ call EnterSyncHelper
+endif ;TRACK_SYNC
+endif ;MON_DEBUG
+
+ pop esi
+ pop ebx
+
+ ; Timeout parameter not needed, ditch it from the stack.
+ add esp,4
+
+ mov eax, [esp+4]
+ mov byte ptr [eax], 1
+ ret 4
+
+ ; It's possible to get here with waiters but no lock held, but in this
+ ; case a signal is about to be fired which will wake up a waiter. So
+ ; for fairness' sake we should wait too.
+ ; Check first for recursive lock attempts on the same thread.
+MonTryEnterHaveWaiters:
+ ; Is mutex already owned by current thread?
+ cmp [ARGUMENT_REG2+AwareLock_m_HoldingThread],esi
+ jne MonTryEnterPrepareToWait
+
+ ; Yes, bump our use count.
+ inc dword ptr [ARGUMENT_REG2+AwareLock_m_Recursion]
+ifdef MON_DEBUG
+ifdef TRACK_SYNC
+ push ARGUMENT_REG2 ; AwareLock
+ push [esp+4] ; return address
+ call EnterSyncHelper
+endif ;TRACK_SYNC
+endif ;MON_DEBUG
+ pop esi
+ pop ebx
+
+ ; Timeout parameter not needed, ditch it from the stack.
+ add esp,4
+
+ mov eax, [esp+4]
+ mov byte ptr [eax], 1
+ ret 4
+
+MonTryEnterPrepareToWait:
+ ; If we are on an MP system, we try spinning for a certain number of iterations
+ cmp dword ptr g_SystemInfo+SYSTEM_INFO_dwNumberOfProcessors,1
+ jle MonTryEnterWouldBlock
+
+ ; exponential backoff: delay by approximately 2*ebx clock cycles (on a PIII)
+ mov eax, ebx
+MonTryEnterdelayLoop:
+ $repnop ; indicate to the CPU that we are spin waiting (useful for some Intel P4 multiprocs)
+ dec eax
+ jnz MonTryEnterdelayLoop
+
+ ; next time, wait a factor longer
+ imul ebx, dword ptr g_SpinConstants+SpinConstants_dwBackoffFactor
+
+ cmp ebx, dword ptr g_SpinConstants+SpinConstants_dwMaximumDuration
+ jle MonTryEnterRetrySyncBlock
+
+ ; We would need to block to enter the section. Return failure if
+ ; timeout is zero, else call the framed helper to do the blocking
+ ; form of TryEnter.
+MonTryEnterWouldBlock:
+ pop esi
+ pop ebx
+ pop ARGUMENT_REG2
+ test ARGUMENT_REG2, ARGUMENT_REG2
+ jnz MonTryEnterBlock
+ mov eax, [esp+4]
+ mov byte ptr [eax], 0
+ ret 4
+
+MonTryEnterRetryHelperSyncBlock:
+ jmp MonTryEnterRetrySyncBlock
+
+MonTryEnterFramedLockHelper:
+ ; ARGUMENT_REG1 has the object to synchronize on, must retrieve the
+ ; timeout parameter from the stack.
+ pop esi
+ pop ebx
+ pop ARGUMENT_REG2
+MonTryEnterBlock:
+ jmp JITutil_MonTryEnter
+
+@JIT_MonTryEnter@12 endp
+
+;**********************************************************************
+; This is a frameless helper for exiting a monitor on an object.
+; The object is in ARGUMENT_REG1. This tries the normal case (no
+; blocking or object allocation) in line and calls a framed helper
+; for the other cases.
+; ***** NOTE: if you make any changes to this routine, build with MON_DEBUG undefined
+; to make sure you don't break the non-debug build. This is very fragile code.
+; Also, propagate the changes to jithelp.s which contains the same helper and assembly code
+; (in AT&T syntax) for gnu assembler.
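+;
+; The thin-lock exit fast path below can be read roughly as this C-style sketch
+; (illustrative only; names are the _ASM aliases defined in asmconstants.inc):
+;
+;   DWORD header = *(DWORD*)(obj - SyncBlockIndexOffset);
+;   if ((header & (BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX | BIT_SBLK_SPIN_LOCK)) == 0 &&
+;       (header & SBLK_MASK_LOCK_THREADID) == thread->m_ThreadId)
+;   {
+;       if (recursion-level bits are zero)  clear the thread id;           // release, thread->m_dwLockCount--
+;       else                                subtract SBLK_LOCK_RECLEVEL_INC;
+;       publish the new header with a compare-exchange, retrying on failure;
+;   }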
+@JIT_MonExitWorker@4 proc public
+ ; The thin lock logic needs an additional register to hold the thread, unfortunately
+ push esi
+
+ ; Check if the instance is NULL.
+ test ARGUMENT_REG1, ARGUMENT_REG1
+ jz MonExitFramedLockHelper
+
+ call _GetThread@0
+ mov esi,eax
+
+MonExitRetryThinLock:
+ ; Fetch the header dword and check its layout and the spin lock bit
+ mov eax, [ARGUMENT_REG1-SyncBlockIndexOffset_ASM]
+ ;BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX_SPIN_LOCK_ASM = BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX + BIT_SBLK_SPIN_LOCK
+ test eax, BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX_SPIN_LOCK_ASM
+ jnz MonExitNeedMoreTests
+
+ ; Ok, we have a "thin lock" layout - check whether the thread id matches
+ mov edx,eax
+ and edx, SBLK_MASK_LOCK_THREADID_ASM
+ cmp edx, [esi+Thread_m_ThreadId]
+ jne MonExitFramedLockHelper
+
+ ; Check the recursion level
+ test eax, SBLK_MASK_LOCK_RECLEVEL_ASM
+ jne MonExitDecRecursionLevel
+
+ ; It's zero - we're leaving the lock.
+ ; So try to put back a zero thread id.
+ ; edx and eax match in the thread id bits, and edx is zero elsewhere, so the xor is sufficient
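+ ; For example (assuming some unrelated header bit happens to be set):
+ ;   eax = 08000005h (header), edx = 00000005h (our thread id bits)
+ ;   edx xor eax = 08000000h - the thread id is cleared, all other bits are kept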
+ xor edx,eax
+ lock cmpxchg [ARGUMENT_REG1-SyncBlockIndexOffset_ASM],edx
+ jnz MonExitRetryHelperThinLock
+
+ ; We're done
+ sub [esi+Thread_m_dwLockCount],1
+ pop esi
+ ret
+
+MonExitDecRecursionLevel:
+ lea edx, [eax-SBLK_LOCK_RECLEVEL_INC_ASM]
+ lock cmpxchg [ARGUMENT_REG1-SyncBlockIndexOffset_ASM],edx
+ jnz MonExitRetryHelperThinLock
+
+ ; We're done
+ pop esi
+ ret
+
+MonExitNeedMoreTests:
+ ;Forward all special cases to the slow helper
+ ;BIT_SBLK_IS_HASHCODE_OR_SPIN_LOCK_ASM = BIT_SBLK_IS_HASHCODE + BIT_SBLK_SPIN_LOCK
+ test eax, BIT_SBLK_IS_HASHCODE_OR_SPIN_LOCK_ASM
+ jnz MonExitFramedLockHelper
+
+ ; Get the sync block index and use it to compute the sync block pointer
+ mov ARGUMENT_REG2, dword ptr g_pSyncTable
+ and eax, MASK_SYNCBLOCKINDEX_ASM
+ mov ARGUMENT_REG2, [ARGUMENT_REG2+eax*SizeOfSyncTableEntry_ASM+SyncTableEntry_m_SyncBlock]
+
+ ; was there a sync block?
+ test ARGUMENT_REG2, ARGUMENT_REG2
+ jz MonExitFramedLockHelper
+
+ ; Get a pointer to the lock object.
+ lea ARGUMENT_REG2, [ARGUMENT_REG2+SyncBlock_m_Monitor]
+
+ ; Check if lock is held.
+ cmp [ARGUMENT_REG2+AwareLock_m_HoldingThread],esi
+ jne MonExitFramedLockHelper
+
+ifdef MON_DEBUG
+ifdef TRACK_SYNC
+ push ARGUMENT_REG1 ; preserve regs
+ push ARGUMENT_REG2
+
+ push ARGUMENT_REG2 ; AwareLock
+ push [esp+8] ; return address
+ call LeaveSyncHelper
+
+ pop ARGUMENT_REG2 ; restore regs
+ pop ARGUMENT_REG1
+endif ;TRACK_SYNC
+endif ;MON_DEBUG
+ ; Reduce our recursion count.
+ dec dword ptr [ARGUMENT_REG2+AwareLock_m_Recursion]
+ jz MonExitLastRecursion
+
+ pop esi
+ ret
+
+MonExitRetryHelperThinLock:
+ jmp MonExitRetryThinLock
+
+MonExitFramedLockHelper:
+ pop esi
+ jmp JITutil_MonExitWorker
+
+ ; This is the last count we held on this lock, so release the lock.
+MonExitLastRecursion:
+ dec dword ptr [esi+Thread_m_dwLockCount]
+ mov dword ptr [ARGUMENT_REG2+AwareLock_m_HoldingThread],0
+
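+ ; The loop below atomically drops the lock bit of m_MonitorHeld and then
+ ; decides whether a waiter must be signalled. In rough C terms (illustrative
+ ; sketch; the layout - bit 0 = owned, remaining bits = waiters - is inferred
+ ; from how m_MonitorHeld is used in this file):
+ ;
+ ;   do {
+ ;       old = lock->m_MonitorHeld;
+ ;   } while (compare-exchange(&lock->m_MonitorHeld, old - 1, old) fails);
+ ;   if (old & ~1)                        // waiter bits were set
+ ;       JITutil_MonSignal(lock);         // wake one waiter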
+MonExitRetry:
+ mov eax, [ARGUMENT_REG2+AwareLock_m_MonitorHeld]
+ lea esi, [eax-1]
+ lock cmpxchg [ARGUMENT_REG2+AwareLock_m_MonitorHeld], esi
+ jne MonExitRetryHelper
+ pop esi
+ test eax,0FFFFFFFEh
+ jne MonExitMustSignal
+
+ ret
+
+MonExitMustSignal:
+ mov ARGUMENT_REG1, ARGUMENT_REG2
+ jmp JITutil_MonSignal
+
+MonExitRetryHelper:
+ jmp MonExitRetry
+
+@JIT_MonExitWorker@4 endp
+
+;**********************************************************************
+; This is a frameless helper for entering a static monitor on a class.
+; The methoddesc is in ARGUMENT_REG1. This tries the normal case (no
+; blocking or object allocation) in line and calls a framed helper
+; for the other cases.
+; Note we are changing the methoddesc parameter to a pointer to the
+; AwareLock.
+; ***** NOTE: if you make any changes to this routine, build with MON_DEBUG undefined
+; to make sure you don't break the non-debug build. This is very fragile code.
+; Also, propagate the changes to jithelp.s which contains the same helper and assembly code
+; (in AT&T syntax) for gnu assembler.
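+;
+; The fast path below is essentially (illustrative C-style sketch only):
+;
+;   if (compare-exchange(&lock->m_MonitorHeld, 1, 0) succeeds)   // not held, no waiters
+;   {
+;       lock->m_HoldingThread = GetThread();
+;       lock->m_Recursion++;
+;       GetThread()->m_dwLockCount++;
+;       return;
+;   }
+;   // otherwise: recursion on the same thread, or contention (JITutil_MonContention)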
+@JIT_MonEnterStatic@4 proc public
+ ; We need another scratch register for what follows, so save EBX now so
+ ; we can use it for that purpose.
+ push ebx
+
+ ; Attempt to acquire the lock
+MonEnterStaticRetry:
+ mov eax, [ARGUMENT_REG1+AwareLock_m_MonitorHeld]
+ test eax,eax
+ jne MonEnterStaticHaveWaiters
+
+ ; Common case, lock isn't held and there are no waiters. Attempt to
+ ; gain ownership ourselves.
+ mov ebx,1
+ lock cmpxchg [ARGUMENT_REG1+AwareLock_m_MonitorHeld],ebx
+ jnz MonEnterStaticRetryHelper
+
+ pop ebx
+
+ ; Success. Save the thread object in the lock and increment the use count.
+ call _GetThread@0
+ mov [ARGUMENT_REG1+AwareLock_m_HoldingThread], eax
+ inc dword ptr [ARGUMENT_REG1+AwareLock_m_Recursion]
+ inc dword ptr [eax+Thread_m_dwLockCount]
+
+ifdef MON_DEBUG
+ifdef TRACK_SYNC
+ push ARGUMENT_REG1 ; AwareLock
+ push [esp+4] ; return address
+ call EnterSyncHelper
+endif ;TRACK_SYNC
+endif ;MON_DEBUG
+ ret
+
+ ; It's possible to get here with waiters but no lock held, but in this
+ ; case a signal is about to be fired which will wake up a waiter. So
+ ; for fairness' sake we should wait too.
+ ; Check first for recursive lock attempts on the same thread.
+MonEnterStaticHaveWaiters:
+ ; Get thread but preserve EAX (contains cached contents of m_MonitorHeld).
+ push eax
+ call _GetThread@0
+ mov ebx,eax
+ pop eax
+
+ ; Is mutex already owned by current thread?
+ cmp [ARGUMENT_REG1+AwareLock_m_HoldingThread],ebx
+ jne MonEnterStaticPrepareToWait
+
+ ; Yes, bump our use count.
+ inc dword ptr [ARGUMENT_REG1+AwareLock_m_Recursion]
+ifdef MON_DEBUG
+ifdef TRACK_SYNC
+ push ARGUMENT_REG1 ; AwareLock
+ push [esp+4] ; return address
+ call EnterSyncHelper
+endif ;TRACK_SYNC
+endif ;MON_DEBUG
+ pop ebx
+ ret
+
+MonEnterStaticPrepareToWait:
+ pop ebx
+
+ ; ARGUMENT_REG1 should have AwareLock. Call contention helper.
+ jmp JITutil_MonContention
+
+MonEnterStaticRetryHelper:
+ jmp MonEnterStaticRetry
+@JIT_MonEnterStatic@4 endp
+
+;**********************************************************************
+; A frameless helper for exiting a static monitor on a class.
+; The methoddesc is in ARGUMENT_REG1. This tries the normal case (no
+; blocking or object allocation) in line and calls a framed helper
+; for the other cases.
+; Note we are changing the methoddesc parameter to a pointer to the
+; AwareLock.
+; ***** NOTE: if you make any changes to this routine, build with MON_DEBUG undefined
+; to make sure you don't break the non-debug build. This is very fragile code.
+; Also, propagate the changes to jithelp.s which contains the same helper and assembly code
+; (in AT&T syntax) for gnu assembler.
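+;
+; In rough C terms the body below is (illustrative sketch only):
+;
+;   if (lock->m_HoldingThread != GetThread())
+;       throw SynchronizationLockException;
+;   if (--lock->m_Recursion == 0)
+;   {
+;       GetThread()->m_dwLockCount--;
+;       lock->m_HoldingThread = 0;
+;       drop the lock bit of m_MonitorHeld and signal a waiter if one exists;
+;   }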
+@JIT_MonExitStatic@4 proc public
+
+ifdef MON_DEBUG
+ifdef TRACK_SYNC
+ push ARGUMENT_REG1 ; preserve regs
+
+ push ARGUMENT_REG1 ; AwareLock
+ push [esp+8] ; return address
+ call LeaveSyncHelper
+
+ pop ARGUMENT_REG1 ; restore regs
+endif ;TRACK_SYNC
+endif ;MON_DEBUG
+
+ ; Check if lock is held.
+ call _GetThread@0
+ cmp [ARGUMENT_REG1+AwareLock_m_HoldingThread],eax
+ jne MonExitStaticLockError
+
+ ; Reduce our recursion count.
+ dec dword ptr [ARGUMENT_REG1+AwareLock_m_Recursion]
+ jz MonExitStaticLastRecursion
+
+ ret
+
+ ; This is the last count we held on this lock, so release the lock.
+MonExitStaticLastRecursion:
+ ; eax must have the thread object
+ dec dword ptr [eax+Thread_m_dwLockCount]
+ mov dword ptr [ARGUMENT_REG1+AwareLock_m_HoldingThread],0
+ push ebx
+
+MonExitStaticRetry:
+ mov eax, [ARGUMENT_REG1+AwareLock_m_MonitorHeld]
+ lea ebx, [eax-1]
+ lock cmpxchg [ARGUMENT_REG1+AwareLock_m_MonitorHeld],ebx
+ jne MonExitStaticRetryHelper
+ pop ebx
+ test eax,0FFFFFFFEh
+ jne MonExitStaticMustSignal
+
+ ret
+
+MonExitStaticMustSignal:
+ jmp JITutil_MonSignal
+
+MonExitStaticRetryHelper:
+ jmp MonExitStaticRetry
+ ; Throw a synchronization lock exception.
+MonExitStaticLockError:
+ mov ARGUMENT_REG1, CORINFO_SynchronizationLockException_ASM
+ jmp JIT_InternalThrow
+
+@JIT_MonExitStatic@4 endp
+
+; PatchedCodeStart and PatchedCodeEnd are used to determine bounds of patched code.
+;
+
+_JIT_PatchedCodeStart@0 proc public
+ret
+_JIT_PatchedCodeStart@0 endp
+
+;
+; Optimized TLS getters
+;
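+; Each getter below starts out as a 2-byte "jmp short _GetTLSDummy@0" followed
+; by (TLS_GETTER_MAX_SIZE_ASM - 2) bytes of int 3 padding, reserving
+; TLS_GETTER_MAX_SIZE_ASM bytes in total so the runtime can overwrite the stub
+; in place with an optimized TLS access sequence.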
+
+ ALIGN 4
+
+ifndef FEATURE_IMPLICIT_TLS
+_GetThread@0 proc public
+ ; This will be overwritten at runtime with optimized GetThread implementation
+ jmp short _GetTLSDummy@0
+ ; Just allocate space that will be filled in at runtime
+ db (TLS_GETTER_MAX_SIZE_ASM - 2) DUP (0CCh)
+_GetThread@0 endp
+
+ ALIGN 4
+
+_GetAppDomain@0 proc public
+ ; This will be overwritten at runtime with optimized GetAppDomain implementation
+ jmp short _GetTLSDummy@0
+ ; Just allocate space that will be filled in at runtime
+ db (TLS_GETTER_MAX_SIZE_ASM - 2) DUP (0CCh)
+_GetAppDomain@0 endp
+
+_GetTLSDummy@0 proc public
+ xor eax,eax
+ ret
+_GetTLSDummy@0 endp
+
+ ALIGN 4
+
+_ClrFlsGetBlock@0 proc public
+ ; This will be overwritten at runtime with optimized ClrFlsGetBlock implementation
+ jmp short _GetTLSDummy@0
+ ; Just allocate space that will be filled in at runtime
+ db (TLS_GETTER_MAX_SIZE_ASM - 2) DUP (0CCh)
+_ClrFlsGetBlock@0 endp
+endif
+
+;**********************************************************************
+; Write barriers generated at runtime
+
+PUBLIC _JIT_PatchedWriteBarrierStart@0
+_JIT_PatchedWriteBarrierStart@0 PROC
+ret
+_JIT_PatchedWriteBarrierStart@0 ENDP
+
+PatchedWriteBarrierHelper MACRO rg
+ ALIGN 8
+PUBLIC _JIT_WriteBarrier&rg&@0
+_JIT_WriteBarrier&rg&@0 PROC
+ ; Just allocate space that will be filled in at runtime
+ db (48) DUP (0CCh)
+_JIT_WriteBarrier&rg&@0 ENDP
+
+ENDM
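+; For example, "PatchedWriteBarrierHelper <EAX>" expands to roughly:
+;
+;   ALIGN 8
+;   _JIT_WriteBarrierEAX@0 PROC
+;       db (48) DUP (0CCh)    ; int 3 padding, overwritten with the real barrier at runtime
+;   _JIT_WriteBarrierEAX@0 ENDP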
+
+PatchedWriteBarrierHelper <EAX>
+PatchedWriteBarrierHelper <EBX>
+PatchedWriteBarrierHelper <ECX>
+PatchedWriteBarrierHelper <ESI>
+PatchedWriteBarrierHelper <EDI>
+PatchedWriteBarrierHelper <EBP>
+
+PUBLIC _JIT_PatchedWriteBarrierLast@0
+_JIT_PatchedWriteBarrierLast@0 PROC
+ret
+_JIT_PatchedWriteBarrierLast@0 ENDP
+
+;**********************************************************************
+; PrecodeRemotingThunk is patched at runtime to activate it
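+; Once patched, the thunk behaves roughly like the following (illustrative
+; C-style sketch; g_pTransparentProxyMT is a hypothetical name for the patched
+; 11111111h constant below):
+;
+;   if (this == NULL) return;                          // "test ecx,ecx" / jz
+;   if (this->MethodTable != g_pTransparentProxyMT) return;
+;   if (the proxy's stub reports a context mismatch)
+;       tail-call TransparentProxyStub_CrossContext;
+;   else if (the method desc is a ComInterop method)
+;       tail-call InContextTPQuickDispatchAsmStub with the method desc;
+;   else return;                                       // dispatch normally in this context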
+ifdef FEATURE_REMOTING
+ ALIGN 16
+_PrecodeRemotingThunk@0 proc public
+
+ ret ; This is going to be patched to "test ecx,ecx"
+ nop
+
+ jz RemotingDone ; predicted not taken
+
+ cmp dword ptr [ecx],11111111h ; This is going to be patched to the address of the transparent proxy MethodTable
+ je RemotingCheck ; predicted not taken
+
+RemotingDone:
+ ret
+
+RemotingCheck:
+ push eax ; save method desc
+ mov eax, dword ptr [ecx + TransparentProxyObject___stubData]
+ call [ecx + TransparentProxyObject___stub]
+ test eax, eax
+ jnz RemotingCtxMismatch
+ mov eax, [esp]
+ mov ax, [eax + MethodDesc_m_wFlags]
+ and ax, MethodDesc_mdcClassification
+ cmp ax, MethodDesc_mcComInterop
+ je ComPlusCall
+ pop eax ; throw away method desc
+ jmp RemotingDone
+
+RemotingCtxMismatch:
+ pop eax ; restore method desc
+ add esp, 4 ; pop return address into the precode
+ jmp _TransparentProxyStub_CrossContext@0
+
+ComPlusCall:
+ pop eax ; restore method desc
+ mov [esp],eax ; replace return address into the precode with method desc (argument for TP stub)
+ jmp _InContextTPQuickDispatchAsmStub@0
+
+_PrecodeRemotingThunk@0 endp
+endif ; FEATURE_REMOTING
+
+_JIT_PatchedCodeLast@0 proc public
+ret
+_JIT_PatchedCodeLast@0 endp
+
+; This is the first function outside the "keep together range". Used by BBT scripts.
+_JIT_PatchedCodeEnd@0 proc public
+ret
+_JIT_PatchedCodeEnd@0 endp
+
+; This is the ASM portion of JIT_IsInstanceOfInterface. For all the bizarre cases, it quickly
+; bails out and falls back on the JITutil_IsInstanceOfInterface helper, so those failure cases
+; take the slow path, too.
+;
+; ARGUMENT_REG1 = MethodTable of the interface to check for.
+; ARGUMENT_REG2 = instance to be cast.
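+;
+; The loop below is roughly (illustrative C-style sketch; the InterfaceInfo_t
+; field holding the interface MethodTable is given a hypothetical name here):
+;
+;   if (obj == NULL) return NULL;
+;   MethodTable *pMT = obj->GetMethodTable();
+;   for (i = 0; i < pMT->m_wNumInterfaces; i++)
+;       if (pMT->m_pInterfaceMap[i].m_pInterfaceMT == pInterfaceMT)
+;           return obj;                                           // success
+;   if (pMT->m_dwFlags & NonTrivialInterfaceCastFlags)
+;       return JITutil_IsInstanceOfInterface(pInterfaceMT, obj);  // slow path
+;   return NULL;                                                  // plain miss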
+
+ ALIGN 16
+PUBLIC @JIT_IsInstanceOfInterface@8
+@JIT_IsInstanceOfInterface@8 PROC
+ test ARGUMENT_REG2, ARGUMENT_REG2
+ jz IsNullInst
+
+ mov eax, [ARGUMENT_REG2] ; get MethodTable
+
+ push ebx
+ push esi
+ movzx ebx, word ptr [eax+MethodTable_m_wNumInterfaces]
+
+ ; check if this MT implements any interfaces
+ test ebx, ebx
+ jz IsInstanceOfInterfaceDoBizarre
+
+ ; move Interface map ptr into eax
+ mov eax, [eax+MethodTable_m_pInterfaceMap]
+
+IsInstanceOfInterfaceTop:
+ ; eax -> current InterfaceInfo_t entry in interface map list
+ifdef FEATURE_PREJIT
+ mov esi, [eax]
+ test esi, 1
+ ; Move the dereference out of line so that this jump is correctly predicted for the case
+ ; when there is no indirection
+ jnz IsInstanceOfInterfaceIndir
+ cmp ARGUMENT_REG1, esi
+else
+ cmp ARGUMENT_REG1, [eax]
+endif
+ je IsInstanceOfInterfaceFound
+
+IsInstanceOfInterfaceNext:
+ add eax, SIZEOF_InterfaceInfo_t
+ dec ebx
+ jnz IsInstanceOfInterfaceTop
+
+ ; fall through to DoBizarre
+
+IsInstanceOfInterfaceDoBizarre:
+ pop esi
+ pop ebx
+ mov eax, [ARGUMENT_REG2] ; get MethodTable
+ test dword ptr [eax+MethodTable_m_dwFlags], NonTrivialInterfaceCastFlags
+ jnz IsInstanceOfInterfaceNonTrivialCast
+
+IsNullInst:
+ xor eax,eax
+ ret
+
+ifdef FEATURE_PREJIT
+IsInstanceOfInterfaceIndir:
+ cmp ARGUMENT_REG1,[esi-1]
+ jne IsInstanceOfInterfaceNext
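+ ; Under FEATURE_PREJIT an interface-map entry can be a tagged indirection:
+ ; when bit 0 of the entry is set, the actual MethodTable pointer lives at
+ ; [entry - 1], which is what the compare above reads.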
+endif
+
+IsInstanceOfInterfaceFound:
+ pop esi
+ pop ebx
+ mov eax, ARGUMENT_REG2 ; the successful instance
+ ret
+
+IsInstanceOfInterfaceNonTrivialCast:
+ jmp @JITutil_IsInstanceOfInterface@8
+
+@JIT_IsInstanceOfInterface@8 endp
+
+; This is the ASM portion of JIT_ChkCastInterface. For all the bizarre cases, it quickly
+; bails out and falls back on the JITutil_ChkCastInterface helper, so those failure cases
+; take the slow path, too.
+;
+; ARGUMENT_REG1 = MethodTable of the interface to check for.
+; ARGUMENT_REG2 = instance to be cast.
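+;
+; The interface-map walk below mirrors JIT_IsInstanceOfInterface above; the
+; differences are that any miss goes straight to JITutil_ChkCastInterface
+; (which performs the full cast check and can throw), and a null instance is
+; simply returned unchanged.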
+
+ ALIGN 16
+PUBLIC @JIT_ChkCastInterface@8
+@JIT_ChkCastInterface@8 PROC
+ test ARGUMENT_REG2, ARGUMENT_REG2
+ jz ChkCastInterfaceIsNullInst
+
+ mov eax, [ARGUMENT_REG2] ; get MethodTable
+
+ push ebx
+ push esi
+ movzx ebx, word ptr [eax+MethodTable_m_wNumInterfaces]
+
+ ; speculatively move Interface map ptr into eax
+ mov eax, [eax+MethodTable_m_pInterfaceMap]
+
+ ; check if this MT implements any interfaces
+ test ebx, ebx
+ jz ChkCastInterfaceDoBizarre
+
+ChkCastInterfaceTop:
+ ; eax -> current InterfaceInfo_t entry in interface map list
+ifdef FEATURE_PREJIT
+ mov esi, [eax]
+ test esi, 1
+ ; Move the dereference out of line so that this jump is correctly predicted for the case
+ ; when there is no indirection
+ jnz ChkCastInterfaceIndir
+ cmp ARGUMENT_REG1, esi
+else
+ cmp ARGUMENT_REG1, [eax]
+endif
+ je ChkCastInterfaceFound
+
+ChkCastInterfaceNext:
+ add eax, SIZEOF_InterfaceInfo_t
+ dec ebx
+ jnz ChkCastInterfaceTop
+
+ ; fall through to DoBizarre
+
+ChkCastInterfaceDoBizarre:
+ pop esi
+ pop ebx
+ jmp @JITutil_ChkCastInterface@8
+
+ifdef FEATURE_PREJIT
+ChkCastInterfaceIndir:
+ cmp ARGUMENT_REG1,[esi-1]
+ jne ChkCastInterfaceNext
+endif
+
+ChkCastInterfaceFound:
+ pop esi
+ pop ebx
+
+ChkCastInterfaceIsNullInst:
+ mov eax, ARGUMENT_REG2 ; either null, or the successful instance
+ ret
+
+@JIT_ChkCastInterface@8 endp
+
+ end